Diffstat (limited to 'usr/src/uts/common')
167 files changed, 30998 insertions, 2882 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index baeb7b0015..fa9a3a4bf4 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -21,6 +21,7 @@ # # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2012 Joyent, Inc. All rights reserved. # Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. # Copyright (c) 2012 by Delphix. All rights reserved. # @@ -1145,8 +1146,13 @@ PIPE_OBJS += pipe.o HSFS_OBJS += hsfs_node.o hsfs_subr.o hsfs_vfsops.o hsfs_vnops.o \ hsfs_susp.o hsfs_rrip.o hsfs_susp_subr.o +HYPRLOFS_OBJS += hyprlofs_dir.o hyprlofs_subr.o \ + hyprlofs_vnops.o hyprlofs_vfsops.o + LOFS_OBJS += lofs_subr.o lofs_vfsops.o lofs_vnops.o +LXPROC_OBJS += lxpr_subr.o lxpr_vfsops.o lxpr_vnops.o + NAMEFS_OBJS += namevfs.o namevno.o NFS_OBJS += nfs_client.o nfs_common.o nfs_dump.o \ @@ -1385,6 +1391,7 @@ ZFS_COMMON_OBJS += \ zfs_fuid.o \ zfs_sa.o \ zfs_znode.o \ + zfs_zone.o \ zil.o \ zio.o \ zio_checksum.o \ @@ -1739,6 +1746,8 @@ IPF_OBJS += ip_fil_solaris.o fil.o solaris.o ip_state.o ip_frag.o ip_nat.o \ ip_proxy.o ip_auth.o ip_pool.o ip_htable.o ip_lookup.o \ ip_log.o misc.o ip_compat.o ip_nat6.o drand48.o +IPD_OBJS += ipd.o + IBD_OBJS += ibd.o ibd_cm.o EIBNX_OBJS += enx_main.o enx_hdlrs.o enx_ibt.o enx_log.o enx_fip.o \ @@ -1995,7 +2004,12 @@ MEGA_SAS_OBJS = megaraid_sas.o # # MR_SAS module # -MR_SAS_OBJS = mr_sas.o +MR_SAS_OBJS = ld_pd_map.o mr_sas.o mr_sas_tbolt.o mr_sas_list.o + +# +# DR_SAS module +# +DR_SAS_OBJS = dr_sas.o # # ISCSI_INITIATOR module diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index 28f7ddefae..27478a210d 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -21,6 +21,10 @@ # # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. +# + +# +# Copyright (c) 2012 Joyent, Inc. All rights reserved. # Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. 
# @@ -242,10 +246,18 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hsfs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hyprlofs/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lofs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lxproc/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/mntfs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -505,6 +517,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ipf/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ipd/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(COMMONBASE)/net/patricia/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -717,6 +733,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/drm/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/dr_sas/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/efe/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1744,9 +1764,15 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/fifofs/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/hsfs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/hyprlofs/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/lofs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/lxproc/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/mntfs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -1891,6 +1917,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipnet/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/iptun/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipd/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipf/%.c @($(LHEAD) $(LINT.c) $(IPFFLAGS) $< $(LTAIL)) @@ -2062,6 +2091,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/dmfe/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/drm/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/dr_sas/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/efe/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c index d72cfb0b8f..06e7810f07 100644 --- a/usr/src/uts/common/conf/param.c +++ b/usr/src/uts/common/conf/param.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. */ @@ -565,8 +566,8 @@ char *isa_list = architecture; static pgcnt_t original_physmem = 0; #define MIN_DEFAULT_MAXUSERS 8u -#define MAX_DEFAULT_MAXUSERS 2048u -#define MAX_MAXUSERS 4096u +#define MAX_DEFAULT_MAXUSERS 10000u +#define MAX_MAXUSERS 20000u void param_preset(void) @@ -578,7 +579,7 @@ void param_calc(int platform_max_nprocs) { /* - * Default to about one "user" per megabyte, taking into + * Default to about one "user" per 8MB, taking into * account both physical and virtual constraints. * Note: 2^20 is a meg; shifting right by (20 - PAGESHIFT) * converts pages to megs without integer overflow. 
@@ -592,8 +593,9 @@ param_calc(int platform_max_nprocs) if (maxusers == 0) { pgcnt_t physmegs = physmem >> (20 - PAGESHIFT); pgcnt_t virtmegs = vmem_size(heap_arena, VMEM_FREE) >> 20; - maxusers = MIN(MAX(MIN(physmegs, virtmegs), - MIN_DEFAULT_MAXUSERS), MAX_DEFAULT_MAXUSERS); + maxusers = MIN(physmegs, virtmegs) >> 3; /* divide by 8 */ + maxusers = MAX(maxusers, MIN_DEFAULT_MAXUSERS); + maxusers = MIN(maxusers, MAX_DEFAULT_MAXUSERS); } if (maxusers > MAX_MAXUSERS) { maxusers = MAX_MAXUSERS; diff --git a/usr/src/uts/common/crypto/api/kcf_random.c b/usr/src/uts/common/crypto/api/kcf_random.c index efaf5c37d1..a11098326b 100644 --- a/usr/src/uts/common/crypto/api/kcf_random.c +++ b/usr/src/uts/common/crypto/api/kcf_random.c @@ -71,6 +71,7 @@ #include <sys/cpuvar.h> #include <sys/taskq.h> #include <rng/fips_random.h> +#include <sys/strlog.h> #define RNDPOOLSIZE 1024 /* Pool size in bytes */ #define MINEXTRACTBYTES 20 @@ -900,7 +901,8 @@ rnd_handler(void *arg) int len = 0; if (!rng_prov_found && rng_ok_to_log) { - cmn_err(CE_WARN, "No randomness provider enabled for " + (void) strlog(0, 0, 0, SL_NOTE, + "No randomness provider enabled for " "/dev/random. Use cryptoadm(1M) to enable a provider."); rng_ok_to_log = B_FALSE; } diff --git a/usr/src/uts/common/crypto/core/kcf_sched.c b/usr/src/uts/common/crypto/core/kcf_sched.c index f461fe048c..8b2760b237 100644 --- a/usr/src/uts/common/crypto/core/kcf_sched.c +++ b/usr/src/uts/common/crypto/core/kcf_sched.c @@ -1027,9 +1027,9 @@ kcfpool_svc(void *arg) case 0: case -1: /* - * Woke up with no work to do. Check - * if this thread should exit. We keep - * at least kcf_minthreads. + * Woke up with no work to do. Check if we + * should lwp_exit() (which won't return). We + * keep at least kcf_minthreads. */ if (kcfpool->kp_threads > kcf_minthreads) { KCF_ATOMIC_DECR(kcfpool->kp_threads); diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c index 46f53faab6..68be78a84f 100644 --- a/usr/src/uts/common/disp/cpucaps.c +++ b/usr/src/uts/common/disp/cpucaps.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012 Joyent, Inc. All rights reserved. */ #include <sys/disp.h> @@ -74,6 +75,32 @@ * Putting threads on wait queues in random places while running in the * kernel might lead to all kinds of locking problems. * + * Bursting + * ======== + * + * CPU bursting occurs when the CPU usage is over the baseline but under the + * cap. The baseline CPU (zone.cpu-baseline) is set in a multi-tenant + * environment so that we know how much CPU is allocated for a tenant under + * normal utilization. We can then track how much time a zone is spending + * over the "normal" CPU utilization expected for that zone using the + * "above_base_sec" kstat. This kstat is cumulative. + * + * If the zone has a burst limit (zone.cpu-burst-time) then the zone can + * burst for that period of time (in seconds) before the effective cap is + * lowered to the baseline. Once the effective cap is lowered, the zone + * will run at the baseline for the burst limit before the effective cap is + * raised again to the full value. This will allow the zone to burst again. + * We can watch this behavior using the kstats. The "effective" kstat shows + * which cap is being used, the baseline value or the burst value. The + * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the + * "bursting_sec" kstat shows how many seconds the zone has currently been + * bursting. 
When the CPU load is continuously greater than the baseline, + * bursting_sec will increase, up to the burst_limit_sec value, then the + * effective kstat will drop to the baseline and the bursting_sec value will + * decrease until it hits 0, at which time the effective kstat will return to + * the full burst value and the bursting_sec value will begin to increase + * again. + * * Accounting * ========== * @@ -203,18 +230,28 @@ static void caps_update(); */ struct cap_kstat { kstat_named_t cap_value; + kstat_named_t cap_baseline; + kstat_named_t cap_effective; + kstat_named_t cap_burst_limit; + kstat_named_t cap_bursting; kstat_named_t cap_usage; kstat_named_t cap_nwait; kstat_named_t cap_below; kstat_named_t cap_above; + kstat_named_t cap_above_base; kstat_named_t cap_maxusage; kstat_named_t cap_zonename; } cap_kstat = { { "value", KSTAT_DATA_UINT64 }, + { "baseline", KSTAT_DATA_UINT64 }, + { "effective", KSTAT_DATA_UINT64 }, + { "burst_limit_sec", KSTAT_DATA_UINT64 }, + { "bursting_sec", KSTAT_DATA_UINT64 }, { "usage", KSTAT_DATA_UINT64 }, { "nwait", KSTAT_DATA_UINT64 }, { "below_sec", KSTAT_DATA_UINT64 }, { "above_sec", KSTAT_DATA_UINT64 }, + { "above_base_sec", KSTAT_DATA_UINT64 }, { "maxusage", KSTAT_DATA_UINT64 }, { "zonename", KSTAT_DATA_STRING }, }; @@ -311,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value) cap->cap_below = cap->cap_above = 0; cap->cap_maxusage = 0; cap->cap_usage = 0; - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; waitq_unblock(&cap->cap_waitq); if (CPUCAPS_OFF()) { cpucaps_enabled = B_TRUE; @@ -345,7 +382,7 @@ cap_disable(list_t *l, cpucap_t *cap) cpucaps_enabled = B_FALSE; cpucaps_clock_callout = NULL; } - cap->cap_value = 0; + cap->cap_value = cap->cap_chk_value = 0; cap->cap_project = NULL; cap->cap_zone = NULL; if (cap->cap_kstat != NULL) { @@ -487,6 +524,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t)) * The waitq_isempty check is performed without the waitq lock. If a new thread * is placed on the waitq right after the check, it will be picked up during the * next invocation of cap_poke_waitq(). + * + * Called once per tick for zones. */ /* ARGSUSED */ static void @@ -494,15 +533,92 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen) { ASSERT(MUTEX_HELD(&caps_lock)); - if (cap->cap_usage >= cap->cap_value) { + if (cap->cap_base != 0) { + /* + * Because of the way usage is calculated and decayed, its + * possible for the zone to be slightly over its cap, but we + * don't want to count that after we have reduced the effective + * cap to the baseline. That way the zone will be able to + * burst again after the burst_limit has expired. + */ + if (cap->cap_usage > cap->cap_base && + cap->cap_chk_value == cap->cap_value) { + cap->cap_above_base++; + + /* + * If bursting is limited and we've been bursting + * longer than we're supposed to, then set the + * effective cap to the baseline. + */ + if (cap->cap_burst_limit != 0) { + cap->cap_bursting++; + if (cap->cap_bursting >= cap->cap_burst_limit) + cap->cap_chk_value = cap->cap_base; + } + } else if (cap->cap_bursting > 0) { + /* + * We're not bursting now, but we were, decay the + * bursting timer. + */ + cap->cap_bursting--; + /* + * Reset the effective cap once we decay to 0 so we + * can burst again. 
+ */ + if (cap->cap_bursting == 0 && + cap->cap_chk_value != cap->cap_value) + cap->cap_chk_value = cap->cap_value; + } + } + + if (cap->cap_usage >= cap->cap_chk_value) { cap->cap_above++; } else { waitq_t *wq = &cap->cap_waitq; cap->cap_below++; - if (!waitq_isempty(wq)) - waitq_runone(wq); + if (!waitq_isempty(wq)) { + int i, ndequeue, p; + + /* + * Since this function is only called once per tick, + * we can hit a situation where we have artificially + * limited the project/zone below its cap. This would + * happen if we have multiple threads queued up but + * only dequeued one thread/tick. To avoid this we + * dequeue multiple threads, calculated based on the + * usage percentage of the cap. It is possible that we + * could dequeue too many threads and some of them + * might be put back on the wait queue quickly, but + * since we know that threads are on the wait queue + * because we're capping, we know that there is unused + * CPU cycles anyway, so this extra work would not + * hurt. Also, the ndequeue number is only an upper + * bound and we might dequeue less, depending on how + * many threads are actually in the wait queue. The + * ndequeue values are empirically derived and could be + * adjusted or calculated in another way if necessary. + */ + p = (int)((100 * cap->cap_usage) / cap->cap_chk_value); + if (p >= 98) + ndequeue = 10; + else if (p >= 95) + ndequeue = 20; + else if (p >= 90) + ndequeue = 40; + else if (p >= 85) + ndequeue = 80; + else + ndequeue = 160; + + for (i = 0; i < ndequeue; i++) { + waitq_runone(wq); + if (waitq_isempty(wq)) + break; + } + DTRACE_PROBE2(cpucaps__pokeq, int, p, int, i); + } } } @@ -629,14 +745,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg) * Remove all projects in this zone without caps * from the capped_projects list. */ - if (project_cap->cap_value == MAX_USAGE) { + if (project_cap->cap_chk_value == MAX_USAGE) { cap_project_disable(kpj); } } else if (CAP_DISABLED(project_cap)) { /* * Add the project to capped_projects list. */ - ASSERT(project_cap->cap_value == 0); + ASSERT(project_cap->cap_chk_value == 0); cap_project_enable(kpj, MAX_USAGE); } mutex_exit(&caps_lock); @@ -746,7 +862,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) /* * No state transitions, just change the value */ - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } ASSERT(MUTEX_HELD(&caps_lock)); @@ -757,6 +873,108 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) } /* + * Set zone's base cpu value to base_val + */ +int +cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= MAXCAP); + if (base_val > MAXCAP) + base_val = MAXCAP; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = base_val * cap_tick_cost; + if (value < 0 || value > cap->cap_value) + value = 0; + + cap->cap_base = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* + * Set zone's maximum burst time in seconds. A burst time of 0 means that + * the zone can run over its baseline indefinitely. 
+ */ +int +cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= INT_MAX); + /* Treat the default as 0 - no limit */ + if (base_val == INT_MAX) + base_val = 0; + if (base_val > INT_MAX) + base_val = INT_MAX; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = SEC_TO_TICK(base_val); + if (value < 0) + value = 0; + + cap->cap_burst_limit = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* * The project is going away so disable its cap. */ void @@ -902,7 +1120,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) if (CAP_DISABLED(cap)) cap_project_enable(kpj, value); else - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } else if (CAP_ENABLED(cap)) { /* * User requested to drop a cap on the project. If it is part of @@ -910,7 +1128,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) * otherwise disable the cap. */ if (ZONE_IS_CAPPED(kpj->kpj_zone)) { - cap->cap_value = MAX_USAGE; + cap->cap_value = cap->cap_chk_value = MAX_USAGE; } else { cap_project_disable(kpj); } @@ -948,6 +1166,26 @@ cpucaps_zone_get(zone_t *zone) } /* + * Get current zone baseline. + */ +rctl_qty_t +cpucaps_zone_get_base(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? + (rctl_qty_t)(zone->zone_cpucap->cap_base / cap_tick_cost) : 0); +} + +/* + * Get current zone maximum burst time. + */ +rctl_qty_t +cpucaps_zone_get_burst_time(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? + (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0); +} + +/* * Charge project of thread t the time thread t spent on CPU since previously * adjusted. 
* @@ -1045,7 +1283,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) project_cap = kpj->kpj_cpucap; - if (project_cap->cap_usage >= project_cap->cap_value) { + if (project_cap->cap_usage >= project_cap->cap_chk_value) { t->t_schedflag |= TS_PROJWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_PROJWAITQ) { @@ -1059,7 +1297,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) } else { cpucap_t *zone_cap = zone->zone_cpucap; - if (zone_cap->cap_usage >= zone_cap->cap_value) { + if (zone_cap->cap_usage >= zone_cap->cap_chk_value) { t->t_schedflag |= TS_ZONEWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_ZONEWAITQ) { @@ -1133,6 +1371,12 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_value.value.ui64 = ROUND_SCALE(cap->cap_value, cap_tick_cost); + capsp->cap_baseline.value.ui64 = + ROUND_SCALE(cap->cap_base, cap_tick_cost); + capsp->cap_effective.value.ui64 = + ROUND_SCALE(cap->cap_chk_value, cap_tick_cost); + capsp->cap_burst_limit.value.ui64 = + ROUND_SCALE(cap->cap_burst_limit, tick_sec); capsp->cap_usage.value.ui64 = ROUND_SCALE(cap->cap_usage, cap_tick_cost); capsp->cap_maxusage.value.ui64 = @@ -1140,6 +1384,10 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count; capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec); capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec); + capsp->cap_above_base.value.ui64 = + ROUND_SCALE(cap->cap_above_base, tick_sec); + capsp->cap_bursting.value.ui64 = + ROUND_SCALE(cap->cap_bursting, tick_sec); kstat_named_setstr(&capsp->cap_zonename, zonename); return (0); diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c index be92ba108b..9afcd81239 100644 --- a/usr/src/uts/common/disp/disp.c +++ b/usr/src/uts/common/disp/disp.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ @@ -105,7 +109,7 @@ static void cpu_resched(cpu_t *cp, pri_t tpri); /* * If this is set, only interrupt threads will cause kernel preemptions. * This is done by changing the value of kpreemptpri. kpreemptpri - * will either be the max sysclass pri + 1 or the min interrupt pri. + * will either be the max sysclass pri or the min interrupt pri. */ int only_intr_kpreempt; @@ -252,7 +256,23 @@ dispinit(void) maxglobpri = cl_maxglobpri; } } - kpreemptpri = (pri_t)v.v_maxsyspri + 1; + + /* + * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is + * to say, maxclsyspri + 1. However, over time, the system has used + * more and more asynchronous kernel threads, with an increasing number + * of these doing work on direct behalf of higher-level software (e.g., + * network processing). This has led to potential priority inversions: + * threads doing low-priority lengthy kernel work can effectively + * delay kernel-level processing of higher-priority data. To minimize + * such inversions, we set kpreemptpri to be v_maxsyspri; anything in + * the kernel that runs at maxclsyspri will therefore induce kernel + * preemption, and this priority should be used if/when an asynchronous + * thread (or, as is often the case, task queue) is performing a task + * on behalf of higher-level software (or any task that is otherwise + * latency-sensitive). 
+ */ + kpreemptpri = (pri_t)v.v_maxsyspri; if (kpqpri == KPQPRI) kpqpri = kpreemptpri; diff --git a/usr/src/uts/common/disp/fss.c b/usr/src/uts/common/disp/fss.c index 62301d65d8..c1c7da06ec 100644 --- a/usr/src/uts/common/disp/fss.c +++ b/usr/src/uts/common/disp/fss.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -54,6 +55,179 @@ #include <sys/cpucaps.h> /* + * The fair share scheduling class ensures that collections of processes + * (zones and projects) each get their configured share of CPU. This is in + * contrast to the TS class which considers individual processes. + * + * The FSS cpu-share is set on zones using the zone.cpu-shares rctl and on + * projects using the project.cpu-shares rctl. By default the value is 1 + * and it can range from 0 - 64k. A value of 0 means that processes in the + * collection will only get CPU resources when there are no other processes + * that need CPU. The cpu-share is used as one of the inputs to calculate a + * thread's "user-mode" priority (umdpri) for the scheduler. The umdpri falls + * in the range 0-59. FSS calculates other, internal, priorities which are not + * visible outside of the FSS class. + * + * The FSS class should approximate TS behavior when there are excess CPU + * resources. When there is a backlog of runnable processes, then the share + * is used as input into the runnable process's priority calculation, where + * the final umdpri is used by the scheduler to determine when the process runs. + * + * Projects in a zone compete with each other for CPU time, receiving CPU + * allocation within a zone proportional to the project's share; at a higher + * level zones compete with each other, receiving allocation in a pset + * proportional to the zone's share. + * + * The FSS priority calculation consists of several parts. + * + * 1) Once per second the fss_update function runs. The first thing it does is + * call fss_decay_usage. This function does three things. + * + * a) fss_decay_usage first decays the maxfsspri value for the pset. This + * value is used in the per-process priority calculation described in step + * (2b). The maxfsspri is decayed using the following formula: + * + * maxfsspri * fss_nice_decay[NZERO]) + * maxfsspri = ------------------------------------ + * FSS_DECAY_BASE + * + * + * - NZERO is the default process priority (i.e. 20) + * + * The fss_nice_decay array is a fixed set of values used to adjust the + * decay rate of processes based on their nice value. Entries in this + * array are initialized in fss_init using the following formula: + * + * (FSS_DECAY_MAX - FSS_DECAY_MIN) * i + * FSS_DECAY_MIN + ------------------------------------- + * FSS_NICE_RANGE - 1 + * + * - FSS_DECAY_MIN is 82 = approximates 65% (82/128) + * - FSS_DECAY_MAX is 108 = approximates 85% (108/128) + * - FSS_NICE_RANGE is 40 (range is 0 - 39) + * + * b) The second thing fss_decay_usage does is update each project's "usage" + * for the last second and then recalculates the project's "share usage". + * + * The usage value is the recent CPU usage for all of the threads in the + * project. 
It is decayed and updated this way: + * + * (usage * FSS_DECAY_USG) + * usage = ------------------------- + ticks; + * FSS_DECAY_BASE + * + * - FSS_DECAY_BASE is 128 - used instead of 100 so we can shift vs divide + * - FSS_DECAY_USG is 96 - approximates 75% (96/128) + * - ticks is updated whenever a process in this project is running + * when the scheduler's tick processing fires. This is not a simple + * counter, the values are based on the entries in the fss_nice_tick + * array (see section 3 below). ticks is then reset to 0 so it can track + * the next seconds worth of nice-adjusted time for the project. + * + * c) The third thing fss_decay_usage does is update each project's "share + * usage" (shusage). This is the normalized usage value for the project and + * is calculated this way: + * + * pset_shares^2 zone_int_shares^2 + * usage * ------------- * ------------------ + * kpj_shares^2 zone_ext_shares^2 + * + * - usage - see (1b) for more details + * - pset_shares is the total of all *active* zone shares in the pset (by + * default there is only one pset) + * - kpj_shares is the individual project's share (project.cpu-shares rctl) + * - zone_int_shares is the sum of shares of all active projects within the + * zone (the zone-internal total) + * - zone_ext_shares is the share value for the zone (zone.cpu-shares rctl) + * + * The shusage is used in step (2b) to calculate the thread's new internal + * priority. A larger shusage value leads to a lower priority. + * + * 2) The fss_update function then calls fss_update_list to update the priority + * of all threads. This does two things. + * + * a) First the thread's internal priority is decayed using the following + * formula: + * + * fsspri * fss_nice_decay[nice_value]) + * fsspri = ------------------------------------ + * FSS_DECAY_BASE + * + * - FSS_DECAY_BASE is 128 as described above + * + * b) Second, if the thread is runnable (TS_RUN or TS_WAIT) calls fss_newpri + * to update the user-mode priority (umdpri) of the runnable thread. + * Threads that are running (TS_ONPROC) or waiting for an event (TS_SLEEP) + * are not updated at this time. The updated user-mode priority can cause + * threads to change their position in the run queue. + * + * The process's new internal fsspri is calculated using the following + * formula. All runnable threads in the project will use the same shusage + * and nrunnable values in their calculation. + * + * fsspri += shusage * nrunnable * ticks + * + * - shusage is the project's share usage, calculated in (1c) + * - nrunnable is the number of runnable threads in the project + * - ticks is the number of ticks this thread ran since the last fss_newpri + * invocation. + * + * Finally the process's new user-mode priority is calculated using the + * following formula: + * + * (fsspri * umdprirange) + * umdpri = maxumdpri - ------------------------ + * maxfsspri + * + * - maxumdpri is MINCLSYSPRI - 1 (i.e. 59) + * - umdprirange is maxumdpri - 1 (i.e. 58) + * - maxfsspri is the largest fsspri seen so far, as we're iterating all + * runnable processes + * + * Thus, a higher internal priority (fsspri) leads to a lower user-mode + * priority which means the thread runs less. The fsspri is higher when + * the project's normalized share usage is higher, when the project has + * more runnable threads, or when the thread has accumulated more run-time. + * + * This code has various checks to ensure the resulting umdpri is in the + * range 1-59. See fss_newpri for more details. 
+ * + * To reiterate, the above processing is performed once per second to recompute + * the runnable thread user-mode priorities. + * + * 3) The final major component in the priority calculation is the tick + * processing which occurs on a thread that is running when the clock + * calls fss_tick. + * + * A thread can run continuously in user-land (compute-bound) for the + * fss_quantum (see "dispadmin -c FSS -g" for the configurable properties). + * The fss_quantum defaults to 11 (i.e. 11 ticks). + * + * Once the quantum has been consumed, the thread will call fss_newpri to + * recompute its umdpri priority, as described above in (2b). Threads that + * were T_ONPROC at the one second interval when runnable thread priorities + * were recalculated will have their umdpri priority recalculated when their + * quanta expires. + * + * To ensure that runnable threads within a project see the expected + * round-robin behavior, there is a special case in fss_newpri for a thread + * that has run for its quanta within the one second update interval. See + * the handling for the quanta_up parameter within fss_newpri. + * + * Also of interest, the fss_tick code increments the project's tick value + * using the fss_nice_tick array entry for the thread's nice value. The idea + * behind the fss_nice_tick array is that the cost of a tick is lower at + * positive nice values (so that it doesn't increase the project's usage + * as much as normal) with a 50% drop at the maximum level and a 50% + * increase at the minimum level. See (1b). The fss_nice_tick array is + * initialized in fss_init using the following formula: + * + * FSS_TICK_COST * (((3 * FSS_NICE_RANGE) / 2) - i) + * -------------------------------------------------- + * FSS_NICE_RANGE + * + * - FSS_TICK_COST is 1000, the tick cost for threads with nice level 0 + * * FSS Data Structures: * * fsszone @@ -72,7 +246,6 @@ * ----- ----- ----- * fssproj * - * * That is, fsspsets contain a list of fsszone's that are currently active in * the pset, and a list of fssproj's, corresponding to projects with runnable * threads on the pset. fssproj's in turn point to the fsszone which they @@ -81,12 +254,6 @@ * An fssproj_t is removed when there are no threads in it. * * An fsszone_t is removed when there are no projects with threads in it. - * - * Projects in a zone compete with each other for cpu time, receiving cpu - * allocation within a zone proportional to fssproj->fssp_shares - * (project.cpu-shares); at a higher level zones compete with each other, - * receiving allocation in a pset proportional to fsszone->fssz_shares - * (zone.cpu-shares). See fss_decay_usage() for the precise formula. */ static pri_t fss_init(id_t, int, classfuncs_t **); @@ -186,7 +353,7 @@ static time_t fss_minrun = 2; /* t_pri becomes 59 within 2 secs */ static time_t fss_minslp = 2; /* min time on sleep queue for hardswap */ static int fss_quantum = 11; -static void fss_newpri(fssproc_t *); +static void fss_newpri(fssproc_t *, boolean_t); static void fss_update(void *); static int fss_update_list(int); static void fss_change_priority(kthread_t *, fssproc_t *); @@ -718,17 +885,55 @@ fss_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp) } /* - * Calculate the new cpupri based on the usage, the number of shares and - * the number of active threads. Reset the tick counter for this thread. + * Calculate the new fss_umdpri based on the usage, the normalized share usage + * and the number of active threads. Reset the tick counter for this thread. 
+ * + * When calculating the new priority using the standard formula we can hit + * a scenario where we don't have good round-robin behavior. This would be + * most commonly seen when there is a zone with lots of runnable threads. + * In the bad scenario we will see the following behavior when using the + * standard formula and these conditions: + * + * - there are multiple runnable threads in the zone (project) + * - the fssps_maxfsspri is a very large value + * - (we also know all of these threads will use the project's + * fssp_shusage) + * + * Under these conditions, a thread with a low fss_fsspri value is chosen + * to run and the thread gets a high fss_umdpri. This thread can run for + * its full quanta (fss_timeleft) at which time fss_newpri is called to + * calculate the thread's new priority. + * + * In this case, because the newly calculated fsspri value is much smaller + * (orders of magnitude) than the fssps_maxfsspri value, if we used the + * standard formula the thread will still get a high fss_umdpri value and + * will run again for another quanta, even though there are other runnable + * threads in the project. + * + * For a thread that is runnable for a long time, the thread can continue + * to run for many quanta (totaling many seconds) before the thread's fsspri + * exceeds the fssps_maxfsspri and the thread's fss_umdpri is reset back + * down to 1. This behavior also keeps the fssps_maxfsspri at a high value, + * so that the next runnable thread might repeat this cycle. + * + * This leads to the case where we don't have round-robin behavior at quanta + * granularity, but instead, runnable threads within the project only run + * at several second intervals. + * + * To prevent this scenario from occurring, when a thread has consumed its + * quanta and there are multiple runnable threads in the project, we + * immediately cause the thread to hit fssps_maxfsspri so that it gets + * reset back to 1 and another runnable thread in the project can run. */ static void -fss_newpri(fssproc_t *fssproc) +fss_newpri(fssproc_t *fssproc, boolean_t quanta_up) { kthread_t *tp; fssproj_t *fssproj; fsspset_t *fsspset; fsszone_t *fsszone; fsspri_t fsspri, maxfsspri; + uint32_t n_runnable; pri_t invpri; uint32_t ticks; @@ -751,25 +956,43 @@ fss_newpri(fssproc_t *fssproc) fsspset = FSSPROJ2FSSPSET(fssproj); disp_lock_enter_high(&fsspset->fssps_displock); + ticks = fssproc->fss_ticks; + fssproc->fss_ticks = 0; + if (fssproj->fssp_shares == 0 || fsszone->fssz_rshares == 0) { /* * Special case: threads with no shares. */ fssproc->fss_umdpri = fss_minglobpri; - fssproc->fss_ticks = 0; disp_lock_exit_high(&fsspset->fssps_displock); return; } - /* - * fsspri += shusage * nrunnable * ticks - */ - ticks = fssproc->fss_ticks; - fssproc->fss_ticks = 0; - fsspri = fssproc->fss_fsspri; - fsspri += fssproj->fssp_shusage * fssproj->fssp_runnable * ticks; + maxfsspri = fsspset->fssps_maxfsspri; + n_runnable = fssproj->fssp_runnable; + + if (quanta_up && n_runnable > 1) { + fsspri = maxfsspri; + } else { + /* + * fsspri += fssp_shusage * nrunnable * ticks + * If all three values are non-0, this typically calculates to + * a large number (sometimes > 1M, sometimes > 100B) due to + * fssp_shusage which can be > 1T. + */ + fsspri = fssproc->fss_fsspri; + fsspri += fssproj->fssp_shusage * n_runnable * ticks; + } + fssproc->fss_fsspri = fsspri; + /* + * fss_maxumdpri is normally 59, since FSS priorities are 0-59. + * If the previous calculation resulted in 0 (e.g. 
was 0 and added 0 + * because ticks == 0), then instead of 0, we use the largest priority, + * which is still small in comparison to the large numbers we typically + * see. + */ if (fsspri < fss_maxumdpri) fsspri = fss_maxumdpri; /* so that maxfsspri is != 0 */ @@ -783,12 +1006,16 @@ fss_newpri(fssproc_t *fssproc) * If this thread's fsspri is greater than the previous largest * fsspri, then record it as the new high and priority for this * thread will be one (the lowest priority assigned to a thread - * that has non-zero shares). + * that has non-zero shares). Because of this check, maxfsspri can + * change as this function is called via the + * fss_update -> fss_update_list -> fss_newpri code path to update + * all runnable threads. See the code in fss_update for how we + * mitigate this issue. + * * Note that this formula cannot produce out of bounds priority - * values; if it is changed, additional checks may need to be + * values (0-59); if it is changed, additional checks may need to be * added. */ - maxfsspri = fsspset->fssps_maxfsspri; if (fsspri >= maxfsspri) { fsspset->fssps_maxfsspri = fsspri; disp_lock_exit_high(&fsspset->fssps_displock); @@ -801,8 +1028,9 @@ fss_newpri(fssproc_t *fssproc) } /* - * Decays usages of all running projects and resets their tick counters. - * Called once per second from fss_update() after updating priorities. + * Decays usages of all running projects, resets their tick counters and + * calcluates the projects normalized share usage. Called once per second from + * fss_update(). */ static void fss_decay_usage() @@ -814,6 +1042,7 @@ fss_decay_usage() fsszone_t *fsszone; fsspri_t maxfsspri; int psetid; + struct zone *zp; mutex_enter(&fsspsets_lock); /* @@ -824,6 +1053,8 @@ fss_decay_usage() fsspset = &fsspsets[psetid]; mutex_enter(&fsspset->fssps_lock); + fsspset->fssps_gen++; + if (fsspset->fssps_cpupart == NULL || (fssproj = fsspset->fssps_list) == NULL) { mutex_exit(&fsspset->fssps_lock); @@ -836,6 +1067,8 @@ fss_decay_usage() */ disp_lock_enter(&fsspset->fssps_displock); + pset_shares = fsspset->fssps_shares; + maxfsspri = (fsspset->fssps_maxfsspri * fss_nice_decay[NZERO]) / FSS_DECAY_BASE; if (maxfsspri < fss_maxumdpri) @@ -843,16 +1076,31 @@ fss_decay_usage() fsspset->fssps_maxfsspri = maxfsspri; do { + fsszone = fssproj->fssp_fsszone; + zp = fsszone->fssz_zone; + /* - * Decay usage for each project running on - * this cpu partition. + * Reset zone's FSS stats if they are from a + * previous cycle. + */ + if (fsspset->fssps_gen != zp->zone_fss_gen) { + zp->zone_fss_gen = fsspset->fssps_gen; + zp->zone_run_ticks = 0; + } + + /* + * Decay project usage, then add in this cycle's + * nice tick value. */ fssproj->fssp_usage = (fssproj->fssp_usage * FSS_DECAY_USG) / - FSS_DECAY_BASE + fssproj->fssp_ticks; + FSS_DECAY_BASE + + fssproj->fssp_ticks; + fssproj->fssp_ticks = 0; + zp->zone_run_ticks += fssproj->fssp_tick_cnt; + fssproj->fssp_tick_cnt = 0; - fsszone = fssproj->fssp_fsszone; /* * Readjust the project's number of shares if it has * changed since we checked it last time. @@ -871,18 +1119,55 @@ fss_decay_usage() * Readjust the zone's number of shares if it * has changed since we checked it last time. 
*/ - zone_ext_shares = fsszone->fssz_zone->zone_shares; + zone_ext_shares = zp->zone_shares; if (fsszone->fssz_rshares != zone_ext_shares) { if (fsszone->fssz_runnable != 0) { fsspset->fssps_shares -= fsszone->fssz_rshares; fsspset->fssps_shares += zone_ext_shares; + pset_shares = fsspset->fssps_shares; } fsszone->fssz_rshares = zone_ext_shares; } zone_int_shares = fsszone->fssz_shares; - pset_shares = fsspset->fssps_shares; + + /* + * If anything is runnable in the project, track the + * overall project share percent for monitoring usage. + */ + if (fssproj->fssp_runnable > 0) { + uint32_t zone_shr_pct; + uint32_t int_shr_pct; + + /* + * Times 1000 to get tenths of a percent + * + * zone_ext_shares + * zone_shr_pct = --------------- + * pset_shares + * + * kpj_shares + * int_shr_pct = --------------- + * zone_int_shares + */ + if (pset_shares == 0 || zone_int_shares == 0) { + fssproj->fssp_shr_pct = 0; + } else { + zone_shr_pct = + (zone_ext_shares * 1000) / + pset_shares; + int_shr_pct = (kpj_shares * 1000) / + zone_int_shares; + fssproj->fssp_shr_pct = + (zone_shr_pct * int_shr_pct) / + 1000; + } + } else { + DTRACE_PROBE1(fss__prj__norun, fssproj_t *, + fssproj); + } + /* * Calculate fssp_shusage value to be used * for fsspri increments for the next second. @@ -890,10 +1175,22 @@ fss_decay_usage() if (kpj_shares == 0 || zone_ext_shares == 0) { fssproj->fssp_shusage = 0; } else if (FSSPROJ2KPROJ(fssproj) == proj0p) { + uint32_t zone_shr_pct; + /* * Project 0 in the global zone has 50% - * of its zone. + * of its zone. See calculation above for + * the zone's share percent. */ + if (pset_shares == 0) + zone_shr_pct = 1000; + else + zone_shr_pct = + (zone_ext_shares * 1000) / + pset_shares; + + fssproj->fssp_shr_pct = zone_shr_pct / 2; + fssproj->fssp_shusage = (fssproj->fssp_usage * zone_int_shares * zone_int_shares) / (zone_ext_shares * zone_ext_shares); @@ -925,6 +1222,10 @@ fss_decay_usage() * pset_shares^2 * shusage = usage * ---------------------- * zone_ext_shares^2 + * + * shusage is one input to calculating fss_pri + * in fss_newpri(). Larger values tend toward + * lower priorities for processes in the proj. */ fssproj->fssp_shusage = fssproj->fssp_usage * pset_shares * zone_int_shares; @@ -996,6 +1297,10 @@ fss_change_priority(kthread_t *t, fssproc_t *fssproc) * thread pointer. Each list has its own lock. This avoids blocking all * fss_enterclass, fss_fork, and fss_exitclass operations while fss_update runs. * fss_update traverses each list in turn. + * + * Each time we're run (once/second) we may start at the next list and iterate + * through all of the lists. By starting with a different list, we mitigate any + * effects we would see updating the fssps_maxfsspri value in fss_newpri. */ static void fss_update(void *arg) @@ -1021,7 +1326,7 @@ fss_update(void *arg) do { /* * If this is the first list after the current marker to have - * threads with priorities updates, advance the marker to this + * threads with priority updates, advance the marker to this * list for the next time fss_update runs. */ if (fss_update_list(i) && @@ -1050,6 +1355,7 @@ fss_update_list(int i) fssproc_t *fssproc; fssproj_t *fssproj; fsspri_t fsspri; + pri_t fss_umdpri; kthread_t *t; int updated = 0; @@ -1073,6 +1379,7 @@ fss_update_list(int i) fssproj = FSSPROC2FSSPROJ(fssproc); if (fssproj == NULL) goto next; + if (fssproj->fssp_shares != 0) { /* * Decay fsspri value. 
@@ -1091,16 +1398,21 @@ fss_update_list(int i) */ t->t_trapret = 1; aston(t); + if (t->t_state == TS_ONPROC) + DTRACE_PROBE1(fss__onproc, fssproc_t *, + fssproc); goto next; } - fss_newpri(fssproc); + fss_newpri(fssproc, B_FALSE); updated = 1; + fss_umdpri = fssproc->fss_umdpri; + /* * Only dequeue the thread if it needs to be moved; otherwise * it should just round-robin here. */ - if (t->t_pri != fssproc->fss_umdpri) + if (t->t_pri != fss_umdpri) fss_change_priority(t, fssproc); next: thread_unlock(t); @@ -1624,7 +1936,7 @@ fss_forkret(kthread_t *t, kthread_t *ct) thread_lock(t); fssproc = FSSPROC(t); - fss_newpri(fssproc); + fss_newpri(fssproc, B_FALSE); fssproc->fss_timeleft = fss_quantum; t->t_pri = fssproc->fss_umdpri; ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri); @@ -1725,7 +2037,7 @@ fss_parmsset(kthread_t *t, void *parmsp, id_t reqpcid, cred_t *reqpcredp) fssproc->fss_uprilim = reqfssuprilim; fssproc->fss_upri = reqfssupri; fssproc->fss_nice = nice; - fss_newpri(fssproc); + fss_newpri(fssproc, B_FALSE); if ((fssproc->fss_flags & FSSKPRI) != 0) { thread_unlock(t); @@ -2180,6 +2492,7 @@ fss_tick(kthread_t *t) fsspset_t *fsspset = FSSPROJ2FSSPSET(fssproj); disp_lock_enter_high(&fsspset->fssps_displock); fssproj->fssp_ticks += fss_nice_tick[fssproc->fss_nice]; + fssproj->fssp_tick_cnt++; fssproc->fss_ticks++; disp_lock_exit_high(&fsspset->fssps_displock); } @@ -2223,7 +2536,7 @@ fss_tick(kthread_t *t) } fssproc->fss_flags &= ~FSSRESTORE; - fss_newpri(fssproc); + fss_newpri(fssproc, B_TRUE); new_pri = fssproc->fss_umdpri; ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri); @@ -2262,7 +2575,7 @@ fss_tick(kthread_t *t) * queue so that it gets charged for the CPU time from its * quantum even before that quantum expires. */ - fss_newpri(fssproc); + fss_newpri(fssproc, B_FALSE); if (t->t_pri != fssproc->fss_umdpri) fss_change_priority(t, fssproc); diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c index 5ed9110251..63a08483f8 100644 --- a/usr/src/uts/common/disp/thread.c +++ b/usr/src/uts/common/disp/thread.c @@ -1049,6 +1049,8 @@ installctx( ctx->free_op = free; ctx->arg = arg; ctx->next = t->t_ctx; + ctx->save_ts = 0; + ctx->restore_ts = 0; t->t_ctx = ctx; } @@ -1120,9 +1122,12 @@ savectx(kthread_t *t) struct ctxop *ctx; ASSERT(t == curthread); - for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) - if (ctx->save_op != NULL) + for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) { + if (ctx->save_op != NULL) { + ctx->save_ts = gethrtime_unscaled(); (ctx->save_op)(ctx->arg); + } + } } void @@ -1131,9 +1136,12 @@ restorectx(kthread_t *t) struct ctxop *ctx; ASSERT(t == curthread); - for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) - if (ctx->restore_op != NULL) + for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) { + if (ctx->restore_op != NULL) { + ctx->restore_ts = gethrtime_unscaled(); (ctx->restore_op)(ctx->arg); + } + } } void diff --git a/usr/src/uts/common/dtrace/dtrace.c b/usr/src/uts/common/dtrace/dtrace.c index 5013661588..8ef84d1322 100644 --- a/usr/src/uts/common/dtrace/dtrace.c +++ b/usr/src/uts/common/dtrace/dtrace.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ @@ -116,7 +116,7 @@ int dtrace_destructive_disallow = 0; dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024); size_t dtrace_difo_maxsize = (256 * 1024); -dtrace_optval_t dtrace_dof_maxsize = (256 * 1024); +dtrace_optval_t dtrace_dof_maxsize = (8 * 1024 * 1024); size_t dtrace_global_maxsize = (16 * 1024); size_t dtrace_actions_max = (16 * 1024); size_t dtrace_retain_max = 1024; @@ -171,6 +171,7 @@ static dtrace_provider_t *dtrace_provider; /* provider list */ static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */ static int dtrace_opens; /* number of opens */ static int dtrace_helpers; /* number of helpers */ +static int dtrace_getf; /* number of unpriv getf()s */ static void *dtrace_softstate; /* softstate pointer */ static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */ static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */ @@ -267,17 +268,22 @@ dtrace_id_t dtrace_probeid_error; /* special ERROR probe */ /* * DTrace Helper Tracing Variables + * + * These variables should be set dynamically to enable helper tracing. The + * only variables that should be set are dtrace_helptrace_enable (which should + * be set to a non-zero value to allocate helper tracing buffers on the next + * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a + * non-zero value to deallocate helper tracing buffers on the next close of + * /dev/dtrace). When (and only when) helper tracing is disabled, the + * buffer size may also be set via dtrace_helptrace_bufsize. */ -uint32_t dtrace_helptrace_next = 0; -uint32_t dtrace_helptrace_nlocals; -char *dtrace_helptrace_buffer; -int dtrace_helptrace_bufsize = 512 * 1024; - -#ifdef DEBUG -int dtrace_helptrace_enabled = 1; -#else -int dtrace_helptrace_enabled = 0; -#endif +int dtrace_helptrace_enable = 0; +int dtrace_helptrace_disable = 0; +int dtrace_helptrace_bufsize = 16 * 1024 * 1024; +uint32_t dtrace_helptrace_nlocals; +static dtrace_helptrace_t *dtrace_helptrace_buffer; +static uint32_t dtrace_helptrace_next = 0; +static int dtrace_helptrace_wrapped = 0; /* * DTrace Error Hashing @@ -373,8 +379,8 @@ static kmutex_t dtrace_errlock; * disallow all negative sizes. Ranges of size 0 are allowed. */ #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \ - ((testaddr) - (baseaddr) < (basesz) && \ - (testaddr) + (testsz) - (baseaddr) <= (basesz) && \ + ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \ + (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \ (testaddr) + (testsz) >= (testaddr)) /* @@ -475,6 +481,8 @@ static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t, dtrace_optval_t); static int dtrace_ecb_create_enable(dtrace_probe_t *, void *); static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *); +static int dtrace_priv_proc(dtrace_state_t *, dtrace_mstate_t *); +static void dtrace_getf_barrier(void); /* * DTrace Probe Context Functions @@ -619,7 +627,7 @@ dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, * up both thread-local variables and any global dynamically-allocated * variables. 
*/ - if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base, + if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base, vstate->dtvs_dynvars.dtds_size)) { dtrace_dstate_t *dstate = &vstate->dtvs_dynvars; uintptr_t base = (uintptr_t)dstate->dtds_base + @@ -686,6 +694,7 @@ dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval; + file_t *fp; /* * If we hold the privilege to read from kernel memory, then @@ -703,10 +712,99 @@ dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, /* * We're allowed to read from our own string table. */ - if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab, + if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab, mstate->dtms_difo->dtdo_strlen)) return (1); + if (vstate->dtvs_state != NULL && + dtrace_priv_proc(vstate->dtvs_state, mstate)) { + proc_t *p; + + /* + * When we have privileges to the current process, there are + * several context-related kernel structures that are safe to + * read, even absent the privilege to read from kernel memory. + * These reads are safe because these structures contain only + * state that (1) we're permitted to read, (2) is harmless or + * (3) contains pointers to additional kernel state that we're + * not permitted to read (and as such, do not present an + * opportunity for privilege escalation). Finally (and + * critically), because of the nature of their relation with + * the current thread context, the memory associated with these + * structures cannot change over the duration of probe context, + * and it is therefore impossible for this memory to be + * deallocated and reallocated as something else while it's + * being operated upon. + */ + if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) + return (1); + + if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr, + sz, curthread->t_procp, sizeof (proc_t))) { + return (1); + } + + if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz, + curthread->t_cred, sizeof (cred_t))) { + return (1); + } + + if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz, + &(p->p_pidp->pid_id), sizeof (pid_t))) { + return (1); + } + + if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz, + curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) { + return (1); + } + } + + if ((fp = mstate->dtms_getf) != NULL) { + uintptr_t psz = sizeof (void *); + vnode_t *vp; + vnodeops_t *op; + + /* + * When getf() returns a file_t, the enabling is implicitly + * granted the (transient) right to read the returned file_t + * as well as the v_path and v_op->vnop_name of the underlying + * vnode. These accesses are allowed after a successful + * getf() because the members that they refer to cannot change + * once set -- and the barrier logic in the kernel's closef() + * path assures that the file_t and its referenced vnode_t + * cannot themselves be stale (that is, it is impossible for + * either dtms_getf itself or its f_vnode member to reference + * freed memory). 
+ */ + if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) + return (1); + + if ((vp = fp->f_vnode) != NULL) { + if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) + return (1); + + if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz, + vp->v_path, strlen(vp->v_path) + 1)) { + return (1); + } + + if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) + return (1); + + if ((op = vp->v_op) != NULL && + DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) { + return (1); + } + + if (op != NULL && op->vnop_name != NULL && + DTRACE_INRANGE(addr, sz, op->vnop_name, + strlen(op->vnop_name) + 1)) { + return (1); + } + } + } + DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV); *illval = addr; return (0); @@ -746,7 +844,7 @@ static int dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { - size_t sz; + size_t sz, strsize; ASSERT(type->dtdt_flags & DIF_TF_BYREF); /* @@ -756,11 +854,24 @@ dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate, if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) return (1); - if (type->dtdt_kind == DIF_TYPE_STRING) - sz = dtrace_strlen(src, - vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1; - else + if (type->dtdt_kind == DIF_TYPE_STRING) { + dtrace_state_t *state = vstate->dtvs_state; + + if (state != NULL) { + strsize = state->dts_options[DTRACEOPT_STRSIZE]; + } else { + /* + * In helper context, we have a NULL state; fall back + * to using the system-wide default for the string size + * in this case. + */ + strsize = dtrace_strsize_default; + } + + sz = dtrace_strlen(src, strsize) + 1; + } else { sz = type->dtdt_size; + } return (dtrace_canload((uintptr_t)src, sz, mstate, vstate)); } @@ -1085,8 +1196,7 @@ dtrace_priv_proc_common_zone(dtrace_state_t *state) */ ASSERT(s_cr != NULL); - if ((cr = CRED()) != NULL && - s_cr->cr_zone == cr->cr_zone) + if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone) return (1); return (0); @@ -1209,19 +1319,17 @@ dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate, mode = pops->dtps_mode(prov->dtpv_arg, probe->dtpr_id, probe->dtpr_arg); - ASSERT((mode & DTRACE_MODE_USER) || - (mode & DTRACE_MODE_KERNEL)); - ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) || - (mode & DTRACE_MODE_NOPRIV_DROP)); + ASSERT(mode & (DTRACE_MODE_USER | DTRACE_MODE_KERNEL)); + ASSERT(mode & (DTRACE_MODE_NOPRIV_RESTRICT | + DTRACE_MODE_NOPRIV_DROP)); } /* * If the dte_cond bits indicate that this consumer is only allowed to - * see user-mode firings of this probe, call the provider's dtps_mode() - * entry point to check that the probe was fired while in a user - * context. If that's not the case, use the policy specified by the - * provider to determine if we drop the probe or merely restrict - * operation. + * see user-mode firings of this probe, check that the probe was fired + * while in a user context. If that's not the case, use the policy + * specified by the provider to determine if we drop the probe or + * merely restrict operation. */ if (ecb->dte_cond & DTRACE_COND_USERMODE) { ASSERT(mode != DTRACE_MODE_NOPRIV_DROP); @@ -1288,6 +1396,15 @@ dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate, } } + /* + * By merits of being in this code path at all, we have limited + * privileges. If the provider has indicated that limited privileges + * are to denote restricted operation, strip off the ability to access + * arguments. 
+ */ + if (mode & DTRACE_MODE_LIMITEDPRIV_RESTRICT) + mstate->dtms_access &= ~DTRACE_ACCESS_ARGS; + return (1); } @@ -2924,7 +3041,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, } case DIF_VAR_CURTHREAD: - if (!dtrace_priv_kernel(state)) + if (!dtrace_priv_proc(state, mstate)) return (0); return ((uint64_t)(uintptr_t)curthread); @@ -4452,11 +4569,35 @@ case DIF_SUBR_GETMAJOR: break; } + case DIF_SUBR_GETF: { + uintptr_t fd = tupregs[0].dttk_value; + uf_info_t *finfo = &curthread->t_procp->p_user.u_finfo; + file_t *fp; + + if (!dtrace_priv_proc(state, mstate)) { + regs[rd] = NULL; + break; + } + + /* + * This is safe because fi_nfiles only increases, and the + * fi_list array is not freed when the array size doubles. + * (See the comment in flist_grow() for details on the + * management of the u_finfo structure.) + */ + fp = fd < finfo->fi_nfiles ? finfo->fi_list[fd].uf_file : NULL; + + mstate->dtms_getf = fp; + regs[rd] = (uintptr_t)fp; + break; + } + case DIF_SUBR_CLEANPATH: { char *dest = (char *)mstate->dtms_scratch_ptr, c; uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; uintptr_t src = tupregs[0].dttk_value; int i = 0, j = 0; + zone_t *z; if (!dtrace_strcanload(src, size, mstate, vstate)) { regs[rd] = NULL; @@ -4555,6 +4696,23 @@ next: } while (c != '\0'); dest[j] = '\0'; + + if (mstate->dtms_getf != NULL && + !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) && + (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) { + /* + * If we've done a getf() as a part of this ECB and we + * don't have kernel access (and we're not in the global + * zone), check if the path we cleaned up begins with + * the zone's root path, and trim it off if so. Note + * that this is an output cleanliness issue, not a + * security issue: knowing one's zone root path does + * not enable privilege escalation. 
+ */ + if (strstr(dest, z->zone_rootpath) == dest) + dest += strlen(z->zone_rootpath) - 1; + } + regs[rd] = (uintptr_t)dest; mstate->dtms_scratch_ptr += size; break; @@ -4939,71 +5097,50 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, pc = DIF_INSTR_LABEL(instr); break; case DIF_OP_RLDSB: - if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 1, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDSB: regs[rd] = (int8_t)dtrace_load8(regs[r1]); break; case DIF_OP_RLDSH: - if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 2, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDSH: regs[rd] = (int16_t)dtrace_load16(regs[r1]); break; case DIF_OP_RLDSW: - if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 4, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDSW: regs[rd] = (int32_t)dtrace_load32(regs[r1]); break; case DIF_OP_RLDUB: - if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 1, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDUB: regs[rd] = dtrace_load8(regs[r1]); break; case DIF_OP_RLDUH: - if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 2, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDUH: regs[rd] = dtrace_load16(regs[r1]); break; case DIF_OP_RLDUW: - if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 4, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDUW: regs[rd] = dtrace_load32(regs[r1]); break; case DIF_OP_RLDX: - if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 8, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDX: regs[rd] = dtrace_load64(regs[r1]); @@ -5940,6 +6077,8 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE; mstate.dtms_access = DTRACE_ACCESS_ARGS | DTRACE_ACCESS_PROC; + mstate.dtms_getf = NULL; + *flags &= ~CPU_DTRACE_ERROR; if (prov == dtrace_provider) { @@ -6736,7 +6875,7 @@ dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp) priv = DTRACE_PRIV_ALL; } else { *uidp = crgetuid(cr); - *zoneidp = crgetzoneid(cr); + *zoneidp = crgetzonedid(cr); priv = 0; if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) @@ -7232,7 +7371,7 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv, provider->dtpv_priv.dtpp_flags = priv; if (cr != NULL) { provider->dtpv_priv.dtpp_uid = crgetuid(cr); - provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr); + provider->dtpv_priv.dtpp_zoneid = crgetzonedid(cr); } provider->dtpv_pops = *pops; @@ -7843,6 +7982,7 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab) uint32_t priv; uid_t uid; zoneid_t zoneid; + dtrace_state_t *state = enab->dten_vstate->dtvs_state; ASSERT(MUTEX_HELD(&dtrace_lock)); dtrace_ecb_create_cache = NULL; @@ -7857,8 +7997,22 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab) } dtrace_probekey(desc, &pkey); - dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred, - 
&priv, &uid, &zoneid); + dtrace_cred2priv(state->dts_cred.dcr_cred, &priv, &uid, &zoneid); + + if ((priv & DTRACE_PRIV_ZONEOWNER) && + state->dts_options[DTRACEOPT_ZONE] != DTRACEOPT_UNSET) { + /* + * If we have the privilege of instrumenting all zones but we + * have been told to instrument but one, we will spoof this up + * depriving ourselves of DTRACE_PRIV_ZONEOWNER for purposes + * of dtrace_match(). (Note that DTRACEOPT_ZONE is not for + * security but rather for performance: it allows the global + * zone to instrument USDT probes in a local zone without + * requiring all zones to be instrumented.) + */ + priv &= ~DTRACE_PRIV_ZONEOWNER; + zoneid = state->dts_options[DTRACEOPT_ZONE]; + } return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab)); @@ -8443,6 +8597,20 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, subr == DIF_SUBR_COPYOUTSTR) { dp->dtdo_destructive = 1; } + + if (subr == DIF_SUBR_GETF) { + /* + * If we have a getf() we need to record that + * in our state. Note that our state can be + * NULL if this is a helper -- but in that + * case, the call to getf() is itself illegal, + * and will be caught (slightly later) when + * the helper is validated. + */ + if (vstate->dtvs_state != NULL) + vstate->dtvs_state->dts_getf++; + } + break; case DIF_OP_PUSHTR: if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF) @@ -13085,6 +13253,22 @@ dtrace_state_go(dtrace_state_t *state, processorid_t *cpu) state->dts_activity = DTRACE_ACTIVITY_WARMUP; + if (state->dts_getf != 0 && + !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) { + /* + * We don't have kernel privs but we have at least one call + * to getf(); we need to bump our zone's count, and (if + * this is the first enabling to have an unprivileged call + * to getf()) we need to hook into closef(). + */ + state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++; + + if (dtrace_getf++ == 0) { + ASSERT(dtrace_closef == NULL); + dtrace_closef = dtrace_getf_barrier; + } + } + /* * Now it's time to actually fire the BEGIN probe. We need to disable * interrupts here both to record the CPU on which we fired the BEGIN @@ -13201,6 +13385,24 @@ dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu) state->dts_activity = DTRACE_ACTIVITY_STOPPED; dtrace_sync(); + if (state->dts_getf != 0 && + !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) { + /* + * We don't have kernel privs but we have at least one call + * to getf(); we need to lower our zone's count, and (if + * this is the last enabling to have an unprivileged call + * to getf()) we need to clear the closef() hook. + */ + ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0); + ASSERT(dtrace_closef == dtrace_getf_barrier); + ASSERT(dtrace_getf > 0); + + state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--; + + if (--dtrace_getf == 0) + dtrace_closef = NULL; + } + return (0); } @@ -13507,10 +13709,10 @@ dtrace_helper_trace(dtrace_helper_action_t *helper, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where) { uint32_t size, next, nnext, i; - dtrace_helptrace_t *ent; + dtrace_helptrace_t *ent, *buffer; uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags; - if (!dtrace_helptrace_enabled) + if ((buffer = dtrace_helptrace_buffer) == NULL) return; ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals); @@ -13538,10 +13740,12 @@ dtrace_helper_trace(dtrace_helper_action_t *helper, /* * We have our slot; fill it in. 
*/ - if (nnext == size) + if (nnext == size) { + dtrace_helptrace_wrapped++; next = 0; + } - ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next]; + ent = (dtrace_helptrace_t *)((uintptr_t)buffer + next); ent->dtht_helper = helper; ent->dtht_where = where; ent->dtht_nlocals = vstate->dtvs_nlocals; @@ -13575,7 +13779,7 @@ dtrace_helper(int which, dtrace_mstate_t *mstate, dtrace_helper_action_t *helper; dtrace_vstate_t *vstate; dtrace_difo_t *pred; - int i, trace = dtrace_helptrace_enabled; + int i, trace = dtrace_helptrace_buffer != NULL; ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS); @@ -14761,6 +14965,23 @@ dtrace_toxrange_add(uintptr_t base, uintptr_t limit) dtrace_toxranges++; } +static void +dtrace_getf_barrier() +{ + /* + * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings + * that contain calls to getf(), this routine will be called on every + * closef() before either the underlying vnode is released or the + * file_t itself is freed. By the time we are here, it is essential + * that the file_t can no longer be accessed from a call to getf() + * in probe context -- that assures that a dtrace_sync() can be used + * to clear out any enablings referring to the old structures. + */ + if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 || + kcred->cr_zone->zone_dtrace_getf != 0) + dtrace_sync(); +} + /* * DTrace Driver Cookbook Functions */ @@ -14875,17 +15096,6 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) mutex_exit(&cpu_lock); /* - * If DTrace helper tracing is enabled, we need to allocate the - * trace buffer and initialize the values. - */ - if (dtrace_helptrace_enabled) { - ASSERT(dtrace_helptrace_buffer == NULL); - dtrace_helptrace_buffer = - kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP); - dtrace_helptrace_next = 0; - } - - /* * If there are already providers, we must ask them to provide their * probes, and then match any anonymous enabling against them. Note * that there should be no other retained enablings at this time: @@ -14981,6 +15191,18 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) return (EBUSY); } + if (dtrace_helptrace_enable && dtrace_helptrace_buffer == NULL) { + /* + * If DTrace helper tracing is enabled, we need to allocate the + * trace buffer and initialize the values. + */ + dtrace_helptrace_buffer = + kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP); + dtrace_helptrace_next = 0; + dtrace_helptrace_wrapped = 0; + dtrace_helptrace_enable = 0; + } + state = dtrace_state_create(devp, cred_p); mutex_exit(&cpu_lock); @@ -15002,6 +15224,7 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) { minor_t minor = getminor(dev); dtrace_state_t *state; + dtrace_helptrace_t *buf = NULL; if (minor == DTRACEMNRN_HELPER) return (0); @@ -15019,6 +15242,18 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) dtrace_state_destroy(state->dts_anon); } + if (dtrace_helptrace_disable) { + /* + * If we have been told to disable helper tracing, set the + * buffer to NULL before calling into dtrace_state_destroy(); + * we take advantage of its dtrace_sync() to know that no + * CPU is in probe context with enabled helper tracing + * after it returns. 
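+	 *
+	 * The buffer itself is freed further below, only after
+	 * dtrace_state_destroy() has returned and that guarantee is in
+	 * place.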
+ */ + buf = dtrace_helptrace_buffer; + dtrace_helptrace_buffer = NULL; + } + dtrace_state_destroy(state); ASSERT(dtrace_opens > 0); @@ -15029,6 +15264,11 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); + if (buf != NULL) { + kmem_free(buf, dtrace_helptrace_bufsize); + dtrace_helptrace_disable = 0; + } + mutex_exit(&dtrace_lock); mutex_exit(&cpu_lock); @@ -15917,12 +16157,10 @@ dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) dtrace_modload = NULL; dtrace_modunload = NULL; - mutex_exit(&cpu_lock); + ASSERT(dtrace_getf == 0); + ASSERT(dtrace_closef == NULL); - if (dtrace_helptrace_enabled) { - kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize); - dtrace_helptrace_buffer = NULL; - } + mutex_exit(&cpu_lock); kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *)); dtrace_probes = NULL; diff --git a/usr/src/uts/common/dtrace/sdt_subr.c b/usr/src/uts/common/dtrace/sdt_subr.c index 242185071b..157acc25fc 100644 --- a/usr/src/uts/common/dtrace/sdt_subr.c +++ b/usr/src/uts/common/dtrace/sdt_subr.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/sdt_impl.h> @@ -97,26 +98,26 @@ static dtrace_pattr_t iscsi_attr = { }; sdt_provider_t sdt_providers[] = { - { "vtrace", "__vtrace_", &vtrace_attr, 0 }, - { "sysinfo", "__cpu_sysinfo_", &info_attr, 0 }, - { "vminfo", "__cpu_vminfo_", &info_attr, 0 }, - { "fpuinfo", "__fpuinfo_", &fpu_attr, 0 }, - { "sched", "__sched_", &stab_attr, 0 }, - { "proc", "__proc_", &stab_attr, 0 }, - { "io", "__io_", &stab_attr, 0 }, - { "ip", "__ip_", &stab_attr, 0 }, - { "tcp", "__tcp_", &stab_attr, 0 }, - { "udp", "__udp_", &stab_attr, 0 }, - { "mib", "__mib_", &stab_attr, 0 }, - { "fsinfo", "__fsinfo_", &fsinfo_attr, 0 }, - { "iscsi", "__iscsi_", &iscsi_attr, 0 }, - { "nfsv3", "__nfsv3_", &stab_attr, 0 }, - { "nfsv4", "__nfsv4_", &stab_attr, 0 }, - { "xpv", "__xpv_", &xpv_attr, 0 }, - { "fc", "__fc_", &fc_attr, 0 }, - { "srp", "__srp_", &fc_attr, 0 }, - { "sysevent", "__sysevent_", &stab_attr, 0 }, - { "sdt", NULL, &sdt_attr, 0 }, + { "vtrace", "__vtrace_", &vtrace_attr }, + { "sysinfo", "__cpu_sysinfo_", &info_attr, DTRACE_PRIV_USER }, + { "vminfo", "__cpu_vminfo_", &info_attr, DTRACE_PRIV_USER }, + { "fpuinfo", "__fpuinfo_", &fpu_attr }, + { "sched", "__sched_", &stab_attr, DTRACE_PRIV_USER }, + { "proc", "__proc_", &stab_attr, DTRACE_PRIV_USER }, + { "io", "__io_", &stab_attr }, + { "ip", "__ip_", &stab_attr }, + { "tcp", "__tcp_", &stab_attr }, + { "udp", "__udp_", &stab_attr }, + { "mib", "__mib_", &stab_attr }, + { "fsinfo", "__fsinfo_", &fsinfo_attr }, + { "iscsi", "__iscsi_", &iscsi_attr }, + { "nfsv3", "__nfsv3_", &stab_attr }, + { "nfsv4", "__nfsv4_", &stab_attr }, + { "xpv", "__xpv_", &xpv_attr }, + { "fc", "__fc_", &fc_attr }, + { "srp", "__srp_", &fc_attr }, + { "sysevent", "__sysevent_", &stab_attr }, + { "sdt", NULL, &sdt_attr }, { NULL } }; @@ -1155,6 +1156,20 @@ sdt_argdesc_t sdt_args[] = { }; /*ARGSUSED*/ +int +sdt_mode(void *arg, dtrace_id_t id, void *parg) +{ + /* + * We tell DTrace that we're in kernel mode, that the firing needs to + * be dropped for anything that doesn't have necessary privileges, and + * that it needs to be restricted for anything that has restricted + * (i.e., not all-zone) privileges. 
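+ *
+ * The practical effect: a consumer with restricted (not all-zone)
+ * privileges can still enable these probes, with access such as
+ * args[] curtailed via DTRACE_MODE_LIMITEDPRIV_RESTRICT, whereas a
+ * consumer lacking the necessary privilege has the firing dropped
+ * outright.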
+ */ + return (DTRACE_MODE_KERNEL | DTRACE_MODE_NOPRIV_DROP | + DTRACE_MODE_LIMITEDPRIV_RESTRICT); +} + +/*ARGSUSED*/ void sdt_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc) { diff --git a/usr/src/uts/common/fs/dev/sdev_subr.c b/usr/src/uts/common/fs/dev/sdev_subr.c index b814175e8a..f29918e351 100644 --- a/usr/src/uts/common/fs/dev/sdev_subr.c +++ b/usr/src/uts/common/fs/dev/sdev_subr.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* @@ -537,7 +538,7 @@ static struct sdev_vop_table vtab[] = SDEV_DYNAMIC | SDEV_VTOR }, { "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops, - devzvol_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR }, + devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR }, { "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE }, diff --git a/usr/src/uts/common/fs/dev/sdev_vnops.c b/usr/src/uts/common/fs/dev/sdev_vnops.c index fb1d93d06b..89c5decbf0 100644 --- a/usr/src/uts/common/fs/dev/sdev_vnops.c +++ b/usr/src/uts/common/fs/dev/sdev_vnops.c @@ -1142,9 +1142,21 @@ sdev_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp, struct sdev_node *parent = VTOSDEV(dvp); int error; - /* execute access is required to search the directory */ - if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) - return (error); + /* + * We must check that we have execute access to search the directory -- + * but because our sdev_contents lock is already held as a reader (the + * caller must have done a VOP_RWLOCK()), we call directly into the + * underlying access routine if sdev_attr is non-NULL. + */ + if (parent->sdev_attr != NULL) { + VERIFY(RW_READ_HELD(&parent->sdev_contents)); + + if (sdev_unlocked_access(parent, VEXEC, cred) != 0) + return (EACCES); + } else { + if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) + return (error); + } ASSERT(parent); if (!SDEV_IS_GLOBAL(parent)) diff --git a/usr/src/uts/common/fs/dev/sdev_zvolops.c b/usr/src/uts/common/fs/dev/sdev_zvolops.c index 89ce67dd68..09e5559701 100644 --- a/usr/src/uts/common/fs/dev/sdev_zvolops.c +++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. 
*/ /* vnode ops for the /dev/zvol directory */ @@ -47,6 +48,7 @@ static ldi_ident_t devzvol_li; static ldi_handle_t devzvol_lh; static kmutex_t devzvol_mtx; static boolean_t devzvol_isopen; +static major_t devzvol_major; /* * we need to use ddi_mod* since fs/dev gets loaded early on in @@ -61,12 +63,16 @@ int (*szn2m)(char *, minor_t *); int sdev_zvol_create_minor(char *dsname) { + if (szcm == NULL) + return (-1); return ((*szcm)(dsname)); } int sdev_zvol_name2minor(char *dsname, minor_t *minor) { + if (szn2m == NULL) + return (-1); return ((*szn2m)(dsname, minor)); } @@ -74,6 +80,7 @@ int devzvol_open_zfs() { int rc; + dev_t dv; devzvol_li = ldi_ident_from_anon(); if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred, @@ -94,6 +101,9 @@ devzvol_open_zfs() cmn_err(CE_WARN, "couldn't resolve zvol_name2minor"); return (rc); } + if (ldi_get_dev(devzvol_lh, &dv)) + return (-1); + devzvol_major = getmajor(dv); return (0); } @@ -270,6 +280,8 @@ devzvol_validate(struct sdev_node *dv) sdcmn_err13((" v_type %d do_type %d", SDEVTOV(dv)->v_type, do_type)); if ((SDEVTOV(dv)->v_type == VLNK && do_type != DMU_OST_ZVOL) || + ((SDEVTOV(dv)->v_type == VBLK || SDEVTOV(dv)->v_type == VCHR) && + do_type != DMU_OST_ZVOL) || (SDEVTOV(dv)->v_type == VDIR && do_type == DMU_OST_ZVOL)) { kmem_free(dsname, strlen(dsname) + 1); return (SDEV_VTOR_STALE); @@ -486,6 +498,82 @@ devzvol_prunedir(struct sdev_node *ddv) rw_downgrade(&ddv->sdev_contents); } +/* + * This function is used to create a dir or dev inside a zone's /dev when the + * zone has a zvol that is dynamically created within the zone (i.e. inside + * of a delegated dataset. Since there is no /devices tree within a zone, + * we create the chr/blk devices directly inside the zone's /dev instead of + * making symlinks. 
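+ *
+ * For example (dataset names here are hypothetical): if a zone with
+ * the delegated dataset "tank/delegated" runs
+ * "zfs create -V 1g tank/delegated/vol0", the nodes
+ * /dev/zvol/dsk/tank/delegated/vol0 and
+ * /dev/zvol/rdsk/tank/delegated/vol0 are created in the zone's /dev
+ * as real block and character devices respectively.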
+ */ +static int +devzvol_mk_ngz_node(struct sdev_node *parent, char *nm) +{ + struct vattr vattr; + timestruc_t now; + enum vtype expected_type = VDIR; + dmu_objset_type_t do_type; + struct sdev_node *dv = NULL; + int res; + char *dsname; + + bzero(&vattr, sizeof (vattr)); + gethrestime(&now); + vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID; + vattr.va_uid = SDEV_UID_DEFAULT; + vattr.va_gid = SDEV_GID_DEFAULT; + vattr.va_type = VNON; + vattr.va_atime = now; + vattr.va_mtime = now; + vattr.va_ctime = now; + + if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL) + return (ENOENT); + + if (devzvol_objset_check(dsname, &do_type) != 0) { + kmem_free(dsname, strlen(dsname) + 1); + return (ENOENT); + } + if (do_type == DMU_OST_ZVOL) + expected_type = VBLK; + + if (expected_type == VDIR) { + vattr.va_type = VDIR; + vattr.va_mode = SDEV_DIRMODE_DEFAULT; + } else { + minor_t minor; + dev_t devnum; + int rc; + + rc = sdev_zvol_create_minor(dsname); + if ((rc != 0 && rc != EEXIST && rc != EBUSY) || + sdev_zvol_name2minor(dsname, &minor)) { + kmem_free(dsname, strlen(dsname) + 1); + return (ENOENT); + } + + devnum = makedevice(devzvol_major, minor); + vattr.va_rdev = devnum; + + if (strstr(parent->sdev_path, "/rdsk/") != NULL) + vattr.va_type = VCHR; + else + vattr.va_type = VBLK; + vattr.va_mode = SDEV_DEVMODE_DEFAULT; + } + kmem_free(dsname, strlen(dsname) + 1); + + rw_enter(&parent->sdev_contents, RW_WRITER); + + res = sdev_mknode(parent, nm, &dv, &vattr, + NULL, NULL, kcred, SDEV_READY); + rw_exit(&parent->sdev_contents); + if (res != 0) + return (ENOENT); + + SDEV_RELE(dv); + return (0); +} + /*ARGSUSED*/ static int devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, @@ -505,9 +593,39 @@ devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, return (error); rw_enter(&parent->sdev_contents, RW_READER); - if (!SDEV_IS_GLOBAL(parent)) { + if (SDEV_IS_GLOBAL(parent)) { + /* + * During iter_datasets, don't create GZ dev when running in + * NGZ. We can't return ENOENT here since that could + * incorrectly trigger the creation of the dev from the + * recursive call through prof_filldir during iter_datasets. + */ + if (getzoneid() != GLOBAL_ZONEID) { + rw_exit(&parent->sdev_contents); + return (EPERM); + } + } else { + int res; + rw_exit(&parent->sdev_contents); - return (prof_lookup(dvp, nm, vpp, cred)); + res = prof_lookup(dvp, nm, vpp, cred); + + /* + * We won't find a zvol that was dynamically created inside + * a NGZ, within a delegated dataset, in the zone's dev profile + * but prof_lookup will also find it via sdev_cache_lookup. + */ + if (res == ENOENT) { + /* + * We have to create the sdev node for the dymamically + * created zvol. + */ + if (devzvol_mk_ngz_node(parent, nm) != 0) + return (ENOENT); + res = prof_lookup(dvp, nm, vpp, cred); + } + + return (res); } dsname = devzvol_make_dsname(parent->sdev_path, nm); @@ -613,8 +731,10 @@ sdev_iter_datasets(struct vnode *dvp, int arg, char *name) } else if (rc == ENOENT) { goto skip; } else { - /* EBUSY == problem with zvols's dmu holds? */ - ASSERT(0); + /* + * EBUSY == problem with zvols's dmu holds? + * EPERM when in a NGZ and traversing up and out. 
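+		 * The EPERM case is the one devzvol_lookup() returns when
+		 * a non-global zone reaches a global-zone directory through
+		 * the recursive prof_filldir call; both cases are non-fatal
+		 * and are simply skipped.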
+ */ goto skip; } if (arg == ZFS_IOC_DATASET_LIST_NEXT && diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c new file mode 100644 index 0000000000..16068e35ee --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c @@ -0,0 +1,640 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +static int hldir_make_hlnode(hlnode_t *, hlfsmount_t *, vattr_t *, enum de_op, + vnode_t *, hlnode_t **, cred_t *); +static int hldiraddentry(hlnode_t *, hlnode_t *, char *); + + +#define HL_HASH_SIZE 8192 /* must be power of 2 */ +#define HL_MUTEX_SIZE 64 + +static hldirent_t *hl_hashtable[HL_HASH_SIZE]; +static kmutex_t hl_hashmutex[HL_MUTEX_SIZE]; + +#define HL_HASH_INDEX(a) ((a) & (HL_HASH_SIZE-1)) +#define HL_MUTEX_INDEX(a) ((a) & (HL_MUTEX_SIZE-1)) + +#define HYPRLOFS_HASH(tp, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(tp) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + } + +void +hyprlofs_hash_init(void) +{ + int ix; + + for (ix = 0; ix < HL_MUTEX_SIZE; ix++) + mutex_init(&hl_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL); +} + +static void +hyprlofs_hash_in(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + HYPRLOFS_HASH(h->hld_parent, h->hld_name, hash); + h->hld_hash = hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + h->hld_link = *prevpp; + *prevpp = h; + mutex_exit(hmtx); +} + +/* Remove hldirent *h from the hash list. 
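+ * The bucket and mutex are recomputed from the hld_hash value saved
+ * at insertion time, so removal does not need to rehash the name.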
*/ +static void +hyprlofs_hash_out(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + hash = h->hld_hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + while (*prevpp != h) + prevpp = &(*prevpp)->hld_link; + *prevpp = h->hld_link; + mutex_exit(hmtx); +} + +static hldirent_t * +hyprlofs_hash_lookup(char *name, hlnode_t *parent, uint_t hold, + hlnode_t **found) +{ + hldirent_t *l; + uint_t hash; + kmutex_t *hmtx; + hlnode_t *hnp; + + HYPRLOFS_HASH(parent, name, hash); + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + l = hl_hashtable[HL_HASH_INDEX(hash)]; + while (l) { + if (l->hld_hash == hash && l->hld_parent == parent && + strcmp(l->hld_name, name) == 0) { + /* + * Ensure that the hlnode that we put a hold on is the + * same one that we pass back. Thus the temp. var + * hnp is necessary. + */ + hnp = l->hld_hlnode; + if (hold) { + ASSERT(hnp); + hlnode_hold(hnp); + } + if (found) + *found = hnp; + mutex_exit(hmtx); + return (l); + } else { + l = l->hld_link; + } + } + mutex_exit(hmtx); + return (NULL); +} + +/* + * Search directory 'parent' for entry 'name'. + * + * The calling thread can't hold the write version of the rwlock for the + * directory being searched + * + * On success *foundtp points to the found hlnode with its vnode held. + */ +int +hyprlofs_dirlookup(hlnode_t *parent, char *name, hlnode_t **foundtp, cred_t *cr) +{ + int error; + + *foundtp = NULL; + if (parent->hln_type != VDIR) + return (ENOTDIR); + + if ((error = hyprlofs_taccess(parent, VEXEC, cr))) + return (error); + + if (*name == '\0') { + hlnode_hold(parent); + *foundtp = parent; + return (0); + } + + /* + * Search the directory for the matching name. We need the lock + * protecting the hln_dir list so that it doesn't change out from + * underneath us. hyprlofs_hash_lookup() will pass back the hlnode + * with a hold on it. + */ + if (hyprlofs_hash_lookup(name, parent, 1, foundtp) != NULL) { + ASSERT(*foundtp); + return (0); + } + + return (ENOENT); +} + +/* + * Enter a directory entry (either a file or subdir, depending on op) for + * 'name' and 'hp' into directory 'dir' + */ +int +hyprlofs_direnter( + hlfsmount_t *hm, + hlnode_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + vnode_t *realvp, /* real vnode */ + vattr_t *va, + hlnode_t **hpp, /* return hlnode */ + cred_t *cr) +{ + hldirent_t *hdp; + hlnode_t *found = NULL; + hlnode_t *hp; + int error = 0; + char *s; + + /* hln_rwlock is held to serialize direnter and dirdeletes */ + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + /* Don't allow '/' characters in pathname component */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("hyprlofs_direnter: NULL name"); + + /* + * This might be a "dangling detached directory". It could have been + * removed, but a reference to it kept in u_cwd. Don't bother searching + * it, and with any luck the user will get tired of dealing with us and + * cd to some absolute pathway. This is in ufs, too. + */ + if (dir->hln_nlink == 0) { + return (ENOENT); + } + + /* Search for the entry. Return "found" if it exists. 
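+	 * The third argument asks hyprlofs_hash_lookup() to take a hold
+	 * on the existing hlnode so that the EEXIST path below can hand
+	 * it back to the caller (or release it if the caller did not ask
+	 * for it).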
*/ + hdp = hyprlofs_hash_lookup(name, dir, 1, &found); + + if (hdp) { + ASSERT(found); + switch (op) { + case DE_CREATE: + case DE_MKDIR: + if (hpp) { + *hpp = found; + error = EEXIST; + } else { + hlnode_rele(found); + } + break; + } + } else { + + /* + * The entry does not exist. Check write perms in dir to see if + * entry can be created. + */ + if ((error = hyprlofs_taccess(dir, VWRITE, cr))) + return (error); + + /* Make new hlnode and directory entry as required. */ + if ((error = hldir_make_hlnode(dir, hm, va, op, realvp, &hp, + cr))) + return (error); + + if ((error = hldiraddentry(dir, hp, name))) { + /* Unmake the inode we just made. */ + rw_enter(&hp->hln_rwlock, RW_WRITER); + if ((hp->hln_type) == VDIR) { + ASSERT(hdp == NULL); + /* cleanup allocs made by hyprlofs_dirinit() */ + hyprlofs_dirtrunc(hp); + } + mutex_enter(&hp->hln_tlock); + hp->hln_nlink = 0; + mutex_exit(&hp->hln_tlock); + gethrestime(&hp->hln_ctime); + rw_exit(&hp->hln_rwlock); + hlnode_rele(hp); + hp = NULL; + } else if (hpp) { + *hpp = hp; + } else { + hlnode_rele(hp); + } + } + + return (error); +} + +/* + * Delete entry hp of name "nm" from dir. Free dir entry space and decrement + * link count on hlnode(s). + */ +int +hyprlofs_dirdelete(hlnode_t *dir, hlnode_t *hp, char *nm, enum dr_op op, + cred_t *cr) +{ + hldirent_t *hpdp; + int error; + size_t namelen; + hlnode_t *hnp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(RW_WRITE_HELD(&hp->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (nm[0] == '\0') + panic("hyprlofs_dirdelete: NULL name for %p", (void *)hp); + + /* return error if removing . or .. */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.' && nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if ((error = hyprlofs_taccess(dir, VEXEC|VWRITE, cr)) != 0) + return (error); + + if (dir->hln_dir == NULL) + return (ENOENT); + + hpdp = hyprlofs_hash_lookup(nm, dir, 0, &hnp); + if (hpdp == NULL) { + /* + * If it is gone, some other thread got here first! + * Return error ENOENT. + */ + return (ENOENT); + } + + /* + * If the hlnode in the hldirent changed (shouldn't happen since we + * don't support rename) then original is gone, so return that status + * (same as UFS). + */ + if (hp != hnp) + return (ENOENT); + + hyprlofs_hash_out(hpdp); + + /* Take hpdp out of the directory list. */ + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + if (hpdp->hld_prev) { + hpdp->hld_prev->hld_next = hpdp->hld_next; + } + if (hpdp->hld_next) { + hpdp->hld_next->hld_prev = hpdp->hld_prev; + } + + /* + * If the roving slot pointer happens to match hpdp, point it at the + * previous dirent. + */ + if (dir->hln_dir->hld_prev == hpdp) { + dir->hln_dir->hld_prev = hpdp->hld_prev; + } + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + /* hpdp points to the correct directory entry */ + namelen = strlen(hpdp->hld_name) + 1; + + hyprlofs_memfree(hpdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + hp->hln_ctime = now; + + ASSERT(hp->hln_nlink > 0); + DECR_COUNT(&hp->hln_nlink, &hp->hln_tlock); + if (op == DR_RMDIR && hp->hln_type == VDIR) { + hyprlofs_dirtrunc(hp); + ASSERT(hp->hln_nlink == 0); + } + return (0); +} + +/* + * hyprlofs_dirinit initializes a dir with '.' and '..' 
entries without + * checking perms and locking + */ +void +hyprlofs_dirinit( + hlnode_t *parent, /* parent of directory to initialize */ + hlnode_t *dir) /* the new directory */ +{ + hldirent_t *dot, *dotdot; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&parent->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + dot = hyprlofs_memalloc(sizeof (hldirent_t) + 2, HL_MUSTHAVE); + dotdot = hyprlofs_memalloc(sizeof (hldirent_t) + 3, HL_MUSTHAVE); + + /* Initialize the entries */ + dot->hld_hlnode = dir; + dot->hld_offset = 0; + dot->hld_name = (char *)dot + sizeof (hldirent_t); + dot->hld_name[0] = '.'; + dot->hld_parent = dir; + hyprlofs_hash_in(dot); + + dotdot->hld_hlnode = parent; + dotdot->hld_offset = 1; + dotdot->hld_name = (char *)dotdot + sizeof (hldirent_t); + dotdot->hld_name[0] = '.'; + dotdot->hld_name[1] = '.'; + dotdot->hld_parent = dir; + hyprlofs_hash_in(dotdot); + + /* Initialize directory entry list. */ + dot->hld_next = dotdot; + dot->hld_prev = dotdot; + dotdot->hld_next = NULL; + dotdot->hld_prev = dot; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + /* + * Since hyprlofs_dirinit is called with both dir and parent being the + * same for the root vnode, we need to increment this before we set + * hln_nlink = 2 below. + */ + INCR_COUNT(&parent->hln_nlink, &parent->hln_tlock); + parent->hln_ctime = now; + + dir->hln_dir = dot; + dir->hln_size = 2 * sizeof (hldirent_t) + 5; /* dot and dotdot */ + dir->hln_dirents = 2; + dir->hln_nlink = 2; +} + + +/* + * hyprlofs_dirtrunc removes all dir entries under this dir. + */ +void +hyprlofs_dirtrunc(hlnode_t *dir) +{ + hldirent_t *hdp; + hlnode_t *tp; + size_t namelen; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (dir->hln_looped) + return; + + for (hdp = dir->hln_dir; hdp; hdp = dir->hln_dir) { + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hdp->hld_hlnode); + + dir->hln_dir = hdp->hld_next; + namelen = strlen(hdp->hld_name) + 1; + + /* + * Adjust the link counts to account for this dir entry removal. + */ + tp = hdp->hld_hlnode; + + ASSERT(tp->hln_nlink > 0); + DECR_COUNT(&tp->hln_nlink, &tp->hln_tlock); + + hyprlofs_hash_out(hdp); + + hyprlofs_memfree(hdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + } + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + ASSERT(dir->hln_dir == NULL); + ASSERT(dir->hln_size == 0); + ASSERT(dir->hln_dirents == 0); +} + +static int +hldiraddentry( + hlnode_t *dir, /* target directory to make entry in */ + hlnode_t *hp, /* new hlnode */ + char *name) +{ + hldirent_t *hdp, *hpdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent dir wasn't removed from underneath the caller. + */ + if (dir->hln_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same FS. */ + if (hp->hln_vnode->v_vfsp != dir->hln_vnode->v_vfsp) + return (EXDEV); + + /* Alloc and init dir entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (hldirent_t); + hdp = hyprlofs_memalloc(alloc_size, 0); + if (hdp == NULL) + return (ENOSPC); + + dir->hln_size += alloc_size; + dir->hln_dirents++; + hdp->hld_hlnode = hp; + hdp->hld_parent = dir; + + /* The dir entry and its name were allocated sequentially. 
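+	 * That is, hld_name points just past the hldirent_t within the
+	 * same hyprlofs_memalloc() block, so a single hyprlofs_memfree()
+	 * of (sizeof (hldirent_t) + namelen) later releases both the
+	 * entry and its name.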
*/ + hdp->hld_name = (char *)hdp + sizeof (hldirent_t); + (void) strcpy(hdp->hld_name, name); + + hyprlofs_hash_in(hdp); + + /* + * Some utilities expect the size of a directory to remain fairly + * static. For example, a routine which unlinks files between calls to + * readdir(); the size of the dir changes from underneath it and so the + * real dir offset in bytes is invalid. To circumvent this problem, we + * initialize a dir entry with a phony offset, and use this offset to + * determine end of file in hyprlofs_readdir. + */ + hpdp = dir->hln_dir->hld_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (hpdp->hld_next != NULL && (hpdp->hld_next->hld_offset - + hpdp->hld_offset) <= 1) { + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + ASSERT(hpdp->hld_next->hld_offset > hpdp->hld_offset); + hpdp = hpdp->hld_next; + } + hdp->hld_offset = hpdp->hld_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which is + * necessarily the largest offset in this dir) is more than twice the + * number of dirents, that means the dir is 50% holes. At this point + * we reset the slot pointer back to the beginning of the dir so we + * start using the holes. The idea is that if there are N dirents, + * there must also be N holes, so we can satisfy the next N creates by + * walking at most 2N entries; thus the average cost of a create is + * constant. Note that we use the first dirent's hld_prev as the roving + * slot pointer. This saves a word in every dirent. + */ + if (hpdp->hld_next == NULL && hpdp->hld_offset > 2 * dir->hln_dirents) + dir->hln_dir->hld_prev = dir->hln_dir->hld_next; + else + dir->hln_dir->hld_prev = hdp; + + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + hdp->hld_next = hpdp->hld_next; + if (hdp->hld_next) { + hdp->hld_next->hld_prev = hdp; + } + hdp->hld_prev = hpdp; + hpdp->hld_next = hdp; + + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + return (0); +} + +static int +hldir_make_hlnode(hlnode_t *dir, hlfsmount_t *hm, vattr_t *va, enum de_op op, + vnode_t *realvp, hlnode_t **newnode, cred_t *cr) +{ + hlnode_t *hp; + enum vtype type; + + ASSERT(va != NULL); + ASSERT(op == DE_CREATE || op == DE_MKDIR); + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + type = va->va_type; + hp = hyprlofs_memalloc(sizeof (hlnode_t), HL_MUSTHAVE); + hyprlofs_node_init(hm, hp, va, cr); + + hp->hln_vnode->v_rdev = hp->hln_rdev = NODEV; + hp->hln_vnode->v_type = type; + hp->hln_uid = crgetuid(cr); + + /* + * To determine the gid of the created file: + * If the directory's set-gid bit is set, set the gid to the gid + * of the parent dir, otherwise, use the process's gid. + */ + if (dir->hln_mode & VSGID) + hp->hln_gid = dir->hln_gid; + else + hp->hln_gid = crgetgid(cr); + + /* + * If we're creating a dir and the parent dir has the set-GID bit set, + * set it on the new dir. Otherwise, if the user is neither privileged + * nor a member of the file's new group, clear the file's set-GID bit. 
+ */ + if (dir->hln_mode & VSGID && type == VDIR) + hp->hln_mode |= VSGID; + else { + if ((hp->hln_mode & VSGID) && + secpolicy_vnode_setids_setgids(cr, hp->hln_gid) != 0) + hp->hln_mode &= ~VSGID; + } + + if (va->va_mask & AT_ATIME) + hp->hln_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + hp->hln_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + hyprlofs_dirinit(dir, hp); + hp->hln_looped = 0; + } else { + hp->hln_realvp = realvp; + hp->hln_size = va->va_size; + hp->hln_looped = 1; + } + + *newnode = hp; + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c new file mode 100644 index 0000000000..bf71b2bfcb --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c @@ -0,0 +1,154 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> +#include <sys/time.h> +#include <sys/cmn_err.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/vfs.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/atomic.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +#define MODESHIFT 3 + +/* Initialize a hlnode and add it to file list under mount point. */ +void +hyprlofs_node_init(hlfsmount_t *hm, hlnode_t *h, vattr_t *vap, cred_t *cr) +{ + vnode_t *vp; + timestruc_t now; + + ASSERT(vap != NULL); + + rw_init(&h->hln_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&h->hln_tlock, NULL, MUTEX_DEFAULT, NULL); + h->hln_mode = MAKEIMODE(vap->va_type, vap->va_mode); + h->hln_mask = 0; + h->hln_type = vap->va_type; + h->hln_nodeid = (ino64_t)(uint32_t)((uintptr_t)h >> 3); + h->hln_nlink = 1; + h->hln_size = 0; + + if (cr == NULL) { + h->hln_uid = vap->va_uid; + h->hln_gid = vap->va_gid; + } else { + h->hln_uid = crgetuid(cr); + h->hln_gid = crgetgid(cr); + } + + h->hln_fsid = hm->hlm_dev; + h->hln_rdev = vap->va_rdev; + h->hln_blksize = PAGESIZE; + h->hln_nblocks = 0; + gethrestime(&now); + h->hln_atime = now; + h->hln_mtime = now; + h->hln_ctime = now; + h->hln_seq = 0; + h->hln_dir = NULL; + + h->hln_vnode = vn_alloc(KM_SLEEP); + vp = HLNTOV(h); + vn_setops(vp, hyprlofs_vnodeops); + vp->v_vfsp = hm->hlm_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)h; + mutex_enter(&hm->hlm_contents); + /* + * Increment the pseudo generation number for this hlnode. Since + * hlnodes are allocated and freed, there really is no particular + * generation number for a new hlnode. Just fake it by using a + * counter in each file system. 
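+	 * The (hln_nodeid, hln_gen) pair is what hyprlofs_vget() later
+	 * compares against the hlfid_t presented by a client, so a file
+	 * handle naming a since-recycled hlnode is rejected rather than
+	 * resolved to the wrong file.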
+ */ + h->hln_gen = hm->hlm_gen++; + + /* + * Add new hlnode to end of linked list of hlnodes for this hyprlofs + * Root dir is handled specially in hyprlofs_mount. + */ + if (hm->hlm_rootnode != (hlnode_t *)NULL) { + h->hln_forw = NULL; + h->hln_back = hm->hlm_rootnode->hln_back; + h->hln_back->hln_forw = hm->hlm_rootnode->hln_back = h; + } + mutex_exit(&hm->hlm_contents); + vn_exists(vp); +} + +int +hyprlofs_taccess(void *vtp, int mode, cred_t *cr) +{ + hlnode_t *hp = vtp; + int shift = 0; + + /* Check access based on owner, group and public perms in hlnode. */ + if (crgetuid(cr) != hp->hln_uid) { + shift += MODESHIFT; + if (groupmember(hp->hln_gid, cr) == 0) + shift += MODESHIFT; + } + + return (secpolicy_vnode_access2(cr, HLNTOV(hp), hp->hln_uid, + hp->hln_mode << shift, mode)); +} + +/* + * Allocate zeroed memory if hyprlofs_maxkmem has not been exceeded or the + * 'musthave' flag is set. 'musthave' allocations should always be subordinate + * to normal allocations so that hyprlofs_maxkmem can't be exceeded by more + * than a few KB. E.g. when creating a new dir, the hlnode is a normal + * allocation; if that succeeds, the dirents for "." and ".." are 'musthave' + * allocations. + */ +void * +hyprlofs_memalloc(size_t size, int musthave) +{ + if (atomic_add_long_nv(&hyprlofs_kmemspace, size) < hyprlofs_maxkmem || + musthave) + return (kmem_zalloc(size, KM_SLEEP)); + + atomic_add_long(&hyprlofs_kmemspace, -size); + cmn_err(CE_WARN, "hyprlofs over memory limit"); + return (NULL); +} + +void +hyprlofs_memfree(void *cp, size_t size) +{ + kmem_free(cp, size); + atomic_add_long(&hyprlofs_kmemspace, -size); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c new file mode 100644 index 0000000000..e8af803529 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c @@ -0,0 +1,626 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* + * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and + * lofs(7FS) file systems. It is modeled on code from both of these file + * systems. + * + * The purpose is to create a high performance name space for files on which + * applications will compute. Given a large number of data files with various + * owners, we want to construct a view onto those files such that only a subset + * is visible to the applications and such that the view can be changed very + * quickly as compute progresses. Entries in the name space are not mounts and + * thus do not appear in the mnttab. Entries in the name space are allowed to + * refer to files on different backing file systems. 
Intermediate directories + * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes + * in the name space except for entries that refer to backing files ala lofs. + * + * The name space is managed via ioctls issued on the mounted file system and + * is mostly read-only for the compute applications. That is, applications + * cannot create new files in the name space. If a file is unlinked by an + * application, that only removes the file from the name space, the backing + * file remains in place. It is possible for applications to write-through to + * the backing files if the file system is mounted read-write. + * + * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES, + * and HYPRLOFS_RM_ALL ioctls on the top-level mount. + * + * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and + * the name(s) for the file(s) in the name space. The name(s) may be path(s) + * which will be relative to the root of the mount and thus cannot begin with + * a /. If the name is a path, it does not have to correspond to any backing + * path. The intermediate directories will only exist in the name space. The + * entry(ies) will be added to the name space. + * + * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the + * name space which should be removed. The name(s) may be path(s) which will + * be relative to the root of the mount and thus cannot begin with a /. The + * named entry(ies) will be removed. + * + * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/debug.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <fs/fs_subr.h> +#include <vm/page.h> +#include <vm/anon.h> +#include <sys/model.h> +#include <sys/policy.h> + +#include <sys/fs/swapnode.h> +#include <sys/fs/hyprlofs_info.h> + +static int hyprlofsfstype; + +/* + * hyprlofs vfs operations. 
+ */ +static int hyprlofsinit(int, char *); +static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +static int hyprlofs_unmount(vfs_t *, int, cred_t *); +static int hyprlofs_root(vfs_t *, vnode_t **); +static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *); +static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static mntopts_t hyprlofs_mntopts; + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "hyprlofs", + hyprlofsinit, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, + &hyprlofs_mntopts +}; + +static mntopts_t hyprlofs_mntopts = { + 0, NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "filesystem for hyprlofs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + error = mod_remove(&modlinkage); + if (error) + return (error); + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(hyprlofsfstype); + vn_freevnodeops(hyprlofs_vnodeops); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * The following are patchable variables limiting the amount of system + * resources hyprlofs can use. + * + * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can + * use for it's data structures (e.g. hlnodes, directory entries). It is set + * as a percentage of physical memory which is determined when hyprlofs is + * first used in the system. + * + * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for + * the rest of the system. If the amount of free swap space in the system + * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon + * allocations will fail. + */ +size_t hyprlofs_maxkmem = 0; +size_t hyprlofs_minfree = 0; +size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */ + +static major_t hyprlofs_major; +static minor_t hyprlofs_minor; +static kmutex_t hyprlofs_minor_lock; + +/* + * initialize global hyprlofs locks and hashes when loading hyprlofs module + */ +static int +hyprlofsinit(int fstype, char *name) +{ + static const fs_operation_def_t hl_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount }, + VFSNAME_ROOT, { .vfs_root = hyprlofs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs }, + VFSNAME_VGET, { .vfs_vget = hyprlofs_vget }, + NULL, NULL + }; + int error; + extern void hyprlofs_hash_init(); + + hyprlofs_hash_init(); + hyprlofsfstype = fstype; + ASSERT(hyprlofsfstype != 0); + + error = vfs_setfsops(fstype, hl_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, hyprlofs_vnodeops_template, + &hyprlofs_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template"); + return (error); + } + + /* + * hyprlofs_minfree is an absolute limit of swap space which still + * allows other processes to execute. Set it if its not patched. 
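+ * (It is an ordinary patchable global, so it can also be set from
+ * /etc/system, e.g. "set hyprlofs:hyprlofs_minfree = 0x2000" for a
+ * hypothetical 8192-page floor.)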
+ */ + if (hyprlofs_minfree == 0) + hyprlofs_minfree = btopr(HYPRLOFSMINFREE); + + /* + * The maximum amount of space hyprlofs can allocate is + * HYPRLOFSMAXPROCKMEM percent of kernel memory + */ + if (hyprlofs_maxkmem == 0) + hyprlofs_maxkmem = + MAX(PAGESIZE, kmem_maxavail() / HYPRLOFSMAXFRACKMEM); + + if ((hyprlofs_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "hyprlofsinit: Can't get unique device number."); + hyprlofs_major = 0; + } + mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +static int +hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + hlfsmount_t *hm = NULL; + hlnode_t *hp; + struct pathname dpn; + int error; + vattr_t rattr; + int got_attrs; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (uap->flags & MS_REMOUNT) + return (EBUSY); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* Having the resource be anything but "swap" doesn't make sense. */ + vfs_setresource(vfsp, "swap", 0); + + if ((error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, + &dpn)) != 0) + goto out; + + if ((hm = hyprlofs_memalloc(sizeof (hlfsmount_t), 0)) == NULL) { + pn_free(&dpn); + error = ENOMEM; + goto out; + } + + /* Get an available minor device number for this mount */ + mutex_enter(&hyprlofs_minor_lock); + do { + hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32; + hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor); + } while (vfs_devismounted(hm->hlm_dev)); + mutex_exit(&hyprlofs_minor_lock); + + /* + * Set but don't bother entering the mutex since hlfsmount is not on + * the mount list yet. + */ + mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL); + + hm->hlm_vfsp = vfsp; + + vfsp->vfs_data = (caddr_t)hm; + vfsp->vfs_fstype = hyprlofsfstype; + vfsp->vfs_dev = hm->hlm_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype); + hm->hlm_mntpath = hyprlofs_memalloc(dpn.pn_pathlen + 1, HL_MUSTHAVE); + (void) strcpy(hm->hlm_mntpath, dpn.pn_path); + + /* allocate and initialize root hlnode structure */ + bzero(&rattr, sizeof (vattr_t)); + rattr.va_mode = (mode_t)(S_IFDIR | 0777); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + hp = hyprlofs_memalloc(sizeof (hlnode_t), HL_MUSTHAVE); + hyprlofs_node_init(hm, hp, &rattr, cr); + + /* Get the mode, uid, and gid from the underlying mount point. */ + rattr.va_mask = AT_MODE|AT_UID|AT_GID; + got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + HLNTOV(hp)->v_flag |= VROOT; + + /* + * If the getattr succeeded, use its results, otherwise allow the + * previously set defaults to prevail. 
+ */ + if (got_attrs == 0) { + hp->hln_mode = rattr.va_mode; + hp->hln_uid = rattr.va_uid; + hp->hln_gid = rattr.va_gid; + } + + /* + * Initialize linked list of hlnodes so that the back pointer of the + * root hlnode always points to the last one on the list and the + * forward pointer of the last node is null + */ + hp->hln_back = hp; + hp->hln_forw = NULL; + hp->hln_nlink = 0; + hm->hlm_rootnode = hp; + + hyprlofs_dirinit(hp, hp); + + rw_exit(&hp->hln_rwlock); + + pn_free(&dpn); + error = 0; + +out: + return (error); +} + +static int +hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hnp, *cancel; + vnode_t *vp; + int error; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + /* + * forced unmount is not supported by this file system + * and thus, ENOTSUP, is being returned. + */ + if (flag & MS_FORCE) + return (ENOTSUP); + + mutex_enter(&hm->hlm_contents); + + /* + * If there are no open files, only the root node should have a ref cnt. + * With hlm_contents held, nothing can be added or removed. There may + * be some dirty pages. To prevent fsflush from disrupting the unmount, + * put a hold on each node while scanning. If we find a previously + * referenced node, undo the holds we have placed and fail EBUSY. + */ + hnp = hm->hlm_rootnode; + if (HLNTOV(hnp)->v_count > 1) { + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + + for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) { + if ((vp = HLNTOV(hnp))->v_count > 0) { + cancel = hm->hlm_rootnode->hln_forw; + while (cancel != hnp) { + vp = HLNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->hln_forw; + } + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + VN_HOLD(vp); + } + + /* We can drop the mutex now because no one can find this mount */ + mutex_exit(&hm->hlm_contents); + + /* + * Free all alloc'd memory associated with this FS. To do this, we go + * through the file list twice, once to remove all the dir entries, and + * then to remove all the files. + */ + + /* Remove all directory entries */ + for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) { + rw_enter(&hnp->hln_rwlock, RW_WRITER); + if (hnp->hln_type == VDIR) + hyprlofs_dirtrunc(hnp); + rw_exit(&hnp->hln_rwlock); + } + + ASSERT(hm->hlm_rootnode); + + /* + * All links are gone, v_count is keeping nodes in place. VN_RELE + * should make the node disappear, unless somebody is holding pages + * against it. Wait and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on a hlnode + * from blowing it away (in hyprlofs_inactive) while we're trying to + * get to it here. Once we have a HOLD on it we know it'll stick around. + */ + mutex_enter(&hm->hlm_contents); + + /* Remove all the files (except the rootnode) backwards. */ + while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) { + mutex_exit(&hm->hlm_contents); + /* Note we handled the link count in pass 2 above. */ + vp = HLNTOV(hnp); + VN_RELE(vp); + mutex_enter(&hm->hlm_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again. 
+ */ + if (hnp == hm->hlm_rootnode->hln_back) { + VN_HOLD(vp); + mutex_exit(&hm->hlm_contents); + delay(hz / 4); + mutex_enter(&hm->hlm_contents); + } + } + mutex_exit(&hm->hlm_contents); + + VN_RELE(HLNTOV(hm->hlm_rootnode)); + + ASSERT(hm->hlm_mntpath); + + hyprlofs_memfree(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1); + + mutex_destroy(&hm->hlm_contents); + hyprlofs_memfree(hm, sizeof (hlfsmount_t)); + + return (0); +} + +/* Return root hlnode for given vnode */ +static int +hyprlofs_root(vfs_t *vfsp, vnode_t **vpp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = hm->hlm_rootnode; + vnode_t *vp; + + ASSERT(hp); + + vp = HLNTOV(hp); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + /* + * The FS may have been mounted by the GZ on behalf of the NGZ. In + * that case, the hlfsmount zone_id will be the global zone. We want + * to show the swap cap inside the zone in this case, even though the + * FS was mounted by the GZ. + */ + if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID) + zp = curproc->p_zone; + else + zp = hm->hlm_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > hyprlofs_minfree) + sbp->f_bfree = blocks - hyprlofs_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is what's available plus what's been used + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a NGZ with a swap cap, then report the + * capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * This is fairly inaccurate since it doesn't take into account the + * names stored in the directory entries. 
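+	 * It simply divides the remaining hyprlofs_maxkmem budget by the
+	 * fixed cost of one hlnode plus one hldirent, ignoring the
+	 * variable-length names co-allocated with each hldirent.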
+ */ + if (hyprlofs_maxkmem > hyprlofs_kmemspace) + sbp->f_ffree = (hyprlofs_maxkmem - hyprlofs_kmemspace) / + (sizeof (hlnode_t) + sizeof (hldirent_t)); + else + sbp->f_ffree = 0; + + sbp->f_files = hyprlofs_maxkmem / + (sizeof (hlnode_t) + sizeof (hldirent_t)); + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name); + (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr)); + /* + * ensure null termination + */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static int +hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp) +{ + hlfid_t *hfid; + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = NULL; + + hfid = (hlfid_t *)fidp; + *vpp = NULL; + + mutex_enter(&hm->hlm_contents); + for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) { + mutex_enter(&hp->hln_tlock); + if (hp->hln_nodeid == hfid->hlfid_ino) { + /* + * If the gen numbers don't match we know the file + * won't be found since only one hlnode can have this + * number at a time. + */ + if (hp->hln_gen != hfid->hlfid_gen || + hp->hln_nlink == 0) { + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + *vpp = (vnode_t *)HLNTOV(hp); + + VN_HOLD(*vpp); + + if ((hp->hln_mode & S_ISVTX) && + !(hp->hln_mode & (S_IXUSR | S_IFDIR))) { + mutex_enter(&(*vpp)->v_lock); + (*vpp)->v_flag |= VISSWAP; + mutex_exit(&(*vpp)->v_lock); + } + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + mutex_exit(&hp->hln_tlock); + } + mutex_exit(&hm->hlm_contents); + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c new file mode 100644 index 0000000000..b382210334 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c @@ -0,0 +1,1412 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2012 Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/flock.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/cred.h> +#include <sys/dirent.h> +#include <sys/pathname.h> +#include <sys/fs/hyprlofs.h> +#include <sys/fs/hyprlofs_info.h> +#include <sys/mman.h> +#include <vm/pvn.h> +#include <sys/cmn_err.h> +#include <sys/buf.h> +#include <sys/policy.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +static int hyprlofs_add_entry(vnode_t *, char *, char *, cred_t *, + caller_context_t *); +static int hyprlofs_rm_entry(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_rm_all(vnode_t *, cred_t *, caller_context_t *, int); +static int hyprlofs_remove(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_get_all(vnode_t *, intptr_t, cred_t *, caller_context_t *, + int); + +/* + * This is a somewhat arbitrary upper limit on the number of entries we can + * pass in on a single add/rm ioctl call. This is only used to validate that + * the input list looks sane. + */ +#define MAX_IOCTL_PARAMS 100000 + +static int +hyprlofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *rvp; + int error; + + rvp = REALVP(*vpp); + + if (VTOHLN(*vpp)->hln_looped == 0) + return (0); + + /* + * looped back, pass through to real vnode. Need to hold new reference + * to vp since VOP_OPEN() may decide to release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + ASSERT(rvp->v_count > 1); + VN_RELE(rvp); + + return (error); +} + +static int +hyprlofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) { + cleanlocks(vp, ttoproc(curthread)->p_pid, 0); + cleanshares(vp, ttoproc(curthread)->p_pid); + return (0); + } + + return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct)); +} + +static int +hyprlofs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct)); +} + +static int +hyprlofs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + /* We don't support writing to non-regular files */ + if (vp->v_type != VREG) + return (EINVAL); + + if (vn_is_readonly(vp)) + return (EROFS); + + return (VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct)); +} + +/* ARGSUSED */ +static int +hyprlofs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, + cred_t *cr, int *rvalp, caller_context_t *ct) +{ + int len, cnt, error; + int i; + model_t model; + char path[MAXPATHLEN]; + char nm[MAXPATHLEN]; + + /* We only support the hyprlofs ioctls on the root vnode */ + if (!(vp->v_flag & VROOT)) + return (ENOTTY); + + /* + * Check if managing hyprlofs is allowed. 
+ */ + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (cmd == HYPRLOFS_ADD_ENTRIES || cmd == HYPRLOFS_RM_ENTRIES) { + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_entries_t ebuf; + hyprlofs_entry_t *e; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + cnt = ebuf.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry_t) * cnt; + + e = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(ebuf.hle_entries), e, len)) { + kmem_free(e, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e[i].hle_nlen == 0 || + e[i].hle_nlen > MAXPATHLEN) + return (EINVAL); + + if (copyin(e[i].hle_name, nm, e[i].hle_nlen) + != 0) { + kmem_free(e, len); + return (EFAULT); + } + nm[e[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e[i].hle_plen == 0 || + e[i].hle_plen > MAXPATHLEN) + return (EINVAL); + + if (copyin(e[i].hle_path, path, + e[i].hle_plen) != 0) { + kmem_free(e, len); + return (EFAULT); + } + path[e[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e, len); + return (error); + } + } + } + + kmem_free(e, len); + return (0); + + } else { + hyprlofs_entries32_t ebuf32; + hyprlofs_entry32_t *e32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + cnt = ebuf32.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry32_t) * cnt; + + e32 = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(unsigned long)(ebuf32.hle_entries), + e32, len)) { + kmem_free(e32, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e32[i].hle_nlen == 0 || + e32[i].hle_nlen > MAXPATHLEN) + return (EINVAL); + + if (copyin((void *)(unsigned long) + e32[i].hle_name, nm, + e32[i].hle_nlen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + nm[e32[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e32[i].hle_plen == 0 || + e32[i].hle_plen > MAXPATHLEN) + return (EINVAL); + + if (copyin((void *)(unsigned long) + e32[i].hle_path, path, + e32[i].hle_plen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + path[e32[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e32, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e32, len); + return (error); + } + } + } + + kmem_free(e32, len); + return (0); + } + } + + if (cmd == HYPRLOFS_RM_ALL) { + return (hyprlofs_rm_all(vp, cr, ct, flag)); + } + + if (cmd == HYPRLOFS_GET_ENTRIES) { + return (hyprlofs_get_all(vp, data, cr, ct, flag)); + } + + return (ENOTTY); +} + +/*ARGSUSED2*/ +static int +hyprlofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + + mutex_enter(&tp->hln_tlock); + vap->va_type = vp->v_type; + vap->va_mode = tp->hln_mode & MODEMASK; + vap->va_uid = tp->hln_uid; + vap->va_gid = tp->hln_gid; + vap->va_fsid = tp->hln_fsid; + vap->va_nodeid = (ino64_t)tp->hln_nodeid; + vap->va_nlink = tp->hln_nlink; + vap->va_size = (u_offset_t)tp->hln_size; + vap->va_atime = tp->hln_atime; + vap->va_mtime = tp->hln_mtime; + vap->va_ctime = tp->hln_ctime; + vap->va_blksize = PAGESIZE; + vap->va_rdev = tp->hln_rdev; + vap->va_seq = tp->hln_seq; + + vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size))); + 
mutex_exit(&tp->hln_tlock); + return (0); +} + +/*ARGSUSED4*/ +static int +hyprlofs_setattr(vnode_t *vp, vattr_t *vap, int flags, + cred_t *cr, caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error = 0; + vattr_t *get; + long mask; + + /* + * Cannot set these attributes + */ + if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR)) + return (EINVAL); + + mutex_enter(&tp->hln_tlock); + + get = &tp->hln_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. + */ + error = secpolicy_vnode_setattr(cr, vp, vap, get, flags, + hyprlofs_taccess, tp); + + if (error) + goto out; + + mask = vap->va_mask; + + if (mask & AT_MODE) { + get->va_mode &= S_IFMT; + get->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + get->va_uid = vap->va_uid; + if (mask & AT_GID) + get->va_gid = vap->va_gid; + if (mask & AT_ATIME) + get->va_atime = vap->va_atime; + if (mask & AT_MTIME) + get->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&tp->hln_ctime); + +out: + mutex_exit(&tp->hln_tlock); + return (error); +} + +static int +hyprlofs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error; + + if (mode & VWRITE) { + if (vp->v_type == VREG && vn_is_readonly(vp)) + return (EROFS); + } + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct)); + + mutex_enter(&tp->hln_tlock); + error = hyprlofs_taccess(tp, mode, cr); + mutex_exit(&tp->hln_tlock); + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(dvp); + hlnode_t *ntp = NULL; + int error; + + if (VTOHLN(dvp)->hln_looped == 1) + return (VOP_LOOKUP(REALVP(dvp), nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp)); + + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* Null component name is a synonym for directory being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + ASSERT(tp); + + if ((error = hyprlofs_dirlookup(tp, nm, &ntp, cr)) == 0) { + ASSERT(ntp); + *vpp = HLNTOV(ntp); + } + return (error); +} + +/* + * Create the loopback from the hyprlofs vnode to the real vnode. + */ +static int +hyprlofs_loopback(vnode_t *dvp, vnode_t *rvp, char *nm, vattr_t *vap, + int mode, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *parent; + hlfsmount_t *tm; + int error; + hlnode_t *oldtp; + vnode_t *vp; + + parent = (hlnode_t *)VTOHLN(dvp); + tm = (hlfsmount_t *)VTOHLM(dvp); + error = 0; + oldtp = NULL; + + if (vap->va_type == VREG && (vap->va_mode & VSVTX)) { + /* we don't support the sticky bit */ + vap->va_mode &= ~VSVTX; + } else if (vap->va_type == VNON) { + return (EINVAL); + } + + /* Null component name is a synonym for directory being searched. 
*/ + if (*nm == '\0') { + VN_HOLD(dvp); + oldtp = parent; + } else { + error = hyprlofs_dirlookup(parent, nm, &oldtp, cr); + } + + if (error == 0) { /* name found */ + ASSERT(oldtp); + + rw_enter(&oldtp->hln_rwlock, RW_WRITER); + + /* + * if create/read-only an existing directory, allow it + */ + if ((oldtp->hln_type == VDIR) && (mode & VWRITE)) + error = EISDIR; + else { + error = hyprlofs_taccess(oldtp, mode, cr); + } + + if (error) { + rw_exit(&oldtp->hln_rwlock); + hlnode_rele(oldtp); + return (error); + } + + vp = HLNTOV(oldtp); + rw_exit(&oldtp->hln_rwlock); + + if (vp->v_type == VREG) { + hlnode_rele(oldtp); + return (EEXIST); + } + + vnevent_create(vp, ct); + return (0); + } + + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_CREATE, rvp, vap, NULL, + cr); + rw_exit(&parent->hln_rwlock); + + return (error); +} + +/* + * Create an in-memory directory based on the add-entry ioctl name. + * If the dir exists, return EEXIST but still also return node in vpp. + */ +static int +hyprlofs_mkdir(vnode_t *dvp, char *nm, vattr_t *va, vnode_t **vpp, cred_t *cr) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + hlfsmount_t *tm = (hlfsmount_t *)VTOHLM(dvp); + int error; + + /* + * Might be dangling directory. Catch it here, because a ENOENT return + * from hyprlofs_dirlookup() is a valid return. + */ + if (parent->hln_nlink == 0) + return (ENOENT); + + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error == 0) { + ASSERT(self); + hlnode_rele(self); + /* We can't loop in under a looped in directory */ + if (self->hln_looped) + return (EACCES); + *vpp = HLNTOV(self); + return (EEXIST); + } + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_MKDIR, (vnode_t *)NULL, + va, &self, cr); + rw_exit(&parent->hln_rwlock); + + if (error == 0 || error == EEXIST) { + hlnode_rele(self); + *vpp = HLNTOV(self); + } + + return (error); +} + +/* + * Loop in a file or directory into the namespace. + */ +static int +hyprlofs_add_entry(vnode_t *vp, char *fspath, char *fsname, + cred_t *cr, caller_context_t *ct) +{ + int error; + char *p, *pnm; + vnode_t *realvp, *dvp; + vattr_t va; + + /* + * Get vnode for the real file/dir. We'll have a hold on realvp which + * we won't vn_rele until hyprlofs_inactive. + */ + if ((error = lookupname(fspath, UIO_SYSSPACE, FOLLOW, NULLVPP, + &realvp)) != 0) + return (error); + + /* no devices allowed */ + if (IS_DEVVP(realvp)) { + VN_RELE(realvp); + return (ENODEV); + } + + /* + * realvp may be an AUTOFS node, in which case we perform a VOP_ACCESS + * to trigger the mount of the intended filesystem. This causes a + * loopback mount of the intended filesystem instead of the AUTOFS + * filesystem. + */ + if ((error = VOP_ACCESS(realvp, 0, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* + * We're interested in the top most filesystem. This is specially + * important when fspath is a trigger AUTOFS node, since we're really + * interested in mounting the filesystem AUTOFS mounted as result of + * the VOP_ACCESS() call not the AUTOFS node itself. + */ + if (vn_mountedvfs(realvp) != NULL) { + if ((error = traverse(&realvp)) != 0) { + VN_RELE(realvp); + return (error); + } + } + + va.va_type = VNON; + /* + * If the target name is a path, make sure we have all of the + * intermediate directories, creating them if necessary. 
+ */ + dvp = vp; + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') { + VN_RELE(realvp); + return (EINVAL); + } + + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + if (va.va_type == VNON) + /* use the top-level dir as the template va for mkdir */ + if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || (pnm[0] == '.' && pnm[1] == '.')) { + VN_RELE(realvp); + return (EINVAL); + } + + if ((error = hyprlofs_mkdir(dvp, pnm, &va, &dvp, cr)) != 0 && + error != EEXIST) { + VN_RELE(realvp); + return (error); + } + + *p = '/'; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') { + VN_RELE(realvp); + return (EINVAL); + } + + /* Now use the real file's va as the template va */ + if ((error = VOP_GETATTR(realvp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* Make the vnode */ + error = hyprlofs_loopback(dvp, realvp, pnm, &va, va.va_mode, cr, ct); + if (error != 0) + VN_RELE(realvp); + return (error); +} + +/* + * Remove a looped in file from the namespace. + */ +static int +hyprlofs_rm_entry(vnode_t *dvp, char *fsname, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error; + char *p, *pnm; + hlnode_t *parent; + hlnode_t *fndtp; + + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') + return (EINVAL); + + /* + * If the target name is a path, get the containing dir and simple + * file name. + */ + parent = (hlnode_t *)VTOHLN(dvp); + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || (pnm[0] == '.' && pnm[1] == '.')) + return (EINVAL); + + if ((error = hyprlofs_dirlookup(parent, pnm, &fndtp, cr)) != 0) + return (error); + + dvp = HLNTOV(fndtp); + parent = fndtp; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') + return (EINVAL); + + /* Remove the entry from the parent dir */ + return (hyprlofs_remove(dvp, pnm, cr, ct, flags)); +} + +/* + * Remove all looped in files from the namespace. + */ +static int +hyprlofs_rm_all(vnode_t *dvp, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error = 0; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively remove contents of this subdir */ + if (fndhp->hln_type == VDIR) { + vnode_t *tvp = HLNTOV(fndhp); + + error = hyprlofs_rm_all(tvp, cr, ct, flags); + if (error != 0) + goto done; + } + } + + /* remove the entry */ + error = hyprlofs_remove(dvp, hdp->hld_name, cr, ct, flags); + if (error != 0) + goto done; + + hdp = hp->hln_dir; + } + +done: + hlnode_rele(hp); + return (error); +} + +/* + * Get a list of all looped in files in the namespace. 
+ */ +static int +hyprlofs_get_all_entries(vnode_t *dvp, hyprlofs_curr_entry_t *hcp, + char *prefix, int *pcnt, int n_max, + cred_t *cr, caller_context_t *ct, int flags) +{ + int error = 0; + int too_big = 0; + int cnt; + int len; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + char *path; + + cnt = *pcnt; + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + vnode_t *tvp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively get contents of this subdir */ + VERIFY(fndhp->hln_type == VDIR); + tvp = HLNTOV(fndhp); + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, "%s/%s", + prefix, hdp->hld_name); + + error = hyprlofs_get_all_entries(tvp, hcp, path, + &cnt, n_max, cr, ct, flags); + + if (error == E2BIG) { + too_big = 1; + error = 0; + } + if (error != 0) + goto done; + } else { + if (cnt < n_max) { + char *p; + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, + MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, + "%s/%s", prefix, hdp->hld_name); + + len = strlen(path); + ASSERT(len <= MAXPATHLEN); + if (copyout(path, (void *)(hcp[cnt].hce_name), + len)) { + error = EFAULT; + goto done; + } + + tvp = REALVP(HLNTOV(fndhp)); + if (tvp->v_path == NULL) { + p = "<unknown>"; + } else { + p = tvp->v_path; + } + len = strlen(p); + ASSERT(len <= MAXPATHLEN); + if (copyout(p, (void *)(hcp[cnt].hce_path), + len)) { + error = EFAULT; + goto done; + } + } + + cnt++; + if (cnt > n_max) + too_big = 1; + } + + hdp = hdp->hld_next; + } + +done: + hlnode_rele(hp); + kmem_free(path, MAXPATHLEN); + + *pcnt = cnt; + if (error == 0 && too_big == 1) + error = E2BIG; + + return (error); +} + +/* + * Return a list of all looped in files in the namespace. 
+ */ +static int +hyprlofs_get_all(vnode_t *dvp, intptr_t data, cred_t *cr, caller_context_t *ct, + int flags) +{ + int limit, cnt, error; + model_t model; + hyprlofs_curr_entry_t *e; + + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + limit = ebuf.hce_cnt; + e = ebuf.hce_entries; + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + limit = ebuf32.hce_cnt; + e = (hyprlofs_curr_entry_t *)(unsigned long) + (ebuf32.hce_entries); + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + } + + cnt = 0; + error = hyprlofs_get_all_entries(dvp, e, "", &cnt, limit, cr, ct, + flags); + + if (error == 0 || error == E2BIG) { + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + ebuf.hce_cnt = cnt; + if (copyout(&ebuf, (void *)data, sizeof (ebuf))) + return (EFAULT); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + ebuf32.hce_cnt = cnt; + if (copyout(&ebuf32, (void *)data, sizeof (ebuf32))) + return (EFAULT); + } + } + + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, + int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + int error; + hlnode_t *hp = NULL; + + /* This holds the hp vnode */ + error = hyprlofs_dirlookup(parent, nm, &hp, cr); + if (error) + return (error); + + ASSERT(hp); + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&hp->hln_rwlock, RW_WRITER); + + error = hyprlofs_dirdelete(parent, hp, nm, DR_REMOVE, cr); + + rw_exit(&hp->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_remove(HLNTOV(hp), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. + */ + hlnode_rele(hp); + + return (error); +} + +/* ARGSUSED4 */ +static int +hyprlofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ct, int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + vnode_t *vp; + int error = 0; + + /* Return error if removing . or .. */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); /* Should be ENOTEMPTY */ + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&self->hln_rwlock, RW_WRITER); + + vp = HLNTOV(self); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto done1; + } + if (self->hln_type != VDIR) { + error = ENOTDIR; + goto done1; + } + + /* + * When a dir is looped in, we only remove the in-memory dir, not the + * backing dir. + */ + if (self->hln_looped == 0) { + mutex_enter(&self->hln_tlock); + if (self->hln_nlink > 2) { + mutex_exit(&self->hln_tlock); + error = EEXIST; + goto done1; + } + mutex_exit(&self->hln_tlock); + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto done1; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + goto done; + } + + /* + * Check for an empty directory, i.e. only includes entries for + * "." and ".." 
+ */ + if (self->hln_dirents > 2) { + error = EEXIST; /* SIGH should be ENOTEMPTY */ + /* + * Update atime because checking hln_dirents is + * equivalent to reading the directory + */ + gethrestime(&self->hln_atime); + goto done; + } + + error = hyprlofs_dirdelete(parent, self, nm, DR_RMDIR, cr); + } else { + error = hyprlofs_dirdelete(parent, self, nm, DR_REMOVE, cr); + } + +done: + if (self->hln_looped == 0) + vn_vfsunlock(vp); +done1: + rw_exit(&self->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_rmdir(HLNTOV(self), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. + */ + hlnode_rele(self); + + return (error); +} + +static int +hyprlofs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hldirent_t *hdp; + int error = 0; + size_t namelen; + struct dirent64 *dp; + ulong_t offset; + ulong_t total_bytes_wanted; + long outcount = 0; + long bufsize; + int reclen; + caddr_t outbuf; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_READDIR(REALVP(vp), uiop, cr, eofp, ct, flags)); + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp) + *eofp = 1; + return (0); + } + /* assuming syscall has already called hln_rwlock */ + ASSERT(RW_READ_HELD(&hp->hln_rwlock)); + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + return (0); + } + + /* Get space for multiple dir entries */ + total_bytes_wanted = uiop->uio_iov->iov_len; + bufsize = total_bytes_wanted + sizeof (struct dirent64); + outbuf = kmem_alloc(bufsize, KM_SLEEP); + + dp = (struct dirent64 *)((uintptr_t)outbuf); + + offset = 0; + hdp = hp->hln_dir; + while (hdp) { + namelen = strlen(hdp->hld_name); /* no +1 needed */ + offset = hdp->hld_offset; + if (offset >= uiop->uio_offset) { + reclen = (int)DIRENT64_RECLEN(namelen); + if (outcount + reclen > total_bytes_wanted) { + if (!outcount) + /* Buffer too small for any entries. */ + error = EINVAL; + break; + } + ASSERT(hdp->hld_hlnode != NULL); + + /* zero out uninitialized bytes */ + (void) strncpy(dp->d_name, hdp->hld_name, + DIRENT64_NAMELEN(reclen)); + dp->d_reclen = (ushort_t)reclen; + dp->d_ino = (ino64_t)hdp->hld_hlnode->hln_nodeid; + dp->d_off = (offset_t)hdp->hld_offset + 1; + dp = (struct dirent64 *) + ((uintptr_t)dp + dp->d_reclen); + outcount += reclen; + ASSERT(outcount <= bufsize); + } + hdp = hdp->hld_next; + } + + if (!error) + error = uiomove(outbuf, outcount, UIO_READ, uiop); + + if (!error) { + /* + * If we reached the end of the list our offset should now be + * just past the end. 
+ */ + if (!hdp) { + offset += 1; + if (eofp) + *eofp = 1; + } else if (eofp) + *eofp = 0; + uiop->uio_offset = offset; + } + gethrestime(&hp->hln_atime); + kmem_free(outbuf, bufsize); + return (error); +} + +static int +hyprlofs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct)); + return (0); +} + +/* ARGSUSED */ +static void +hyprlofs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vp->v_vfsp); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + + mutex_enter(&hp->hln_tlock); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's nothing to do except drop our hold. + */ + if (vp->v_count > 1 || hp->hln_nlink != 0) { + vp->v_count--; + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + rw_exit(&hp->hln_rwlock); + return; + } + + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + + /* release hold on the real vnode now */ + if (hp->hln_looped == 1 && hp->hln_realvp != NULL) + VN_RELE(hp->hln_realvp); + + /* Here's our chance to send invalid event while we're between locks */ + vn_invalid(HLNTOV(hp)); + + mutex_enter(&hm->hlm_contents); + if (hp->hln_forw == NULL) + hm->hlm_rootnode->hln_back = hp->hln_back; + else + hp->hln_forw->hln_back = hp->hln_back; + hp->hln_back->hln_forw = hp->hln_forw; + mutex_exit(&hm->hlm_contents); + rw_exit(&hp->hln_rwlock); + rw_destroy(&hp->hln_rwlock); + mutex_destroy(&hp->hln_tlock); + vn_free(HLNTOV(hp)); + hyprlofs_memfree(hp, sizeof (hlnode_t)); +} + +static int +hyprlofs_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfid_t *hfid; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FID(REALVP(vp), fidp, ct)); + + if (fidp->fid_len < (sizeof (hlfid_t) - sizeof (ushort_t))) { + fidp->fid_len = sizeof (hlfid_t) - sizeof (ushort_t); + return (ENOSPC); + } + + hfid = (hlfid_t *)fidp; + bzero(hfid, sizeof (hlfid_t)); + hfid->hlfid_len = (int)sizeof (hlfid_t) - sizeof (ushort_t); + + hfid->hlfid_ino = hp->hln_nodeid; + hfid->hlfid_gen = hp->hln_gen; + + return (0); +} + +static int +hyprlofs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr, caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_GETPAGE(REALVP(vp), off, len, protp, pl, plsz, seg, addr, + rw, cr, ct)); +} + +int +hyprlofs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, + cred_t *cr, caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct)); +} + +static int +hyprlofs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot, flags, + cr, ct)); +} + +static int +hyprlofs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, 
uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, + offset_t offset, cred_t *cr, caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct)); +} + +static int +hyprlofs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); + + return (VOP_SEEK(REALVP(vp), ooff, noffp, ct)); +} + +static int +hyprlofs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) + return (VOP_RWLOCK(REALVP(vp), write_lock, ct)); + + if (write_lock) { + rw_enter(&hp->hln_rwlock, RW_WRITER); + } else { + rw_enter(&hp->hln_rwlock, RW_READER); + } + return (write_lock); +} + +static void +hyprlofs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) { + VOP_RWUNLOCK(REALVP(vp), write_lock, ct); + return; + } + + rw_exit(&hp->hln_rwlock); +} + +static int +hyprlofs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int error; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct)); + + switch (cmd) { + case _PC_XATTR_ENABLED: + case _PC_XATTR_EXISTS: + case _PC_SATTR_ENABLED: + case _PC_SATTR_EXISTS: + error = EINVAL; + break; + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + error = 0; + break; + default: + error = fs_pathconf(vp, cmd, valp, cr, ct); + } + return (error); +} + + +struct vnodeops *hyprlofs_vnodeops; + +const fs_operation_def_t hyprlofs_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = hyprlofs_open }, + VOPNAME_CLOSE, { .vop_close = hyprlofs_close }, + VOPNAME_READ, { .vop_read = hyprlofs_read }, + VOPNAME_WRITE, { .vop_write = hyprlofs_write }, + VOPNAME_IOCTL, { .vop_ioctl = hyprlofs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = hyprlofs_getattr }, + VOPNAME_SETATTR, { .vop_setattr = hyprlofs_setattr }, + VOPNAME_ACCESS, { .vop_access = hyprlofs_access }, + VOPNAME_LOOKUP, { .vop_lookup = hyprlofs_lookup }, + VOPNAME_CREATE, { .error = fs_error }, + VOPNAME_REMOVE, { .vop_remove = hyprlofs_remove }, + VOPNAME_LINK, { .error = fs_error }, + VOPNAME_RENAME, { .error = fs_error }, + VOPNAME_MKDIR, { .error = fs_error }, + VOPNAME_RMDIR, { .vop_rmdir = hyprlofs_rmdir }, + VOPNAME_READDIR, { .vop_readdir = hyprlofs_readdir }, + VOPNAME_SYMLINK, { .error = fs_error }, + VOPNAME_READLINK, { .error = fs_error }, + VOPNAME_FSYNC, { .vop_fsync = hyprlofs_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = hyprlofs_inactive }, + VOPNAME_FID, { .vop_fid = hyprlofs_fid }, + VOPNAME_RWLOCK, { .vop_rwlock = hyprlofs_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = hyprlofs_rwunlock }, + VOPNAME_SEEK, { .vop_seek = hyprlofs_seek }, + VOPNAME_SPACE, { .vop_space = hyprlofs_space }, + VOPNAME_GETPAGE, { .vop_getpage = hyprlofs_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = hyprlofs_putpage }, + VOPNAME_MAP, { .vop_map = hyprlofs_map }, + VOPNAME_ADDMAP, { .vop_addmap = hyprlofs_addmap }, + VOPNAME_DELMAP, { .vop_delmap = hyprlofs_delmap }, + VOPNAME_PATHCONF, { .vop_pathconf = hyprlofs_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + 
NULL, NULL +}; diff --git a/usr/src/uts/common/fs/lxproc/lxpr_subr.c b/usr/src/uts/common/fs/lxproc/lxpr_subr.c new file mode 100644 index 0000000000..2cd4813e43 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_subr.c @@ -0,0 +1,516 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/varargs.h> +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lxproc.h" + +#define LXPRCACHE_NAME "lxpr_cache" + +static int lxpr_node_constructor(void *, void *, int); +static void lxpr_node_destructor(void *, void *); + +static kmem_cache_t *lxpr_node_cache; + +struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t buffsize; + char *pos; + size_t beg; + int error; +}; + +int lxpr_bufsize = 4000; + +struct lxpr_uiobuf * +lxpr_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxpr_uiobuf and output buffer */ + int bufsize = lxpr_bufsize; + struct lxpr_uiobuf *uiobuf = + kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->buffsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize); +} + +void +lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset) +{ + uiobuf->uiop->uio_offset = (off_t)offset; +} + +void +lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->buffsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? 
*/ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxpr_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +void +lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...) +{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxpr_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxpr_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} + +/* + * lxpr_lock(): + * + * Lookup process from pid and return with p_plock and P_PR_LOCK held. + */ +proc_t * +lxpr_lock(pid_t pid) +{ + proc_t *p; + kmutex_t *mp; + + ASSERT(!MUTEX_HELD(&pidlock)); + + for (;;) { + mutex_enter(&pidlock); + + /* + * If the pid is 1, we really want the zone's init process + */ + p = prfind((pid == 1) ? + curproc->p_zone->zone_proc_initpid : pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (NULL); + } + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. + */ + mp = &p->p_lock; + mutex_enter(mp); + + mutex_exit(&pidlock); + + if (!(p->p_proc_flag & P_PR_LOCK)) + break; + + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); + } + + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + return (p); +} + +/* + * lxpr_unlock() + * + * Unlock locked process + */ +void +lxpr_unlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(!MUTEX_HELD(&pidlock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + mutex_exit(&p->p_lock); + THREAD_KPRI_RELEASE(); +} + +void +lxpr_initnodecache() +{ + lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxpr_fininodecache() +{ + kmem_cache_destroy(lxpr_node_cache); +} + +/* ARGSUSED */ +static int +lxpr_node_constructor(void *buf, void *un, int kmflags) +{ + lxpr_node_t *lxpnp = buf; + vnode_t *vp; + + vp = lxpnp->lxpr_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxpr_vnodeops); + vp->v_data = lxpnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxpr_node_destructor(void *buf, void *un) +{ + lxpr_node_t *lxpnp = buf; + + vn_free(LXPTOV(lxpnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxproc node + */ +ino_t +lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd) +{ + if (pid == 1) + pid = curproc->p_zone->zone_proc_initpid; + + switch (type) { + case LXPR_PIDDIR: + return (pid + 1); + case LXPR_PROCDIR: + return (maxpid + 2); + case LXPR_PID_FD_FD: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + LXPR_NFILES + fd); + default: + return (maxpid + 2 + + 
(pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + type); + } +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxpr_parentinode(lxpr_node_t *lxpnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted on inode so just return our inode number + */ + if (lxpnp->lxpr_type != LXPR_PROCDIR) + return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino); + else + return (lxpnp->lxpr_ino); +} + +/* + * Allocate a new lxproc node + * + * This also allocates the vnode associated with it + */ +lxpr_node_t * +lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd) +{ + lxpr_node_t *lxpnp; + vnode_t *vp; + user_t *up; + timestruc_t now; + + /* + * Allocate a new node. It is deallocated in vop_innactive + */ + lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxpnp->lxpr_type = type; + lxpnp->lxpr_realvp = NULL; + lxpnp->lxpr_parent = dp; + VN_HOLD(dp); + if (p != NULL) { + lxpnp->lxpr_pid = ((p->p_pid == + curproc->p_zone->zone_proc_initpid) ? 1 : p->p_pid); + + lxpnp->lxpr_time = PTOU(p)->u_start; + lxpnp->lxpr_uid = crgetruid(p->p_cred); + lxpnp->lxpr_gid = crgetrgid(p->p_cred); + lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd); + } else { + /* Pretend files without a proc belong to sched */ + lxpnp->lxpr_pid = 0; + lxpnp->lxpr_time = now; + lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0; + lxpnp->lxpr_ino = lxpr_inode(type, 0, 0); + } + + /* initialize the vnode data */ + vp = lxpnp->lxpr_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Do node specific stuff + */ + switch (type) { + case LXPR_PROCDIR: + vp->v_flag |= VROOT; + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_CURDIR: + ASSERT(p != NULL); + + /* + * Zombie check. p_stat is officially protected by pidlock, + * but we can't grab pidlock here because we already hold + * p_lock. Luckily if we look at the process exit code + * we see that p_stat only transisions from SRUN to SZOMB + * while p_lock is held. Aside from this, the only other + * p_stat transition that we need to be aware about is + * SIDL to SRUN, but that's not a problem since lxpr_lock() + * ignores nodes in the SIDL state so we'll never get a node + * that isn't already in the SRUN state. + */ + if (p->p_stat == SZOMB) { + lxpnp->lxpr_realvp = NULL; + } else { + up = PTOU(p); + lxpnp->lxpr_realvp = up->u_cdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_ROOTDIR: + ASSERT(p != NULL); + /* Zombie check. see locking comment above */ + if (p->p_stat == SZOMB) { + lxpnp->lxpr_realvp = NULL; + } else { + up = PTOU(p); + lxpnp->lxpr_realvp = + up->u_rdir != NULL ? up->u_rdir : rootdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_EXE: + ASSERT(p != NULL); + lxpnp->lxpr_realvp = p->p_exec; + if (lxpnp->lxpr_realvp != NULL) { + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; + break; + + case LXPR_SELF: + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! 
*/ + break; + + case LXPR_PID_FD_FD: + ASSERT(p != NULL); + /* lxpr_realvp is set after we return */ + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */ + break; + + case LXPR_PID_FDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0500; /* read-search by owner only */ + break; + + case LXPR_PIDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0511; + break; + + case LXPR_NETDIR: + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by all */ + break; + + case LXPR_PID_ENV: + case LXPR_PID_MEM: + ASSERT(p != NULL); + /*FALLTHRU*/ + case LXPR_KCORE: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0400; /* read-only by owner only */ + break; + + default: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0444; /* read-only by all */ + break; + } + + return (lxpnp); +} + + +/* + * Free the storage obtained from lxpr_getnode(). + */ +void +lxpr_freenode(lxpr_node_t *lxpnp) +{ + ASSERT(lxpnp != NULL); + ASSERT(LXPTOV(lxpnp) != NULL); + + /* + * delete any association with realvp + */ + if (lxpnp->lxpr_realvp != NULL) + VN_RELE(lxpnp->lxpr_realvp); + + /* + * delete any association with parent vp + */ + if (lxpnp->lxpr_parent != NULL) + VN_RELE(lxpnp->lxpr_parent); + + /* + * Release the lxprnode. + */ + kmem_cache_free(lxpr_node_cache, lxpnp); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c new file mode 100644 index 0000000000..1bb7bd3823 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c @@ -0,0 +1,367 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> + +#include "lxproc.h" + +/* Module level parameters */ +static int lxprocfstype; +static dev_t lxprocdev; +static kmutex_t lxpr_mount_lock; + +int nproc_highbit; /* highbit(v.v_nproc) */ + +static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxpr_unmount(vfs_t *, int, cred_t *); +static int lxpr_root(vfs_t *, vnode_t **); +static int lxpr_statvfs(vfs_t *, statvfs64_t *); +static int lxpr_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lxproc", + lxpr_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. + */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "generic linux procfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxpr_node cache + */ + lxpr_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxprocfstype); + vn_freevnodeops(lxpr_vnodeops); + + mutex_destroy(&lxpr_mount_lock); +done: + return (retval); +} + +static int +lxpr_init(int fstype, char *name) +{ + static const fs_operation_def_t lxpr_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxpr_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount }, + VFSNAME_ROOT, { .vfs_root = lxpr_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxpr_vnodeops_template[]; + int error; + major_t dev; + + nproc_highbit = highbit(v.v_proc); + lxprocfstype = fstype; + ASSERT(lxprocfstype != 0); + + mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxpr_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxpr_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialize cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc", 0); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + kmem_free(lxpr_mnt, sizeof ((*lxpr_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * allocate the first vnode + */ + zone_hold(lxpr_mnt->lxprm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = (caddr_t)lxpr_mnt; + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point. 
+ */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); + + /* + * free up the lxprnode + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + + (void) strcpy(sp->f_fstr, "lxproc"); + + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c new file mode 100644 index 0000000000..c1d6a85d99 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c @@ -0,0 +1,3079 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* + * lxproc -- a loosely Linux-compatible /proc + * + * The aspiration here is to provide something that sufficiently approximates + * the Linux /proc implementation for purposes of offering some compatibility + * for simple Linux /proc readers (e.g., ps/top/htop). However, it is not + * intended to exactly mimic Linux semantics; when choosing between offering + * compatibility and telling the truth, we emphatically pick the truth. A + * particular glaring example of this is the Linux notion of "tasks" (that is, + * threads), which -- due to historical misadventures on Linux -- allocate their + * identifiers from the process identifier space. (That is, each thread has in + * effect a pid.) 
Some Linux /proc readers have come to depend on this + * attribute, and become confused when threads appear with proper identifiers, + * so we simply opt for the pre-2.6 behavior, and do not present the tasks + * directory at all. Similarly, when choosing between offering compatibility + * and remaining consistent with our broader security model, we (obviously) + * choose security over compatibility. In short, this is meant to be a best + * effort -- no more. + */ + +#include <sys/cpupart.h> +#include <sys/cpuvar.h> +#include <sys/session.h> +#include <sys/vmparam.h> +#include <sys/mman.h> +#include <vm/rm.h> +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> + +/* Dependent on procfs */ +extern kthread_t *prchoose(proc_t *); + +#include "lxproc.h" + +extern pgcnt_t swapfs_minfree; +extern time_t boot_time; + +/* + * Pointer to the vnode ops vector for this fs. + * This is instantiated in lxprinit() in lxpr_vfsops.c + */ +vnodeops_t *lxpr_vnodeops; + +static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxpr_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *); +static int lxpr_sync(void); +static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxpr_lookup_procdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_piddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_netdir(vnode_t *, char *); + +static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *); + +static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *); +static void 
lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *); + +/* + * Simple conversion + */ +#define btok(x) ((x) >> 10) /* bytes to kbytes */ +#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */ + +/* + * The lxproc vnode operations vector + */ +const fs_operation_def_t lxpr_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxpr_open }, + VOPNAME_CLOSE, { .vop_close = lxpr_close }, + VOPNAME_READ, { .vop_read = lxpr_read }, + VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr }, + VOPNAME_ACCESS, { .vop_access = lxpr_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup }, + VOPNAME_READDIR, { .vop_readdir = lxpr_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxpr_readlink }, + VOPNAME_FSYNC, { .error = lxpr_sync }, + VOPNAME_SEEK, { .error = lxpr_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive }, + VOPNAME_CMP, { .vop_cmp = lxpr_cmp }, + VOPNAME_REALVP, { .vop_realvp = lxpr_realvp }, + NULL, NULL +}; + +/* + * file contents of an lxproc directory. + */ +static lxpr_dirent_t lxpr_dir[] = { + { LXPR_CMDLINE, "cmdline" }, + { LXPR_CPUINFO, "cpuinfo" }, + { LXPR_DEVICES, "devices" }, + { LXPR_DMA, "dma" }, + { LXPR_FILESYSTEMS, "filesystems" }, + { LXPR_INTERRUPTS, "interrupts" }, + { LXPR_IOPORTS, "ioports" }, + { LXPR_KCORE, "kcore" }, + { LXPR_KMSG, "kmsg" }, + { LXPR_LOADAVG, "loadavg" }, + { LXPR_MEMINFO, "meminfo" }, + { LXPR_MOUNTS, "mounts" }, + { LXPR_NETDIR, "net" }, + { LXPR_PARTITIONS, "partitions" }, + { LXPR_SELF, "self" }, + { LXPR_STAT, "stat" }, + { LXPR_UPTIME, "uptime" }, + { LXPR_VERSION, "version" } +}; + +#define PROCDIRFILES (sizeof (lxpr_dir) / sizeof (lxpr_dir[0])) + +/* + * Contents of an /lxproc/<pid> directory. 
+ */ +static lxpr_dirent_t piddir[] = { + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_STATUS, "status" }, + { LXPR_PID_FDDIR, "fd" } +}; + +#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0])) + +/* + * contents of /lxproc/net directory + */ +static lxpr_dirent_t netdir[] = { + { LXPR_NET_ARP, "arp" }, + { LXPR_NET_DEV, "dev" }, + { LXPR_NET_DEV_MCAST, "dev_mcast" }, + { LXPR_NET_IGMP, "igmp" }, + { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" }, + { LXPR_NET_IP_MR_VIF, "ip_mr_vif" }, + { LXPR_NET_MCFILTER, "mcfilter" }, + { LXPR_NET_NETSTAT, "netstat" }, + { LXPR_NET_RAW, "raw" }, + { LXPR_NET_ROUTE, "route" }, + { LXPR_NET_RPC, "rpc" }, + { LXPR_NET_RT_CACHE, "rt_cache" }, + { LXPR_NET_SOCKSTAT, "sockstat" }, + { LXPR_NET_SNMP, "snmp" }, + { LXPR_NET_STAT, "stat" }, + { LXPR_NET_TCP, "tcp" }, + { LXPR_NET_UDP, "udp" }, + { LXPR_NET_UNIX, "unix" } +}; + +#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0])) + +/* + * These are the major signal number differences between Linux and native: + * + * ==================================== + * | Number | Linux | Native | + * | ====== | ========= | ========== | + * | 7 | SIGBUS | SIGEMT | + * | 10 | SIGUSR1 | SIGBUS | + * | 12 | SIGUSR2 | SIGSYS | + * | 16 | SIGSTKFLT | SIGUSR1 | + * | 17 | SIGCHLD | SIGUSR2 | + * | 18 | SIGCONT | SIGCHLD | + * | 19 | SIGSTOP | SIGPWR | + * | 20 | SIGTSTP | SIGWINCH | + * | 21 | SIGTTIN | SIGURG | + * | 22 | SIGTTOU | SIGPOLL | + * | 23 | SIGURG | SIGSTOP | + * | 24 | SIGXCPU | SIGTSTP | + * | 25 | SIGXFSZ | SIGCONT | + * | 26 | SIGVTALARM | SIGTTIN | + * | 27 | SIGPROF | SIGTTOU | + * | 28 | SIGWINCH | SIGVTALARM | + * | 29 | SIGPOLL | SIGPROF | + * | 30 | SIGPWR | SIGXCPU | + * | 31 | SIGSYS | SIGXFSZ | + * ==================================== + * + * Not every Linux signal maps to a native signal, nor does every native + * signal map to a Linux counterpart. However, when signals do map, the + * mapping is unique. 
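+ * + * The lxpr_sigmap[] table below encodes this mapping: it is indexed by the + * native signal number and each entry holds the corresponding LX_SIG* + * constant, or -1 for native signals (such as SIGWAITING) that have no + * Linux representation. For example, native SIGBUS (10) is presented to + * Linux readers as signal 7, and native SIGUSR1 (16) as signal 10.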
+ */ +static int +lxpr_sigmap[NSIG] = { + 0, + LX_SIGHUP, + LX_SIGINT, + LX_SIGQUIT, + LX_SIGILL, + LX_SIGTRAP, + LX_SIGABRT, + LX_SIGSTKFLT, + LX_SIGFPE, + LX_SIGKILL, + LX_SIGBUS, + LX_SIGSEGV, + LX_SIGSYS, + LX_SIGPIPE, + LX_SIGALRM, + LX_SIGTERM, + LX_SIGUSR1, + LX_SIGUSR2, + LX_SIGCHLD, + LX_SIGPWR, + LX_SIGWINCH, + LX_SIGURG, + LX_SIGPOLL, + LX_SIGSTOP, + LX_SIGTSTP, + LX_SIGCONT, + LX_SIGTTIN, + LX_SIGTTOU, + LX_SIGVTALRM, + LX_SIGPROF, + LX_SIGXCPU, + LX_SIGXFSZ, + -1, /* 32: illumos SIGWAITING */ + -1, /* 33: illumos SIGLWP */ + -1, /* 34: illumos SIGFREEZE */ + -1, /* 35: illumos SIGTHAW */ + -1, /* 36: illumos SIGCANCEL */ + -1, /* 37: illumos SIGLOST */ + -1, /* 38: illumos SIGXRES */ + -1, /* 39: illumos SIGJVM1 */ + -1, /* 40: illumos SIGJVM2 */ + LX_SIGRTMIN, /* 41: illumos _SIGRTMIN */ + LX_SIGRTMIN + 1, + LX_SIGRTMIN + 2, + LX_SIGRTMIN + 3, + LX_SIGRTMIN + 4, + LX_SIGRTMIN + 5, + LX_SIGRTMIN + 6, + LX_SIGRTMIN + 7, + LX_SIGRTMIN + 8, + LX_SIGRTMIN + 9, + LX_SIGRTMIN + 10, + LX_SIGRTMIN + 11, + LX_SIGRTMIN + 12, + LX_SIGRTMIN + 13, + LX_SIGRTMIN + 14, + LX_SIGRTMIN + 15, + LX_SIGRTMIN + 16, + LX_SIGRTMIN + 17, + LX_SIGRTMIN + 18, + LX_SIGRTMIN + 19, + LX_SIGRTMIN + 20, + LX_SIGRTMIN + 21, + LX_SIGRTMIN + 22, + LX_SIGRTMIN + 23, + LX_SIGRTMIN + 24, + LX_SIGRTMIN + 25, + LX_SIGRTMIN + 26, + LX_SIGRTMIN + 27, + LX_SIGRTMIN + 28, + LX_SIGRTMIN + 29, + LX_SIGRTMIN + 30, + LX_SIGRTMAX, +}; + +/* + * lxpr_open(): Vnode operation for VOP_OPEN() + */ +static int +lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *vp = *vpp; + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + vnode_t *rvp; + int error = 0; + + /* + * We only allow reading in this file system + */ + if (flag & FWRITE) + return (EROFS); + + /* + * If we are opening an underlying file, only allow regular files; + * reject the open for anything else. Just do it if we are opening + * the current or root directory. + */ + if (lxpnp->lxpr_realvp != NULL) { + rvp = lxpnp->lxpr_realvp; + + if (type == LXPR_PID_FD_FD && rvp->v_type != VREG) + error = EACCES; + else { + /* + * Need to hold rvp since VOP_OPEN() may release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + if (error) { + VN_RELE(rvp); + } else { + *vpp = rvp; + VN_RELE(vp); + } + } + } + + if (type == LXPR_KMSG) { + ldi_ident_t li = VTOLXPM(vp)->lxprm_li; + struct strioctl str; + int rv; + + /* + * Open the zone's console device using the layered driver + * interface. + */ + if ((error = ldi_open_by_name("/dev/log", FREAD, cr, + &lxpnp->lxpr_cons_ldih, li)) != 0) + return (error); + + /* + * Send an ioctl to the underlying console device, letting it + * know we're interested in getting console messages.
+ */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(lxpnp->lxpr_cons_ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + } + + return (error); +} + + +/* + * lxpr_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpr = VTOLXP(vp); + lxpr_nodetype_t type = lxpr->lxpr_type; + int err; + + /* + * we should never get here because the close is done on the realvp + * for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR && + type != LXPR_PID_EXE); + + if (type == LXPR_KMSG) { + if ((err = ldi_close(lxpr->lxpr_cons_ldih, 0, cr)) != 0) + return (err); + } + + return (0); +} + +static void (*lxpr_read_function[LXPR_NFILES])() = { + lxpr_read_isdir, /* /proc */ + lxpr_read_isdir, /* /proc/<pid> */ + lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */ + lxpr_read_empty, /* /proc/<pid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/cwd */ + lxpr_read_empty, /* /proc/<pid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/exe */ + lxpr_read_pid_maps, /* /proc/<pid>/maps */ + lxpr_read_empty, /* /proc/<pid>/mem */ + lxpr_read_invalid, /* /proc/<pid>/root */ + lxpr_read_pid_stat, /* /proc/<pid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/statm */ + lxpr_read_pid_status, /* /proc/<pid>/status */ + lxpr_read_isdir, /* /proc/<pid>/fd */ + lxpr_read_fd, /* /proc/<pid>/fd/nn */ + lxpr_read_empty, /* /proc/cmdline */ + lxpr_read_cpuinfo, /* /proc/cpuinfo */ + lxpr_read_empty, /* /proc/devices */ + lxpr_read_empty, /* /proc/dma */ + lxpr_read_empty, /* /proc/filesystems */ + lxpr_read_empty, /* /proc/interrupts */ + lxpr_read_empty, /* /proc/ioports */ + lxpr_read_empty, /* /proc/kcore */ + lxpr_read_kmsg, /* /proc/kmsg */ + lxpr_read_loadavg, /* /proc/loadavg */ + lxpr_read_meminfo, /* /proc/meminfo */ + lxpr_read_mounts, /* /proc/mounts */ + lxpr_read_isdir, /* /proc/net */ + lxpr_read_net_arp, /* /proc/net/arp */ + lxpr_read_net_dev, /* /proc/net/dev */ + lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */ + lxpr_read_net_igmp, /* /proc/net/igmp */ + lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */ + lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */ + lxpr_read_net_mcfilter, /* /proc/net/mcfilter */ + lxpr_read_net_netstat, /* /proc/net/netstat */ + lxpr_read_net_raw, /* /proc/net/raw */ + lxpr_read_net_route, /* /proc/net/route */ + lxpr_read_net_rpc, /* /proc/net/rpc */ + lxpr_read_net_rt_cache, /* /proc/net/rt_cache */ + lxpr_read_net_sockstat, /* /proc/net/sockstat */ + lxpr_read_net_snmp, /* /proc/net/snmp */ + lxpr_read_net_stat, /* /proc/net/stat */ + lxpr_read_net_tcp, /* /proc/net/tcp */ + lxpr_read_net_udp, /* /proc/net/udp */ + lxpr_read_net_unix, /* /proc/net/unix */ + lxpr_read_partitions, /* /proc/partitions */ + lxpr_read_invalid, /* /proc/self */ + lxpr_read_stat, /* /proc/stat */ + lxpr_read_uptime, /* /proc/uptime */ + lxpr_read_version, /* /proc/version */ +}; + +/* + * Array of lookup functions, indexed by /lxproc file type. 
+ */ +static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = { + lxpr_lookup_procdir, /* /proc */ + lxpr_lookup_piddir, /* /proc/<pid> */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/status */ + lxpr_lookup_fddir, /* /proc/<pid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/cpuinfo */ + lxpr_lookup_not_a_dir, /* /proc/devices */ + lxpr_lookup_not_a_dir, /* /proc/dma */ + lxpr_lookup_not_a_dir, /* /proc/filesystems */ + lxpr_lookup_not_a_dir, /* /proc/interrupts */ + lxpr_lookup_not_a_dir, /* /proc/ioports */ + lxpr_lookup_not_a_dir, /* /proc/kcore */ + lxpr_lookup_not_a_dir, /* /proc/kmsg */ + lxpr_lookup_not_a_dir, /* /proc/loadavg */ + lxpr_lookup_not_a_dir, /* /proc/meminfo */ + lxpr_lookup_not_a_dir, /* /proc/mounts */ + lxpr_lookup_netdir, /* /proc/net */ + lxpr_lookup_not_a_dir, /* /proc/net/arp */ + lxpr_lookup_not_a_dir, /* /proc/net/dev */ + lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_lookup_not_a_dir, /* /proc/net/igmp */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */ + lxpr_lookup_not_a_dir, /* /proc/net/netstat */ + lxpr_lookup_not_a_dir, /* /proc/net/raw */ + lxpr_lookup_not_a_dir, /* /proc/net/route */ + lxpr_lookup_not_a_dir, /* /proc/net/rpc */ + lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/sockstat */ + lxpr_lookup_not_a_dir, /* /proc/net/snmp */ + lxpr_lookup_not_a_dir, /* /proc/net/stat */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp */ + lxpr_lookup_not_a_dir, /* /proc/net/udp */ + lxpr_lookup_not_a_dir, /* /proc/net/unix */ + lxpr_lookup_not_a_dir, /* /proc/partitions */ + lxpr_lookup_not_a_dir, /* /proc/self */ + lxpr_lookup_not_a_dir, /* /proc/stat */ + lxpr_lookup_not_a_dir, /* /proc/uptime */ + lxpr_lookup_not_a_dir, /* /proc/version */ +}; + +/* + * Array of readdir functions, indexed by /proc file type. 
+ */ +static int (*lxpr_readdir_function[LXPR_NFILES])() = { + lxpr_readdir_procdir, /* /proc */ + lxpr_readdir_piddir, /* /proc/<pid> */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/status */ + lxpr_readdir_fddir, /* /proc/<pid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/cpuinfo */ + lxpr_readdir_not_a_dir, /* /proc/devices */ + lxpr_readdir_not_a_dir, /* /proc/dma */ + lxpr_readdir_not_a_dir, /* /proc/filesystems */ + lxpr_readdir_not_a_dir, /* /proc/interrupts */ + lxpr_readdir_not_a_dir, /* /proc/ioports */ + lxpr_readdir_not_a_dir, /* /proc/kcore */ + lxpr_readdir_not_a_dir, /* /proc/kmsg */ + lxpr_readdir_not_a_dir, /* /proc/loadavg */ + lxpr_readdir_not_a_dir, /* /proc/meminfo */ + lxpr_readdir_not_a_dir, /* /proc/mounts */ + lxpr_readdir_netdir, /* /proc/net */ + lxpr_readdir_not_a_dir, /* /proc/net/arp */ + lxpr_readdir_not_a_dir, /* /proc/net/dev */ + lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_readdir_not_a_dir, /* /proc/net/igmp */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */ + lxpr_readdir_not_a_dir, /* /proc/net/netstat */ + lxpr_readdir_not_a_dir, /* /proc/net/raw */ + lxpr_readdir_not_a_dir, /* /proc/net/route */ + lxpr_readdir_not_a_dir, /* /proc/net/rpc */ + lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/sockstat */ + lxpr_readdir_not_a_dir, /* /proc/net/snmp */ + lxpr_readdir_not_a_dir, /* /proc/net/stat */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp */ + lxpr_readdir_not_a_dir, /* /proc/net/udp */ + lxpr_readdir_not_a_dir, /* /proc/net/unix */ + lxpr_readdir_not_a_dir, /* /proc/partitions */ + lxpr_readdir_not_a_dir, /* /proc/self */ + lxpr_readdir_not_a_dir, /* /proc/stat */ + lxpr_readdir_not_a_dir, /* /proc/uptime */ + lxpr_readdir_not_a_dir, /* /proc/version */ +}; + + +/* + * lxpr_read(): Vnode operation for VOP_READ() + * + * As the format of all the files that can be read in lxproc is human readable + * and not binary structures there do not have to be different read variants + * depending on whether the reading process model is 32- or 64-bit. 
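+ * + * Each read is dispatched through the lxpr_read_function[] table above, + * indexed by the node type of the vnode being read; the handler renders + * its output into an lxpr_uiobuf_t, which is then flushed back to the + * caller.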
+ */ +/* ARGSUSED */ +static int +lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop); + int error; + + ASSERT(type >= 0 && type < LXPR_NFILES); + + lxpr_read_function[type](lxpnp, uiobuf); + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used to mean a file that should be implemented, + * but isn't yet) + * - empty file + * - wait to be able to read a file that will never have anything to read + */ +/* ARGSUSED */ +static void +lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EISDIR); +} + +/* ARGSUSED */ +static void +lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EINVAL); +} + +/* ARGSUSED */ +static void +lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_pid_cmdline(): + * + * This is not precisely compatible with Linux: the Linux cmdline returns argv + * with the correct separation using \0 between the arguments, but we cannot do + * that without copying the real argv from the correct process context. This + * is too difficult to attempt so we pretend that the entire cmdline is just + * argv[0]. This is good enough for ps and htop to display correctly, but might + * cause some other things not to work correctly. + */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + buf = PTOU(p)->u_argv != 0 ? 
PTOU(p)->u_psargs : PTOU(p)->u_comm; + + lxpr_uiobuf_write(uiobuf, buf, strlen(buf) + 1); + lxpr_unlock(p); +} + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + caddr_t saddr; + caddr_t eaddr; + int type; + char prot[5]; + uint32_t offset; + vnode_t *vp; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + vnode_t *vp; + uint_t protbits; + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = seg->s_base; + pbuf->eaddr = seg->s_base+seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = (uint32_t)SEGOP_GETOFFSET(seg, pbuf->saddr); + + pbuf->next = NULL; + *print_tail = pbuf; + print_tail = &pbuf->next; + } + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + u_longlong_t inode = 0; + + *buf = '\0'; + if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(), + NULL) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (*buf != '\0') { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld %s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize; + size_t rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = 
btopr(as->a_resvsize); + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * lxpr_read_pid_status(): status file + */ +static void +lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; + int ngroups; + struct as *as; + char *status; + pid_t pid, ppid; + size_t vsize; + size_t rss; + k_sigset_t current, ignore, handle; + int i, lx_sig; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Convert pid to the Linux default of 1 if we're the zone's init + * process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. 
We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + lxpr_uiobuf_printf(uiobuf, + "Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%u\t%u\t%u\t%u\n" + "Gid:\t%u\t%u\t%u\t%u\n" + "FDSize:\t%d\n" + "Groups:\t", + up->u_comm, + status, + pid, /* thread group id - same as pid */ + pid, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + p->p_fno_ctl); + + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%u ", + groups[i]); + } + crfree(cr); + + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & SSYS) && (as != &kas)) { + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + 0l, + ptok(rss), + 0l, + btok(p->p_stksize), + ptok(rss), + 0l); + } + + sigemptyset(¤t); + sigemptyset(&ignore); + sigemptyset(&handle); + + for (i = 1; i < NSIG; i++) { + lx_sig = lxpr_sigmap[i]; + + if ((lx_sig > 0) && (lx_sig < LX_NSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(¤t, lx_sig); + + if (up->u_signal[i - 1] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i - 1] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + + lxpr_uiobuf_printf(uiobuf, + "\n" + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n" + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* signals blocked on per thread basis */ + ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0], + /* Can't do anything with linux capabilities */ + 0, + 0, + 0); + + lxpr_unlock(p); +} + + +/* + * lxpr_read_pid_stat(): pid stat file + */ +static void +lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + struct as *as; + char stat; + pid_t pid, ppid, pgpid, spid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri; + caddr_t wchan; + processorid_t cpu; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Set Linux defaults if we're the zone's init process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; /* PID for init */ + ppid = 0; /* parent PID for init is 0 */ + pgpid = 0; /* process group for init is 0 */ + psgid = (gid_t)-1; /* credential GID for init is -1 */ + spid = 0; /* session id for init is 0 */ + psdev = 0; /* session device for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) ? 
+ curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + + pgpid = p->p_pgrp; + + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + psdev = p->p_sessp->s_dev; + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; break; + case TS_ZOMB: + stat = 'Z'; break; + case TS_STOPPED: + stat = 'T'; break; + default: + stat = '!'; break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_id; + thread_unlock(t); + } else { + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "%d (%s) %c %d %d %d %d %d " + "%lu %lu %lu %lu %lu " + "%lu %lu %ld %ld " + "%d %d %d " + "%lu " + "%lu " + "%lu %ld %llu " + "%lu %lu %u " + "%lu %lu " + "%lu %lu %lu %lu " + "%lu " + "%lu %lu " + "%d " + "%d" + "\n", + pid, PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid, + 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */ + p->p_utime, p->p_stime, p->p_cutime, p->p_cstime, + pri, nice, p->p_lwpcnt, + 0l, /* itrealvalue (time before next SIGALRM) */ + PTOU(p)->u_ticks, + vsize, rss, p->p_vmem_ctl, + 0l, 0l, USRSTACK, /* startcode, endcode, startstack */ + 0l, 0l, /* kstkesp, kstkeip */ + 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */ + wchan, + 0l, 0l, /* nswap, cnswap */ + 0, /* exit_signal */ + cpu); + + lxpr_unlock(p); +} + +/* ARGSUSED */ +static void +lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + /* + * Data about each interface should go here, but that shouldn't be added + * unless there is an lxproc reader that actually makes use of it (and + * doesn't need anything else that we refuse to provide)... 
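+ * + * If such data is ever added, each interface would presumably get one + * row matching the header printed above: the interface name followed by + * the eight receive counters and the eight transmit counters.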
+ */ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced. + */ + +#define LX_KMSG_PRI "<0>" + +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf) +{ + ldi_handle_t lh = lxpnp->lxpr_cons_ldih; + mblk_t *mp; + + if (ldi_getmsg(lh, &mp, NULL) == 0) { + /* + * lxproc doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just + * enough for uptime and other simple lxproc readers to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + zone_t *zone = LXPTOZ(lxpnp); + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. 
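+ * + * The line we emit mimics Linux /proc/loadavg: the three load averages, + * a running/total task count, and a trailing 0 in place of what Linux + * reports as the most recently allocated pid.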
+ */ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + ASSERT(curproc->p_zone != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = zone == global_zone ? + &avenrun[0] : zone->zone_avenrun; + } + + /* + * If we're in the non-global zone, we'll report the total number of + * LWPs in the zone for the "nproc" parameter of /proc/loadavg, + * otherwise will just use nthread (which will include kernel threads, + * but should be good enough for lxproc). + */ + nlwps = zone == global_zone ? nthread : zone->zone_nlwps; + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. + */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + int global = zone == global_zone; + long total_mem, free_mem, total_swap, used_swap; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + + if (global || zone->zone_phys_mem_ctl == UINT64_MAX) { + total_mem = physmem * PAGESIZE; + free_mem = freemem * PAGESIZE; + } else { + total_mem = zone->zone_phys_mem_ctl; + free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem; + } + + if (global || zone->zone_max_swap_ctl == UINT64_MAX) { + total_swap = k_anoninfo.ani_max * PAGESIZE; + used_swap = k_anoninfo.ani_phys_resv * PAGESIZE; + } else { + mutex_enter(&zone->zone_mem_lock); + total_swap = zone->zone_max_swap_ctl; + used_swap = zone->zone_max_swap; + mutex_exit(&zone->zone_mem_lock); + } + + lxpr_uiobuf_printf(uiobuf, + " total: used: free: shared: buffers: cached:\n" + "Mem: %8lu %8lu %8lu %8u %8u %8u\n" + "Swap: %8lu %8lu %8lu\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached:%8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + total_mem, total_mem - free_mem, free_mem, 0, 0, 0, + total_swap, used_swap, total_swap - used_swap, + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap - used_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): + */ +/* ARGSUSED */ +static void +lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + struct vfs *vfsp; + struct vfs *vfslist; + zone_t *zone = LXPTOZ(lxpnp); + struct print_data { + refstr_t *vfs_mntpt; + refstr_t *vfs_resource; + uint_t vfs_flag; + int vfs_fstype; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + 
struct print_data *printp; + + vfs_list_read_lock(); + + if (zone == global_zone) { + vfsp = vfslist = rootvfs; + } else { + vfsp = vfslist = zone->zone_vfslist; + /* + * If the zone has a root entry, it will be the first in + * the list. If it doesn't, we conjure one up. + */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + struct vfs *tvfsp; + /* + * The root of the zone is not a mount point. The vfs + * we want to report is that of the zone's root vnode. + */ + tvfsp = zone->zone_rootvp->v_vfsp; + + lxpr_uiobuf_printf(uiobuf, + "/ / %s %s 0 0\n", + vfssw[tvfsp->vfs_fstype].vsw_name, + tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + + } + if (vfslist == NULL) { + vfs_list_unlock(); + return; + } + } + + /* + * Later on we have to do a lookupname, which can end up causing + * another vfs_list_read_lock() to be called. Which can lead to a + * deadlock. To avoid this, we extract the data we need into a local + * list, then we can run this list without holding vfs_list_read_lock() + * We keep the list in the same order as the vfs_list + */ + do { + /* Skip mounts we shouldn't show */ + if (vfsp->vfs_flag & VFS_NOMNTTAB) { + goto nextfs; + } + + printp = kmem_alloc(sizeof (*printp), KM_SLEEP); + refstr_hold(vfsp->vfs_mntpt); + printp->vfs_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_resource); + printp->vfs_resource = vfsp->vfs_resource; + printp->vfs_flag = vfsp->vfs_flag; + printp->vfs_fstype = vfsp->vfs_fstype; + printp->next = NULL; + + *print_tail = printp; + print_tail = &printp->next; + +nextfs: + vfsp = (zone == global_zone) ? + vfsp->vfs_next : vfsp->vfs_zone_next; + + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + printp = print_head; + while (printp != NULL) { + struct print_data *printp_next; + const char *resource; + char *mntpt; + struct vnode *vp; + int error; + + mntpt = (char *)refstr_value(printp->vfs_mntpt); + resource = refstr_value(printp->vfs_resource); + + if (mntpt != NULL && mntpt[0] != '\0') + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + else + mntpt = "-"; + + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + + if (error != 0) + goto nextp; + + if (!(vp->v_flag & VROOT)) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : + mntpt; + } + } else { + resource = "-"; + } + + lxpr_uiobuf_printf(uiobuf, + "%s %s %s %s 0 0\n", + resource, mntpt, vfssw[printp->vfs_fstype].vsw_name, + printp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + +nextp: + printp_next = printp->next; + refstr_rele(printp->vfs_mntpt); + refstr_rele(printp->vfs_resource); + kmem_free(printp, sizeof (*printp)); + printp = printp_next; + + } +} + +/* + * lxpr_read_partitions(): + * + * We don't support partitions in a local zone because it requires access to + * physical devices. But we need to fake up enough of the file to show that we + * have no partitions. + */ +/* ARGSUSED */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "major minor #blocks name rio rmerge rsect ruse " + "wio wmerge wsect wuse running use aveq\n\n"); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. Note that + * we don't lie here -- we don't pretend that we're Linux. 
If lxproc is to + * be used in a Linux-branded zone, there will need to be a mount option to + * indicate that Linux should be more fully mimicked. + */ +/* ARGSUSED */ +static void +lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "%s version %s (%s version %d.%d.%d) " + "#%s SMP %s\n", + utsname.sysname, utsname.release, +#if defined(__GNUC__) + "gcc", + __GNUC__, + __GNUC_MINOR__, + __GNUC_PATCHLEVEL__, +#else + "Sun C", + __SUNPRO_C / 0x100, + (__SUNPRO_C & 0xff) / 0x10, + __SUNPRO_C & 0xf, +#endif + utsname.version, + "00:00:00 00/00/00"); +} + +/* + * lxpr_read_stat(): read the contents of the "stat" file. + * + */ +/* ARGSUSED */ +static void +lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t sys_cum = 0; + ulong_t user_cum = 0; + ulong_t irq_cum = 0; + uint_t cpu_nrunnable_cum = 0; + uint_t w_io_cum = 0; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + ulong_t intr_cum = 0; + ulong_t pswitch_cum = 0; + ulong_t forks_cum = 0; + hrtime_t msnsecs[NCMSTATES]; + + /* temporary variable since scalehrtime modifies data in place */ + hrtime_t tmptime; + + ASSERT(lxpnp->lxpr_type == LXPR_STAT); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + int i; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]); + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable; + w_io_cum += CPU_STATS(cp, sys.iowait); + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_cum += NSEC_TO_TICK(tmptime); + } + + for (i = 0; i < PIL_MAX; i++) + intr_cum += CPU_STATS(cp, sys.intr[i]); + + pswitch_cum += CPU_STATS(cp, sys.pswitch); + forks_cum += CPU_STATS(cp, sys.sysfork); + forks_cum += CPU_STATS(cp, sys.sysvfork); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + lxpr_uiobuf_printf(uiobuf, "cpu %ld %ld %ld %ld %ld %ld %ld\n", + user_cum, 0, sys_cum, idle_cum, 0, irq_cum, 0); + + /* Do per processor stats */ + do { + int i; + + ulong_t idle_ticks; + ulong_t sys_ticks; + ulong_t user_ticks; + ulong_t irq_ticks = 0; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]); + + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_ticks += NSEC_TO_TICK(tmptime); + } + + lxpr_uiobuf_printf(uiobuf, + "cpu%d %ld %ld %ld %ld %ld %ld %ld\n", + cp->cpu_id, user_ticks, 0, sys_ticks, idle_ticks, + 0, irq_ticks, 0); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum, + cpu_nrunnable_cum, + w_io_cum); +} + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. + * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +/* ARGSUSED */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + hrtime_t birthtime; + hrtime_t centi_sec = 10000000; /* 10^7 */ + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Getting the Zone zsched process startup time */ + birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart; + up_cs = (gethrtime() - birthtime) / centi_sec; + up_s = up_cs / 100; + up_cs %= 100; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +static const char *amd_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", + "nx", NULL, "mmxext", NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", "3dnowext", "3dnow" +}; + +static const char *amd_x_ecx[] = { + "lahf_lm", NULL, "svm", NULL, + "altmovcr8" +}; + +static const char *tm_x_edx[] = { + "recovery", "longrun", NULL, "lrti" +}; + +/* + * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx." 
+ */ +static const char *intc_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "nx", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", NULL, NULL +}; + +static const char *intc_edx[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", + NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "ht", "tm", "ia64", "pbe" +}; + +/* + * "sse3" on linux is called "pni" (Prescott New Instructions). + */ +static const char *intc_ecx[] = { + "pni", NULL, NULL, "monitor", + "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, + NULL, "cx16", "xtpr" +}; + +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + uint32_t bits; + cpu_t *cp, *cpstart; + int pools_enabled; + const char **fp; + char brandstr[CPU_IDSTRLEN]; + struct cpuid_regs cpr; + int maxeax; + int std_ecx, std_edx, ext_ecx, ext_edx; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * This returns the maximum eax value for standard cpuid + * functions in eax. + */ + cpr.cp_eax = 0; + (void) cpuid_insn(cp, &cpr); + maxeax = cpr.cp_eax; + + /* + * Get standard x86 feature flags. + */ + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + std_ecx = cpr.cp_ecx; + std_edx = cpr.cp_edx; + + /* + * Now get extended feature flags. + */ + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + ext_ecx = cpr.cp_ecx; + ext_edx = cpr.cp_edx; + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", + pg_plat_hw_instance_id(cp, PGHW_CHIP), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to these ones. + * + * In fact, given the hardware we support, we just say: + * + * fdiv_bug : no (if we're on a 64-bit kernel) + * hlt_bug : no + * f00f_bug : no + * coma_bug : no + * wp : yes (write protect in supervsr mode) + */ + lxpr_uiobuf_printf(uiobuf, + "fdiv_bug\t: %s\n" + "hlt_bug \t: no\n" + "f00f_bug\t: no\n" + "coma_bug\t: no\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "flags\t\t:", +#if defined(__i386) + fpu_pentium_fdivbug ? "yes" : "no", +#else + "no", +#endif /* __i386 */ + fpu_exists ? "yes" : "no", fpu_exists ? 
"yes" : "no", + maxeax); + + for (bits = std_edx, fp = intc_edx, i = 0; + i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + /* + * name additional features where appropriate + */ + switch (x86_vendor) { + case X86_VENDOR_Intel: + for (bits = ext_edx, fp = intc_x_edx, i = 0; + i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_AMD: + for (bits = ext_edx, fp = amd_x_edx, i = 0; + i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + for (bits = ext_ecx, fp = amd_x_ecx, i = 0; + i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_TM: + for (bits = ext_edx, fp = tm_x_edx, i = 0; + i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + default: + break; + } + + for (bits = std_ecx, fp = intc_ecx, i = 0; + i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + lxpr_uiobuf_printf(uiobuf, "\n\n"); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); +} + +/* ARGSUSED */ +static void +lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD); + lxpr_uiobuf_seterr(uiobuf, EFAULT); +} + +/* + * lxpr_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + extern uint_t nproc; + int error; + + /* + * Return attributes of underlying vnode if ATTR_REAL + * + * but keep fd files with the symlink permissions + */ + if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) { + vnode_t *rvp = lxpnp->lxpr_realvp; + + /* + * withold attribute information to owner or root + */ + if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) { + return (error); + } + + /* + * now its attributes + */ + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) { + return (error); + } + + /* + * if it's a file in lx /proc/pid/fd/xx then set its + * mode and keep it looking like a symlink + */ + if (type == LXPR_PID_FD_FD) { + vap->va_mode = lxpnp->lxpr_mode; + vap->va_type = vp->v_type; + vap->va_size = 0; + vap->va_nlink = 1; + } + return (0); + } + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxpnp->lxpr_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxpnp->lxpr_uid; + vap->va_gid = lxpnp->lxpr_gid; + vap->va_nodeid = lxpnp->lxpr_ino; + + switch (type) { + case LXPR_PROCDIR: + vap->va_nlink = nproc + 2 + PROCDIRFILES; + vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE; + break; + case LXPR_PIDDIR: + vap->va_nlink = PIDDIRFILES; + vap->va_size = PIDDIRFILES * LXPR_SDSIZE; + break; + case LXPR_SELF: + vap->va_uid = crgetruid(curproc->p_cred); + vap->va_gid = crgetrgid(curproc->p_cred); + break; + default: + break; + } + 
+ vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxpr_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + int shift = 0; + proc_t *tp; + + /* lx /proc is a read only file system */ + if (mode & VWRITE) + return (EROFS); + + /* + * If this is a restricted file, check access permissions. + */ + switch (lxpnp->lxpr_type) { + case LXPR_PIDDIR: + return (0); + case LXPR_PID_CURDIR: + case LXPR_PID_ENV: + case LXPR_PID_EXE: + case LXPR_PID_MAPS: + case LXPR_PID_MEM: + case LXPR_PID_ROOTDIR: + case LXPR_PID_FDDIR: + case LXPR_PID_FD_FD: + if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL) + return (ENOENT); + if (tp != curproc && secpolicy_proc_access(cr) != 0 && + priv_proc_cred_perm(cr, tp, NULL, mode) != 0) { + lxpr_unlock(tp); + return (EACCES); + } + lxpr_unlock(tp); + default: + break; + } + + if (lxpnp->lxpr_realvp != NULL) { + /* + * For these we use the underlying vnode's accessibility. + */ + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct)); + } + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. + */ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* ARGSUSED */ +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type >= 0 && type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? 
ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp)->lxpr_pid); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + +/* + * Lookup one of the process's open files. + */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + vnode_t *vp = NULL; + proc_t *p; + file_t *fp; + uint_t fd; + int c; + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR); + + /* + * convert the string rendition of the filename + * to a file descriptor + */ + fd = 0; + while ((c = *comp++) != '\0') { + int ofd; + if (c < '0' || c > '9') + return (NULL); + + ofd = fd; + fd = 10*fd + c - '0'; + /* integer overflow */ + if (fd / 10 != ofd) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock(dlxpnp->lxpr_pid); + if ((p == NULL)) + return (NULL); + + /* + * If the process is a zombie or system process + * it can't have any open files. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + /* + * get us a fresh node/vnode + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd); + + /* + * get open file info + */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + /* + * got the fd data so now done with this proc + */ + lxpr_unlock(p); + + if (fd < fip->fi_nfiles) { + UF_ENTER(ufp, fip, fd); + /* + * ensure the fd is still kosher. + * it may have gone between the readdir and + * the lookup + */ + if (fip->fi_list[fd].uf_file == NULL) { + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp); + lxpr_freenode(lxpnp); + return (NULL); + } + + if ((fp = ufp->uf_file) != NULL) + vp = fp->f_vnode; + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); + + if (vp == NULL) { + lxpr_freenode(lxpnp); + return (NULL); + } else { + /* + * Fill in the lxpr_node so future references will be able to + * find the underlying vnode. The vnode is held on the realvp. + */ + lxpnp->lxpr_realvp = vp; + VN_HOLD(lxpnp->lxpr_realvp); + } + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); +} + +static vnode_t * +lxpr_lookup_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR); + + dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES); + + return (dp); +} + +static vnode_t * +lxpr_lookup_procdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR); + + /* + * We know all the names of files & dirs in our file system structure + * except those that are pid names. These change as pids are created/ + * deleted etc., so we just look for a number as the first char to see + * if we are doing pid lookups.
+ * + * Don't need to check for "self" as it is implemented as a symlink + */ + if (*comp >= '0' && *comp <= '9') { + pid_t pid = 0; + lxpr_node_t *lxpnp = NULL; + proc_t *p; + int c; + + while ((c = *comp++) != '\0') + pid = 10 * pid + c - '0'; + + /* + * Can't continue if the process is still loading or it doesn't + * really exist yet (or maybe it just died!) + */ + p = lxpr_lock(pid); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in a new lxpr node + */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0); + + lxpr_unlock(p); + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lxpr_dir, PROCDIRFILES)); +} + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type >= 0 && type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! */ + if (uoffset % LXPR_SDSIZE) + return (ENOENT); + + return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp)); +} + +/* ARGSUSED */ +static int +lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (ENOTDIR); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Satisfy user request + */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXPR_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxpnp->lxpr_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXPR_SDSIZE) { + + dirent->d_ino = lxpr_parentinode(lxpnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex < dirtablen) { + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type, + lxpnp->lxpr_pid, 0); + + ASSERT(slen < LXPNSIZ); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size 
of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); + + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + /* Have run out of space, but could have just done last table entry */ + if (eofp) { + *eofp = + (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0; + } + return (0); +} + + +static int +lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + zoneid_t zoneid; + pid_t pid; + int error; + int ceof; + + ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR); + + oresid = uiop->uio_resid; + zoneid = LXPTOZ(lxpnp)->zone_id; + + /* + * We return directory entries in the order: "." and ".." then the + * unique lxproc files, then the directories corresponding to the + * running processes. We have defined this as the ordering because + * it allows us to more easily keep track of where we are betwen calls + * to getdents(). If the number of processes changes between calls + * then we can't lose track of where we are in the lxproc files. + */ + + /* Do the fixed entries */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, lxpr_dir, + PROCDIRFILES); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Do the process entries */ + while ((uresid = uiop->uio_resid) > 0) { + proc_t *p; + int len; + int reclen; + int i; + + uoffset = uiop->uio_offset; + + /* + * Stop when entire proc table has been examined. + */ + i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES; + if (i >= v.v_proc) { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + mutex_enter(&pidlock); + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, a PID of 0, + * and anything the security policy doesn't allow + * us to look at. + */ + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == 0 || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + goto next; + } + mutex_exit(&pidlock); + + /* + * Convert pid to the Linux default of 1 if we're the zone's + * init process, otherwise use the value from the proc + * structure + */ + pid = ((p->p_pid != curproc->p_zone->zone_proc_initpid) ? + p->p_pid : 1); + + /* + * If this /proc was mounted in the global zone, view + * all procs; otherwise, only view zone member procs. 
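Both lxpr_readdir_common() and lxpr_readdir_procdir() above rely on the convention that every directory entry, whatever its actual record length, advances uio_offset by one fixed LXPR_SDSIZE (16-byte) slot: slots 0 and 1 name "." and "..", the next PROCDIRFILES slots are the fixed /proc files, and the remaining slots index the process table. A small illustrative helper (not part of the source) showing the slot arithmetic:

	#define	LXPR_SDSIZE	16	/* mirrors the header definition */

	/*
	 * Map a /proc directory offset to a process-table index, or return -1
	 * if the offset names ".", ".." or one of the fixed entries.
	 */
	static int
	procdir_offset_to_slot(off_t uoffset, int procdirfiles)
	{
		int slot;

		if (uoffset % LXPR_SDSIZE != 0)
			return (-1);		/* not a valid entry offset */
		slot = (uoffset / LXPR_SDSIZE) - 2;
		if (slot < procdirfiles)
			return (-1);		/* ".", ".." or a fixed file */
		return (slot - procdirfiles);	/* index into the proc table */
	}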
+ */ + if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) { + goto next; + } + + ASSERT(p->p_stat != 0); + + dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp != NULL) { + *eofp = (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0; + } + + return (0); +} + +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + mutex_enter(&pidlock); + + p = prfind((lxpnp->lxpr_pid == 1) ? + curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES)); +} + +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error; + int ceof; + proc_t *p; + int fddirsize; + uf_info_t *fip; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR); + + oresid = uiop->uio_resid; + + /* can't read its contents if it died */ + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) + return (ENOENT); + + /* Get open file info */ + fip = (&(p)->p_user.u_finfo); + + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) { + fddirsize = 0; + } else { + fddirsize = fip->fi_nfiles; + } + + mutex_enter(&fip->fi_lock); + lxpr_unlock(p); + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until + * all file descriptors have been examined. 
+ */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + } + + if (eofp != NULL) { + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0; + } + +out: + mutex_exit(&fip->fi_lock); + return (error); +} + + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char bp[MAXPATHLEN + 1]; + size_t buflen = sizeof (bp); + lxpr_node_t *lxpnp = VTOLXP(vp); + vnode_t *rvp = lxpnp->lxpr_realvp; + pid_t pid; + int error = 0; + + /* must be a symbolic link file */ + if (vp->v_type != VLNK) + return (EINVAL); + + /* Try to produce a symlink name for anything that has a realvp */ + if (rvp != NULL) { + if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0) + return (error); + if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0) + return (error); + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* + * Convert pid to the Linux default of 1 if we're the + * zone's init process + */ + pid = ((curproc->p_pid != + curproc->p_zone->zone_proc_initpid) + ? curproc->p_pid : 1); + + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes. + */ + (void) snprintf(bp, buflen, "%d", pid); + break; + case LXPR_PID_CURDIR: + case LXPR_PID_ROOTDIR: + case LXPR_PID_EXE: + return (EACCES); + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + return (EINVAL); + } + } + + /* copy the link data to user space */ + return (uiomove(bp, strlen(bp), UIO_READ, uiop)); +} + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. 
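lxpr_readlink() above resolves /proc/self by formatting the caller's pid (mapped to 1 for the zone's init process) and resolves entries backed by a realvp through vnodetopath(). Assuming an lxproc instance is mounted at /proc inside the zone, the behaviour can be exercised from userland with an ordinary readlink(2); this small program is purely a usage sketch:

	#include <stdio.h>
	#include <unistd.h>
	#include <limits.h>

	int
	main(void)
	{
		char buf[PATH_MAX];
		ssize_t n;

		/* the link target is the caller's Linux-visible pid */
		n = readlink("/proc/self", buf, sizeof (buf) - 1);
		if (n < 0) {
			perror("readlink");
			return (1);
		}
		buf[n] = '\0';
		(void) printf("self -> %s\n", buf);
		return (0);
	}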
+ */ +/* ARGSUSED */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxpr_freenode(VTOLXP(vp)); +} + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxproc.h b/usr/src/uts/common/fs/lxproc/lxproc.h new file mode 100644 index 0000000000..a06bef1570 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxproc.h @@ -0,0 +1,275 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _LXPROC_H +#define _LXPROC_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxproc.h: declarations, data structures and macros for lxprocfs + */ +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> + +#define LX_SIGHUP 1 +#define LX_SIGINT 2 +#define LX_SIGQUIT 3 +#define LX_SIGILL 4 +#define LX_SIGTRAP 5 +#define LX_SIGABRT 6 +#define LX_SIGIOT 6 +#define LX_SIGBUS 7 +#define LX_SIGFPE 8 +#define LX_SIGKILL 9 +#define LX_SIGUSR1 10 +#define LX_SIGSEGV 11 +#define LX_SIGUSR2 12 +#define LX_SIGPIPE 13 +#define LX_SIGALRM 14 +#define LX_SIGTERM 15 +#define LX_SIGSTKFLT 16 +#define LX_SIGCHLD 17 +#define LX_SIGCONT 18 +#define LX_SIGSTOP 19 +#define LX_SIGTSTP 20 +#define LX_SIGTTIN 21 +#define LX_SIGTTOU 22 +#define LX_SIGURG 23 +#define LX_SIGXCPU 24 +#define LX_SIGXFSZ 25 +#define LX_SIGVTALRM 26 +#define LX_SIGPROF 27 +#define LX_SIGWINCH 28 +#define LX_SIGIO 29 +#define LX_SIGPOLL LX_SIGIO +#define LX_SIGPWR 30 +#define LX_SIGSYS 31 +#define LX_SIGUNUSED 31 + +#define LX_NSIG_WORDS 2 +#define LX_NBPW 32 +#define LX_NSIG ((LX_NBPW * LX_NSIG_WORDS) + 1) + +#define LX_SIGRTMIN 32 +#define LX_SIGRTMAX LX_NSIG - 1 + +/* + * Convert a vnode into an lxpr_mnt_t + */ +#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxpr_node + */ +#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data) + +/* + * convert a lxprnode into a vnode + */ +#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode) + +/* + * convert a lxpr_node into zone for fs + */ +#define LXPTOZ(lxpnp) \ + (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone) + +#define LXPNSIZ 256 /* max size of lx /proc file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXPR_SDSIZE 16 + +/* + * Node/file types for lx /proc files + * (directories and files contained therein). 
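The conversion macros defined above (VTOLXPM, VTOLXP, LXPTOV, LXPTOZ) tie an lxproc vnode to its private lxpr_node_t, its mount and the zone that mount serves. A hedged sketch of how they compose inside a vnode operation; the helper name is illustrative only:

	/*
	 * Given an lxproc vnode, return the id of the zone this file system
	 * instance was mounted for.
	 */
	static zoneid_t
	lxpr_vp_zoneid(vnode_t *vp)
	{
		lxpr_node_t *lxpnp = VTOLXP(vp);	/* from v_data */
		lxpr_mnt_t *lxprm = VTOLXPM(vp);	/* from vfs_data */

		ASSERT(LXPTOV(lxpnp) == vp);		/* macros invert */
		ASSERT(LXPTOZ(lxpnp) == lxprm->lxprm_zone);
		return (lxprm->lxprm_zone->zone_id);
	}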
+ */ +typedef enum lxpr_nodetype { + LXPR_PROCDIR, /* /proc */ + LXPR_PIDDIR, /* /proc/<pid> */ + LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */ + LXPR_PID_CPU, /* /proc/<pid>/cpu */ + LXPR_PID_CURDIR, /* /proc/<pid>/cwd */ + LXPR_PID_ENV, /* /proc/<pid>/environ */ + LXPR_PID_EXE, /* /proc/<pid>/exe */ + LXPR_PID_MAPS, /* /proc/<pid>/maps */ + LXPR_PID_MEM, /* /proc/<pid>/mem */ + LXPR_PID_ROOTDIR, /* /proc/<pid>/root */ + LXPR_PID_STAT, /* /proc/<pid>/stat */ + LXPR_PID_STATM, /* /proc/<pid>/statm */ + LXPR_PID_STATUS, /* /proc/<pid>/status */ + LXPR_PID_FDDIR, /* /proc/<pid>/fd */ + LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */ + LXPR_CMDLINE, /* /proc/cmdline */ + LXPR_CPUINFO, /* /proc/cpuinfo */ + LXPR_DEVICES, /* /proc/devices */ + LXPR_DMA, /* /proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UNIX, /* /proc/net/unix */ + LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * external dirent characteristics + */ +#define LXPRMAXNAMELEN 14 +typedef struct { + lxpr_nodetype_t d_type; + char d_name[LXPRMAXNAMELEN]; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of proc referred to */ + ino_t lxpr_ino; /* node id */ + ldi_handle_t lxpr_cons_ldih; /* ldi handle for console device */ +} lxpr_node_t; + +struct zone; /* forward declaration */ + +/* + * This is the lxprocfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxpr_mnt { + lxpr_node_t *lxprm_node; /* node at root of proc mount */ + struct zone *lxprm_zone; /* zone for this mount */ + ldi_ident_t lxprm_li; /* ident for ldi */ +} lxpr_mnt_t; + +extern vnodeops_t *lxpr_vnodeops; +extern int nproc_highbit; /* highbit(v.v_nproc) */ + +typedef 
struct mounta mounta_t; + +extern void lxpr_initnodecache(); +extern void lxpr_fininodecache(); +extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *); +extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int); +extern ino_t lxpr_parentinode(lxpr_node_t *); +extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int); +extern void lxpr_freenode(lxpr_node_t *); + +typedef struct lxpr_uiobuf lxpr_uiobuf_t; +extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *); +extern void lxpr_uiobuf_free(lxpr_uiobuf_t *); +extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t); +extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t); +extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...); +extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int); + +proc_t *lxpr_lock(pid_t); +void lxpr_unlock(proc_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LXPROC_H */ diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c index d31b53d2e9..4e5882ad7c 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c @@ -28,6 +28,10 @@ * All rights reserved. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/types.h> #include <sys/systm.h> @@ -2298,6 +2302,12 @@ top: vattr.va_mask = AT_SIZE; error = nfs3setattr(vp, &vattr, 0, cr); + + /* + * Existing file was truncated; + * emit a create event. + */ + vnevent_create(vp, ct); } } } @@ -2306,12 +2316,9 @@ top: if (error) { VN_RELE(vp); } else { - /* - * existing file got truncated, notify. - */ - vnevent_create(vp, ct); *vpp = vp; } + return (error); } diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c index 5ae2c28d53..f05a0717d9 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c @@ -31,6 +31,10 @@ * All Rights Reserved */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/types.h> #include <sys/systm.h> @@ -6653,16 +6657,20 @@ top: } else { vnode_t *tvp; rnode4_t *trp; - /* - * existing file got truncated, notify. - */ tvp = vp; if (vp->v_type == VREG) { trp = VTOR4(vp); if (IS_SHADOW(vp, trp)) tvp = RTOV4(trp); } - vnevent_create(tvp, ct); + + if (must_trunc) { + /* + * existing file got truncated, notify. + */ + vnevent_create(tvp, ct); + } + *vpp = vp; } return (error); diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c index a0abad0700..22d1ad4d68 100644 --- a/usr/src/uts/common/fs/nfs/nfs_server.c +++ b/usr/src/uts/common/fs/nfs/nfs_server.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Bayard G. Bell. All rights reserved. + * Copyright (c) 2012 Joyent, Inc. All rights reserved. * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ @@ -2521,6 +2522,9 @@ nfs_srvinit(void) { int error; + if (getzoneid() != GLOBAL_ZONEID) + return (EACCES); + error = nfs_exportinit(); if (error != 0) return (error); diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c index a3f43a4e95..fa31e3693f 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c @@ -25,6 +25,10 @@ * All rights reserved. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
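The lxpr_uiobuf_*() helpers declared in lxproc.h above wrap a uio_t so that per-file read handlers can format output with printf-style calls and record errors without touching the uio directly. A hedged sketch of how a handler would typically consume that interface; the handler and its output are illustrative and not part of this change:

	static void
	lxpr_read_example(lxpr_uiobuf_t *uiobuf)
	{
		/* format directly into the uio-backed buffer */
		lxpr_uiobuf_printf(uiobuf, "%d.%02d %d.%02d %d.%02d\n",
		    0, 15, 0, 10, 0, 5);

		/* a handler that hits a failure would instead record it: */
		/* lxpr_uiobuf_seterr(uiobuf, EIO); */
	}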
+ */ + #include <sys/param.h> #include <sys/types.h> #include <sys/systm.h> @@ -2030,6 +2034,14 @@ nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, vp->v_type == VREG) { vattr.va_mask = AT_SIZE; error = nfssetattr(vp, &vattr, 0, cr); + + if (!error) { + /* + * Existing file was truncated; + * emit a create event. + */ + vnevent_create(vp, ct); + } } } } @@ -2037,10 +2049,6 @@ nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, if (error) { VN_RELE(vp); } else { - /* - * existing file got truncated, notify. - */ - vnevent_create(vp, ct); *vpp = vp; } return (error); diff --git a/usr/src/uts/common/fs/portfs/port_fop.c b/usr/src/uts/common/fs/portfs/port_fop.c index 2852a98f52..48792394a5 100644 --- a/usr/src/uts/common/fs/portfs/port_fop.c +++ b/usr/src/uts/common/fs/portfs/port_fop.c @@ -23,6 +23,9 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ /* * File Events Notification @@ -1965,7 +1968,9 @@ port_fop(vnode_t *vp, int op, int retval) if (op & FOP_ATTRIB_MASK) { event |= FILE_ATTRIB; } - + if (op & FOP_TRUNC_MASK) { + event |= FILE_TRUNC; + } if (event) { port_fop_sendevent(vp, event, NULL, NULL); } @@ -2147,6 +2152,9 @@ port_fop_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, int events = 0; retval = vnext_setattr(vf, vap, flags, cr, ct); + if (vap->va_mask & AT_SIZE) { + events |= FOP_FILE_TRUNC; + } if (vap->va_mask & (AT_SIZE|AT_MTIME)) { events |= FOP_FILE_SETATTR_MTIME; } @@ -2322,8 +2330,8 @@ port_fop_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *name, port_fop_sendevent(vp, FILE_DELETE, dvp, name); break; case VE_CREATE: - port_fop_sendevent(vp, FILE_MODIFIED|FILE_ATTRIB, - NULL, NULL); + port_fop_sendevent(vp, + FILE_MODIFIED|FILE_ATTRIB|FILE_TRUNC, NULL, NULL); break; case VE_LINK: port_fop_sendevent(vp, FILE_ATTRIB, NULL, NULL); diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c index 55a48bb2cc..53709139cc 100644 --- a/usr/src/uts/common/fs/proc/prcontrol.c +++ b/usr/src/uts/common/fs/proc/prcontrol.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/uio.h> #include <sys/param.h> @@ -935,7 +939,7 @@ pr_control32(int32_t cmd, arg32_t *argp, prnode_t *pnp, cred_t *cr) case PCREAD: /* read from the address space */ case PCWRITE: /* write to the address space */ - if (PROCESS_NOT_32BIT(p)) + if (PROCESS_NOT_32BIT(p) || (pnp->pr_flags & PR_OFFMAX)) error = EOVERFLOW; else { enum uio_rw rw = (cmd == PCREAD)? UIO_READ : UIO_WRITE; diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h index 1294421f9f..ce925778f2 100644 --- a/usr/src/uts/common/fs/proc/prdata.h +++ b/usr/src/uts/common/fs/proc/prdata.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ @@ -183,6 +187,7 @@ typedef struct prnode { #define PR_INVAL 0x01 /* vnode is invalidated */ #define PR_ISSELF 0x02 /* vnode is a self-open */ #define PR_AOUT 0x04 /* vnode is for an a.out path */ +#define PR_OFFMAX 0x08 /* vnode is a large file open */ /* * Conversion macros. 
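The port_fop changes above introduce a FILE_TRUNC file event, raised when a setattr changes AT_SIZE and as part of the VE_CREATE notification. Assuming the new flag is exported to applications through <port.h>, a minimal event-port consumer watching for truncation could look like the following; this is a usage sketch, not code from this change:

	#include <port.h>
	#include <stdio.h>
	#include <string.h>

	int
	main(int argc, char **argv)
	{
		struct file_obj fobj;
		port_event_t pe;
		int port;

		if (argc != 2) {
			(void) fprintf(stderr, "usage: %s file\n", argv[0]);
			return (1);
		}

		if ((port = port_create()) < 0) {
			perror("port_create");
			return (1);
		}

		(void) memset(&fobj, 0, sizeof (fobj));
		fobj.fo_name = argv[1];

		if (port_associate(port, PORT_SOURCE_FILE, (uintptr_t)&fobj,
		    FILE_MODIFIED | FILE_ATTRIB | FILE_TRUNC, NULL) != 0) {
			perror("port_associate");
			return (1);
		}

		if (port_get(port, &pe, NULL) == 0 &&
		    (pe.portev_events & FILE_TRUNC) != 0)
			(void) printf("%s was truncated\n", argv[1]);

		return (0);
	}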
diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index a3e95a60fc..7831c1f9ea 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -23,6 +23,10 @@ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ @@ -337,6 +341,15 @@ propen(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) } /* + * If this is a large file open, indicate that in our flags -- some + * procfs structures are not off_t-neutral (e.g., priovec_t), and + * the open will need to be differentiated where 32-bit processes + * pass these structures across the user/kernel boundary. + */ + if (flag & FOFFMAX) + pnp->pr_flags |= PR_OFFMAX; + + /* * Do file-specific things. */ switch (type) { diff --git a/usr/src/uts/common/fs/swapfs/swap_subr.c b/usr/src/uts/common/fs/swapfs/swap_subr.c index 74c4302da9..a4d983665b 100644 --- a/usr/src/uts/common/fs/swapfs/swap_subr.c +++ b/usr/src/uts/common/fs/swapfs/swap_subr.c @@ -110,9 +110,11 @@ swapfs_recalc(pgcnt_t pgs) * memory that can be used as swap space should do so by * setting swapfs_desfree at boot time, not swapfs_minfree. * However, swapfs_minfree is tunable by install as a - * workaround for bugid 1147463. + * workaround for bugid 1147463. Note swapfs_minfree is set + * to 1/8th of memory, but clamped at the limit of 256 MB. */ - new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3); + new_swapfs_minfree = MIN(MAX(btopr(2 * 1024 * 1024), pgs >> 3), + btopr(256 * 1024 * 1024)); } /* diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c index f8a36a528f..f22cc3ecf0 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -76,7 +77,7 @@ static vfsdef_t vfw = { VFSDEF_VERSION, "tmpfs", tmpfsinit, - VSW_HASPROTO|VSW_STATS|VSW_ZMOUNT, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, &tmpfs_proto_opttbl }; @@ -249,7 +250,7 @@ tmp_mount( return (ENOTDIR); mutex_enter(&mvp->v_lock); - if ((uap->flags & MS_OVERLAY) == 0 && + if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (EBUSY); @@ -286,6 +287,21 @@ tmp_mount( (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn)) goto out; + if (uap->flags & MS_REMOUNT) { + tm = (struct tmount *)VFSTOTM(vfsp); + + /* + * If we change the size so its less than what is currently + * being used, we allow that. The file system will simply be + * full until enough files have been removed to get below the + * new max. + */ + mutex_enter(&tm->tm_contents); + tm->tm_anonmax = anonmax; + mutex_exit(&tm->tm_contents); + goto out; + } + if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) { pn_free(&dpn); error = ENOMEM; diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index 61d72a4015..461016aa52 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + #include <sys/types.h> #include <sys/param.h> #include <sys/t_lock.h> @@ -978,6 +982,8 @@ again: } if (error == 0) { /* name found */ + boolean_t trunc = B_FALSE; + ASSERT(oldtp); rw_enter(&oldtp->tn_rwlock, RW_WRITER); @@ -1005,6 +1011,7 @@ again: rw_enter(&oldtp->tn_contents, RW_WRITER); (void) tmpnode_trunc(tm, oldtp, 0); rw_exit(&oldtp->tn_contents); + trunc = B_TRUE; } rw_exit(&oldtp->tn_rwlock); if (IS_DEVVP(*vpp)) { @@ -1019,9 +1026,9 @@ again: *vpp = newvp; } - if (error == 0) { + if (trunc) vnevent_create(*vpp, ct); - } + return (0); } diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c index 83c53d859d..8d5c741428 100644 --- a/usr/src/uts/common/fs/vfs.c +++ b/usr/src/uts/common/fs/vfs.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -1129,6 +1130,7 @@ domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, struct pathname pn, rpn; vsk_anchor_t *vskap; char fstname[FSTYPSZ]; + zone_t *zone; /* * The v_flag value for the mount point vp is permanently set @@ -1590,9 +1592,24 @@ domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, } /* - * Serialize with zone creations. + * Serialize with zone state transitions. + * See vfs_list_add; zone mounted into is: + * zone_find_by_path(refstr_value(vfsp->vfs_mntpt)) + * not the zone doing the mount (curproc->p_zone), but if we're already + * inside a NGZ, then we know what zone we are. */ - mount_in_progress(); + if (INGLOBALZONE(curproc)) { + zone = zone_find_by_path(mountpt); + ASSERT(zone != NULL); + } else { + zone = curproc->p_zone; + /* + * zone_find_by_path does a hold, so do one here too so that + * we can do a zone_rele after mount_completed. + */ + zone_hold(zone); + } + mount_in_progress(zone); /* * Instantiate (or reinstantiate) the file system. If appropriate, * splice it into the file system name space. @@ -1761,7 +1778,8 @@ domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, vfs_unlock(vfsp); } - mount_completed(); + mount_completed(zone); + zone_rele(zone); if (splice) vn_vfsunlock(vp); @@ -3881,6 +3899,8 @@ vfs_to_modname(const char *vfstype) vfstype = "fdfs"; } else if (strncmp(vfstype, "nfs", 3) == 0) { vfstype = "nfs"; + } else if (strcmp(vfstype, "lxproc") == 0) { + vfstype = "lxprocfs"; } return (vfstype); diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 382369c7fc..67f21866ec 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -65,6 +66,7 @@ #include <fs/fs_subr.h> #include <sys/taskq.h> #include <fs/fs_reparse.h> +#include <sys/time.h> /* Determine if this vnode is a file that is read-only */ #define ISROFILE(vp) \ @@ -199,6 +201,11 @@ static void (**vsd_destructor)(void *); cr = crgetmapped(cr); \ } +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * Convert stat(2) formats to vnode types and vice versa. (Knows about * numerical order of S_IFMT and vnode types.) 
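The VOP_LATENCY_* constants just defined feed the per-zone VFS kstats updated in the fop_read() and fop_write() hunks below, which maintain a cumulative latency histogram: an operation that took 2.5 seconds increments the 10ms, 100ms and 1s counters but not the 10s one. A stand-alone sketch of the same bucketing rule, with illustrative names rather than the kernel's:

	#include <stdint.h>

	#define	LAT_10MS	10000000LL		/* ns */
	#define	LAT_100MS	100000000LL
	#define	LAT_1S		1000000000LL
	#define	LAT_10S		10000000000LL

	typedef struct lat_buckets {
		uint64_t	over_10ms;
		uint64_t	over_100ms;
		uint64_t	over_1s;
		uint64_t	over_10s;
	} lat_buckets_t;

	/* Record one operation; every threshold the latency met is bumped. */
	static void
	lat_record(lat_buckets_t *b, int64_t lat_ns)
	{
		if (lat_ns >= LAT_10MS)
			b->over_10ms++;
		if (lat_ns >= LAT_100MS)
			b->over_100ms++;
		if (lat_ns >= LAT_1S)
			b->over_1s++;
		if (lat_ns >= LAT_10S)
			b->over_10s++;
	}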
@@ -3220,14 +3227,57 @@ fop_read( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start, lat; + ssize_t len; + int err; + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, read, - read_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, read, read_bytes, len); + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += len; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3239,14 +3289,62 @@ fop_write( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start, lat; + ssize_t len; + int err; + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for VFS write operations. There's no + * actual wait queue for VFS operations. 
+ */ + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, write, - write_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, write, write_bytes, len); + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += len; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 5b0e464ac5..d8e9f26bdb 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ @@ -125,6 +126,7 @@ #include <sys/refcount.h> #include <sys/vdev.h> #include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> #ifdef _KERNEL #include <sys/vmsystm.h> #include <vm/anon.h> @@ -2146,6 +2148,16 @@ arc_reclaim_needed(void) if (availrmem < swapfs_minfree + swapfs_reserve + extra) return (1); + /* + * Check that we have enough availrmem that memory locking (e.g., via + * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum + * stores the number of pages that cannot be locked; when availrmem + * drops below pages_pp_maximum, page locking mechanisms such as + * page_pp_lock() will fail.) + */ + if (availrmem <= pages_pp_maximum) + return (1); + #if defined(__i386) /* * If we're on an i386 platform, it's possible that we'll exhaust the @@ -3059,6 +3071,14 @@ top: rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); + /* + * At this point, this read I/O has already missed in the ARC + * and will be going through to the disk. The I/O throttle + * should delay this I/O if this zone is using more than its I/O + * priority allows. 
+ */ + zfs_zone_io_throttle(ZFS_ZONE_IOP_READ); + if (*arc_flags & ARC_WAIT) return (zio_wait(rzio)); @@ -3637,9 +3657,6 @@ arc_init(void) if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) arc_meta_limit = zfs_arc_meta_limit; - if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) - arc_c_min = arc_meta_limit / 2; - if (zfs_arc_grow_retry > 0) arc_grow_retry = zfs_arc_grow_retry; diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index ac9e3b28f6..e8bf55c321 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -2721,7 +2721,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dt.dl.dr_copies); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { - ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); + ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || + zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, NULL, db->db.db_size, &zp, dbuf_write_nofill_ready, dbuf_write_nofill_done, db, diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index fa49735c87..e76074bf8d 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011, Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ @@ -964,6 +965,7 @@ xuio_stat_wbuf_nocopy() } #ifdef _KERNEL + int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { @@ -1576,7 +1578,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) if (wp & WP_NOFILL) { ASSERT(!ismd && level == 0); - checksum = ZIO_CHECKSUM_OFF; + checksum = ZIO_CHECKSUM_NOPARITY; compress = ZIO_COMPRESS_OFF; dedup = B_FALSE; } diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index e44786f163..a9308b0c08 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -38,11 +38,11 @@ #include <sys/sa_impl.h> #include <sys/zfs_context.h> #include <sys/varargs.h> +#include <sys/zfs_zone.h> typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); - dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { @@ -222,6 +222,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) if (len == 0) return; + zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE); + min_bs = SPA_MINBLOCKSHIFT; max_bs = SPA_MAXBLOCKSHIFT; min_ibs = DN_MIN_INDBLKSHIFT; diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index df3f02b1df..e7e11dc296 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -37,6 +37,7 @@ #include <sys/zio.h> #include <sys/arc.h> #include <sys/sunddi.h> +#include <sys/zfs_zone.h> #include "zfs_namecheck.h" static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); @@ -833,7 +834,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx); } else { if (err == EAGAIN) { - txg_delay(dd->dd_pool, tx->tx_txg, 1); + txg_delay(dd->dd_pool, tx->tx_txg, + zfs_zone_txg_delay()); err = ERESTART; } dsl_pool_memory_pressure(dd->dd_pool); diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 8ab6655b6f..02ce0d15c3 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -40,6 +40,7 @@ 
#include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/zfs_zone.h> #include <sys/bptree.h> #include <sys/zfeature.h> #include <sys/zil_impl.h> @@ -610,11 +611,11 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) /* * If this transaction group is over 7/8ths capacity, delay - * the caller 1 clock tick. This will slow down the "fill" - * rate until the sync process can catch up with us. + * the caller some number of clock ticks. This will slow down the + * "fill" rate until the sync process can catch up with us. */ if (reserved && reserved > (write_limit - (write_limit >> 3))) - txg_delay(dp, tx->tx_txg, 1); + txg_delay(dp, tx->tx_txg, zfs_zone_txg_delay()); return (0); } diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h index b748571ea0..ffca0a7dcb 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h @@ -21,13 +21,12 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #ifndef _SYS_VDEV_DISK_H #define _SYS_VDEV_DISK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/vdev.h> #ifdef _KERNEL #include <sys/buf.h> @@ -40,14 +39,22 @@ extern "C" { #endif +#ifdef _KERNEL typedef struct vdev_disk { ddi_devid_t vd_devid; char *vd_minor; ldi_handle_t vd_lh; } vdev_disk_t; +#endif +extern int vdev_disk_physio(vdev_t *, caddr_t, size_t, uint64_t, int); + +/* + * Since vdev_disk.c is not compiled into libzpool, this function should only be + * defined in the zfs kernel module. + */ #ifdef _KERNEL -extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); +extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); #endif #ifdef __cplusplus } diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 95b8f9bdaf..e4c02bde1d 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -104,6 +104,7 @@ struct vdev_queue { avl_tree_t vq_read_tree; avl_tree_t vq_write_tree; avl_tree_t vq_pending_tree; + zoneid_t vq_last_zone_id; uint64_t vq_io_complete_ts; uint64_t vq_io_delta_ts; kmutex_t vq_lock; diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h new file mode 100644 index 0000000000..496b718bd6 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_VDEV_RAIDZ_H +#define _SYS_VDEV_RAIDZ_H + +#include <sys/vdev.h> +#include <sys/semaphore.h> +#include <sys/buf.h> +#ifdef _KERNEL +#include <sys/ddi.h> +#include <sys/sunldi.h> +#include <sys/sunddi.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL +extern int vdev_raidz_physio(vdev_t *, + caddr_t, size_t, uint64_t, uint64_t, boolean_t); +#endif +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_RAIDZ_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h new file mode 100644 index 0000000000..069ec004f3 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h @@ -0,0 +1,62 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_ZONE_H +#define _SYS_FS_ZFS_ZONE_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_ZONE_IOP_READ = 0, + ZFS_ZONE_IOP_WRITE, + ZFS_ZONE_IOP_LOGICAL_WRITE, +} zfs_zone_iop_type_t; + +extern void zfs_zone_io_throttle(zfs_zone_iop_type_t); + +extern void zfs_zone_zio_init(zio_t *); +extern void zfs_zone_zio_start(zio_t *); +extern void zfs_zone_zio_done(zio_t *); +extern void zfs_zone_zio_dequeue(zio_t *); +extern void zfs_zone_zio_enqueue(zio_t *); +extern void zfs_zone_report_txg_sync(void *); +extern int zfs_zone_txg_delay(); +#ifdef _KERNEL +extern zio_t *zfs_zone_schedule(vdev_queue_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZONE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 27ebe5e659..9c718f691a 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -22,6 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2011 Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ @@ -78,6 +79,8 @@ enum zio_checksum { ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_ZILOG2, + ZIO_CHECKSUM_SHA256_MAC, + ZIO_CHECKSUM_NOPARITY, ZIO_CHECKSUM_FUNCTIONS }; @@ -430,6 +433,9 @@ struct zio { zio_cksum_report_t *io_cksum_report; uint64_t io_ena; + zoneid_t io_zoneid; /* zone which originated this I/O */ + hrtime_t io_start; /* time I/O entered zio pipeline */ + hrtime_t io_dispatched; /* time I/O was dispatched to disk */ /* Taskq dispatching state */ taskq_ent_t io_tqent; }; diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 91a639a648..17beaea3ad 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -31,6 +31,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_scan.h> #include <sys/callb.h> +#include <sys/zfs_zone.h> /* * Pool-wide transaction groups. @@ -412,6 +413,8 @@ txg_sync_thread(dsl_pool_t *dp) txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); + zfs_zone_report_txg_sync(dp); + start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index 655728ccde..dfadeca9d4 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -21,9 +21,11 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/zfs_context.h> +#include <sys/zfs_zone.h> #include <sys/spa_impl.h> #include <sys/refcount.h> #include <sys/vdev_disk.h> @@ -362,8 +364,25 @@ vdev_disk_close(vdev_t *vd) } int -vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size, - uint64_t offset, int flags) +vdev_disk_physio(vdev_t *vd, caddr_t data, + size_t size, uint64_t offset, int flags) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (dvd == NULL) + return (EIO); + + ASSERT(vd->vdev_ops == &vdev_disk_ops); + return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags)); +} + +int +vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, + size_t size, uint64_t offset, int flags) { buf_t *bp; int error = 0; @@ -516,6 +535,8 @@ vdev_disk_io_start(zio_t *zio) bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; + zfs_zone_zio_start(zio); + /* ldi_strategy() will return non-zero only on programming errors */ VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); @@ -527,6 +548,8 @@ vdev_disk_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; + zfs_zone_zio_done(zio); + /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an @@ -611,7 +634,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) /* read vdev label */ offset = vdev_label_offset(size, l, 0); - if (vdev_disk_physio(vd_lh, (caddr_t)label, + if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) continue; diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 2b06040c51..8dec283fee 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ /* @@ -31,6 +32,7 @@ #include <sys/vdev_impl.h> #include <sys/zio.h> #include <sys/avl.h> +#include <sys/zfs_zone.h> /* * These tunables are for performance analysis. @@ -124,6 +126,8 @@ vdev_queue_init(vdev_t *vd) avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); + + vq->vq_last_zone_id = 0; } void @@ -143,6 +147,7 @@ static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { avl_add(&vq->vq_deadline_tree, zio); + zfs_zone_zio_enqueue(zio); avl_add(zio->io_vdev_tree, zio); } @@ -150,6 +155,7 @@ static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { avl_remove(&vq->vq_deadline_tree, zio); + zfs_zone_zio_dequeue(zio); avl_remove(zio->io_vdev_tree, zio); } @@ -192,7 +198,11 @@ again: avl_numnodes(&vq->vq_deadline_tree) == 0) return (NULL); +#ifdef _KERNEL + fio = lio = zfs_zone_schedule(vq); +#else fio = lio = avl_first(&vq->vq_deadline_tree); +#endif t = fio->io_vdev_tree; flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index efae534257..49e8610542 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -22,11 +22,15 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> +#include <sys/vdev_disk.h> +#include <sys/vdev_file.h> +#include <sys/vdev_raidz.h> #include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/fs/zfs.h> @@ -153,6 +157,8 @@ typedef struct raidz_map { VDEV_RAIDZ_64MUL_2((x), mask); \ } +#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) + /* * Force reconstruction to use the general purpose method. 
*/ @@ -432,12 +438,12 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { }; static raidz_map_t * -vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, - uint64_t nparity) +vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, + uint64_t unit_shift, uint64_t dcols, uint64_t nparity) { raidz_map_t *rm; - uint64_t b = zio->io_offset >> unit_shift; - uint64_t s = zio->io_size >> unit_shift; + uint64_t b = offset >> unit_shift; + uint64_t s = size >> unit_shift; uint64_t f = b % dcols; uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; @@ -507,7 +513,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); - rm->rm_col[c].rc_data = zio->io_data; + rm->rm_col[c].rc_data = data; for (c = c + 1; c < acols; c++) rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + @@ -536,7 +542,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); - if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { + if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { devidx = rm->rm_col[0].rc_devidx; o = rm->rm_col[0].rc_offset; rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; @@ -548,8 +554,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_skipstart = 1; } - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; return (rm); } @@ -959,12 +963,9 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) * ~~ ~~ * __ __ * | 1 1 1 1 1 1 1 1 | - * | 128 64 32 16 8 4 2 1 | * | 19 205 116 29 64 16 4 1 | * | 1 0 0 0 0 0 0 0 | - * | 0 1 0 0 0 0 0 0 | - * (V|I)' = | 0 0 1 0 0 0 0 0 | - * | 0 0 0 1 0 0 0 0 | + * (V|I)' = | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | @@ -1495,6 +1496,152 @@ vdev_raidz_close(vdev_t *vd) vdev_close(vd->vdev_child[c]); } +/* + * Handle a read or write I/O to a RAID-Z dump device. + * + * The dump device is in a unique situation compared to other ZFS datasets: + * writing to this device should be as simple and fast as possible. In + * addition, durability matters much less since the dump will be extracted + * once the machine reboots. For that reason, this function eschews parity for + * performance and simplicity. The dump device uses the checksum setting + * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this + * dataset. + * + * Blocks of size 128 KB have been preallocated for this volume. I/Os less than + * 128 KB will not fill an entire block; in addition, they may not be properly + * aligned. In that case, this function uses the preallocated 128 KB block and + * omits reading or writing any "empty" portions of that block, as opposed to + * allocating a fresh appropriately-sized block. + * + * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: + * + * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) + * + * If this were a standard RAID-Z dataset, a block of at least 40 KB would be + * allocated which spans all five child vdevs. 8 KB of data would be written to + * each of four vdevs, with the fifth containing the parity bits. 
+ * + * parity data data data data + * | PP | XX | XX | XX | XX | + * ^ ^ ^ ^ ^ + * | | | | | + * 8 KB parity ------8 KB data blocks------ + * + * However, when writing to the dump device, the layout is different: + * + * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB) + * + * Unlike the normal RAID-Z case in which the block is allocated based on the + * I/O size, reads and writes here always use a 128 KB logical I/O size. If the I/O size is + * less than 128 KB, only the actual portions of data are written. In this + * example the data is written to the third data vdev since that vdev contains + * the offset [64 KB, 96 KB). + * + * parity data data data data + * | | | | XX | | + * ^ + * | + * 32 KB data block + * + * As a result, an individual I/O may not span all child vdevs; moreover, a + * small I/O may only operate on a single child vdev. + * + * Note that since there are no parity bits calculated or written, this format + * remains the same no matter how many parity bits are used in a normal RAID-Z + * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above + * would look like: + * + * parity parity parity data data data data + * | | | | | | XX | | + * ^ + * | + * 32 KB data block + */ +int +vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, + uint64_t offset, uint64_t origoffset, boolean_t doread) +{ + vdev_t *tvd = vd->vdev_top; + vdev_t *cvd; + raidz_map_t *rm; + raidz_col_t *rc; + int c, err = 0; + + uint64_t start, end, colstart, colend; + uint64_t coloffset, colsize, colskip; + + int flags = doread ? B_READ : B_WRITE; + +#ifdef _KERNEL + + /* + * Don't write past the end of the block + */ + VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE); + + start = offset; + end = start + size; + + /* + * Allocate a RAID-Z map for this block. Note that this block starts + * from the "original" offset, that is, the offset of the extent which + * contains the requisite offset of the data being read or written. + * + * Even if this I/O operation doesn't span the full block size, let's + * treat the on-disk format as if the only blocks are the complete 128 + * KB size. + */ + rm = vdev_raidz_map_alloc(data - (offset - origoffset), + SPA_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, vd->vdev_children, + vd->vdev_nparity); + + coloffset = origoffset; + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; + c++, coloffset += rc->rc_size) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + /* + * Find the start and end of this column in the RAID-Z map, + * keeping in mind that the stated size and offset of the + * operation may not fill the entire column for this vdev. + * + * If any portion of the data spans this column, issue the + * appropriate operation to the vdev. + */ + if (coloffset + rc->rc_size <= start) + continue; + if (coloffset >= end) + continue; + + colstart = MAX(coloffset, start); + colend = MIN(end, coloffset + rc->rc_size); + colsize = colend - colstart; + colskip = colstart - coloffset; + + VERIFY3U(colsize, <=, rc->rc_size); + VERIFY3U(colskip, <=, rc->rc_size); + + /* + * Note that the child vdev will have a vdev label at the start + * of its range of offsets, hence the need for + * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another + * example of why this calculation is needed.
+ */ + if ((err = vdev_disk_physio(cvd, + ((char *)rc->rc_data) + colskip, colsize, + VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, + flags)) != 0) + break; + } + + vdev_raidz_map_free(rm); +#endif /* KERNEL */ + + return (err); +} + static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { @@ -1530,9 +1677,13 @@ vdev_raidz_io_start(zio_t *zio) raidz_col_t *rc; int c, i; - rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, + rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset, + tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); if (zio->io_type == ZIO_TYPE_WRITE) { @@ -1663,6 +1814,13 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) int c, ret = 0; raidz_col_t *rc; + blkptr_t *bp = zio->io_bp; + enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + + if (checksum == ZIO_CHECKSUM_NOPARITY) + return (ret); + for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 723d516552..11120c7c4b 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -23,8 +23,8 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* @@ -606,9 +606,11 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, * Check permissions for special properties. */ switch (prop) { + case ZFS_PROP_DEDUP: + case ZFS_PROP_COMPRESSION: case ZFS_PROP_ZONED: /* - * Disallow setting of 'zoned' from within a local zone. + * Disallow setting these properties from within a local zone. 
*/ if (!INGLOBALZONE(curproc)) return (EPERM); @@ -1936,7 +1938,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc) } static int -zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os, + boolean_t cachedpropsonly) { int error = 0; nvlist_t *nv; @@ -1954,7 +1957,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) { + dmu_objset_type(os) == DMU_OST_ZVOL && + !cachedpropsonly) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); @@ -1981,13 +1985,25 @@ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os = NULL; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; - if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) return (error); - error = zfs_ioc_objset_stats_impl(zc, os); + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) + return (error); + error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly); dmu_objset_rele(os, FTAG); return (error); @@ -2201,8 +2217,21 @@ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + top: if (zc->zc_cookie == 0) (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, @@ -2251,8 +2280,10 @@ top: objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); - if (error == 0) - error = zfs_ioc_objset_stats_impl(zc, ossnap); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, + ossnap, cachedpropsonly); + } dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { @@ -2954,6 +2985,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; + int error; ASSERT(zplprops != NULL); @@ -2997,8 +3029,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); - if (norm == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + if (norm == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); @@ -3007,13 +3040,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, */ if (norm) u8 = 1; - if (u8 == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + if (u8 == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); - if (sense == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + if (sense == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); diff --git 
a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 3278a77041..c7bfbbaec4 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -1937,6 +1938,17 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); + /* + * If we're doing a forced unmount on a dataset which still has + * references and is in a zone, then we need to cleanup the zone + * reference at this point or else the zone will never be able to + * shutdown. + */ + if ((fflag & MS_FORCE) && vfsp->vfs_count > 1 && vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, ZONE_REF_VFS); + vfsp->vfs_zone = NULL; + } + return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index c5d8ad7f45..bbbd91f46d 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -26,6 +26,10 @@ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -4146,6 +4150,8 @@ top: &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); } dmu_tx_commit(tx); @@ -4656,27 +4662,6 @@ zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, return (0); } -/* - * The reason we push dirty pages as part of zfs_delmap() is so that we get a - * more accurate mtime for the associated file. Since we don't have a way of - * detecting when the data was actually modified, we have to resort to - * heuristics. If an explicit msync() is done, then we mark the mtime when the - * last page is pushed. The problem occurs when the msync() call is omitted, - * which by far the most common case: - * - * open() - * mmap() - * <modify memory> - * munmap() - * close() - * <time lapse> - * putpage() via fsflush - * - * If we wait until fsflush to come along, we can have a modification time that - * is some arbitrary point in the future. In order to prevent this in the - * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is - * torn down. - */ /* ARGSUSED */ static int zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, @@ -4688,10 +4673,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); - if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && - vn_has_cached_data(vp)) - (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); - return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c new file mode 100644 index 0000000000..08f4f38e04 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_zone.c @@ -0,0 +1,1179 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> + +#ifndef _KERNEL + +/* + * Stubs for when compiling for user-land. + */ + +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ +} + +void +zfs_zone_zio_init(zio_t *zp) +{ +} + +void +zfs_zone_zio_start(zio_t *zp) +{ +} + +void +zfs_zone_zio_done(zio_t *zp) +{ +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ +} + +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ +} + +int +zfs_zone_txg_delay() +{ + return (1); +} + +#else + +/* + * The real code. + */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/atomic.h> +#include <sys/zio.h> +#include <sys/zone.h> +#include <sys/avl.h> +#include <sys/sdt.h> +#include <sys/ddi.h> + +/* + * The zone throttle delays read and write operations from certain zones based + * on each zone's IO utilitzation. Once a cycle (defined by zfs_zone_cycle_time + * below), the delays for each zone are recalculated based on the utilization + * over the previous window. + */ +boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */ +uint16_t zfs_zone_delay_step = 5; /* amount to change delay */ +uint16_t zfs_zone_delay_ceiling = 100; /* longest possible delay */ + +hrtime_t zfs_zone_last_checked = 0; + +boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */ + +/* + * For certain workloads, one zone may be issuing primarily sequential I/O and + * another primarily random I/O. The sequential I/O will complete much more + * quickly than the random I/O, driving the average system latency for those + * operations way down. As a result, the random I/O may be throttled back, even + * though the sequential I/O should be throttled to allow the random I/O more + * access to the disk. + * + * This tunable limits the discrepancy between the read and write system + * latency. If one becomes excessively high, this tunable prevents the I/O + * throttler from exacerbating the imbalance. + */ +uint_t zfs_zone_rw_lat_limit = 10; + + +/* + * The I/O throttle will only start delaying zones when it detects disk + * utilization has reached a certain level. This tunable controls the threshold + * at which the throttle will start delaying zones. The calculation should + * correspond closely with the %b column from iostat. + */ +uint_t zfs_zone_util_threshold = 80; + +/* + * Throughout this subsystem, our timestamps are in microseconds. Our system + * average cycle is one second or 1 million microseconds. Our zone counter + * update cycle is two seconds or 2 million microseconds. We use a longer + * duration for that cycle because some ops can see a little over two seconds of + * latency when they are being starved by another zone. 
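The cycle bookkeeping described above (implemented by compute_historical_zone_cnt() further down in this file) can be exercised in isolation. The following is a rough user-land sketch, not part of the patch: the struct, names, and the hand-fed microsecond clock are illustrative; only the 2-second cycle, the halve-per-idle-cycle decay, and the 5-cycle reset are taken from the code.

#include <stdint.h>
#include <stdio.h>

#define CYCLE_USEC	2000000ULL	/* matches zfs_zone_cycle_time: 2 s */

typedef struct {
	uint64_t cycle_start;	/* start of the current cycle, usecs */
	uint64_t cycle_cnt;	/* ops observed in the current cycle */
	uint64_t avg_cnt;	/* decayed historical count */
} zio_cntr_t;

/*
 * Roll the counter forward to "now". Once a full cycle has elapsed, fold
 * the finished cycle's count into the historical value, halving it once
 * per additional idle cycle and resetting entirely after five idle cycles.
 */
static void
cntr_roll(zio_cntr_t *cp, uint64_t now)
{
	uint64_t elapsed = now - cp->cycle_start;
	int gens, i;

	if (elapsed < CYCLE_USEC)
		return;			/* still inside the current cycle */

	gens = (int)(elapsed / CYCLE_USEC);
	if (gens > 5) {
		cp->avg_cnt = 0;
	} else {
		if (cp->cycle_cnt > 1)
			cp->avg_cnt = cp->cycle_cnt;
		else
			cp->avg_cnt = cp->cycle_cnt + cp->avg_cnt / 2;
		for (i = 1; i < gens; i++)
			cp->avg_cnt /= 2;
	}
	cp->cycle_start = now;
	cp->cycle_cnt = 0;
}

int
main(void)
{
	zio_cntr_t c = { 0, 100, 0 };	/* 100 ops in the first cycle */
	uint64_t t;

	/* Three idle cycles follow; the history decays 100 -> 50 -> 25. */
	for (t = CYCLE_USEC; t <= 4 * CYCLE_USEC; t += CYCLE_USEC) {
		cntr_roll(&c, t);
		printf("t=%llus avg=%llu\n",
		    (unsigned long long)(t / 1000000),
		    (unsigned long long)c.avg_cnt);
	}
	return (0);
}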
+ */ +uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */ +uint_t zfs_zone_cycle_time = 2000000; /* 2 s */ + +uint_t zfs_zone_adjust_time = 250000; /* 250 ms */ + +typedef struct { + hrtime_t cycle_start; + int cycle_cnt; + hrtime_t cycle_lat; + hrtime_t sys_avg_lat; +} sys_lat_cycle_t; + +typedef struct { + hrtime_t zi_now; + uint_t zi_avgrlat; + uint_t zi_avgwlat; + uint64_t zi_totpri; + uint64_t zi_totutil; + int zi_active; + uint_t zi_diskutil; +} zoneio_stats_t; + +static sys_lat_cycle_t rd_lat; +static sys_lat_cycle_t wr_lat; + +/* + * Some basic disk stats to determine disk utilization. + */ +kmutex_t zfs_disk_lock; +uint_t zfs_disk_rcnt; +hrtime_t zfs_disk_rtime = 0; +hrtime_t zfs_disk_rlastupdate = 0; + +hrtime_t zfs_disk_last_rtime = 0; + +/* + * Data used to keep track of how often txg flush is running. + */ +extern int zfs_txg_timeout; +static uint_t txg_last_check; +static uint_t txg_cnt; +static uint_t txg_flush_rate; + +boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */ +/* + * Threshold for when zio scheduling should kick in. + * + * This threshold is based on 1/2 of the zfs_vdev_max_pending value for the + * number of I/Os that can be pending on a device. If there are more than a + * few ops already queued up, beyond those already issued to the vdev, then + * use scheduling to get the next zio. + */ +int zfs_zone_schedule_thresh = 5; + +/* + * Tunables for delay throttling when TxG flush is occurring. + */ +int zfs_zone_txg_throttle_scale = 2; +int zfs_zone_txg_delay_ticks = 2; + +typedef struct { + int zq_qdepth; + int zq_priority; + int zq_wt; + zoneid_t zq_zoneid; +} zone_q_bump_t; + +/* + * This uses gethrtime() but returns a value in usecs. + */ +#define GET_USEC_TIME (gethrtime() / 1000) +#define NANO_TO_MICRO(x) (x / (NANOSEC / MICROSEC)) + +/* + * Keep track of the zone's ZFS IOPs. + * + * If the number of ops is >1 then we can just use that value. However, + * if the number of ops is <2 then we might have a zone which is trying to do + * IO but is not able to get any ops through the system. We don't want to lose + * track of this zone so we factor in its decayed count into the current count. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last update + * was made. If it was more than one cycle ago, then we need to decay the + * historical count by the proper number of additional cycles in which no IO was + * performed. + * + * Return true if we actually computed a new historical count. + * If we're still within an active cycle there is nothing to do, return false. + */ +static hrtime_t +compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new zone count. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_cycle_time) + return (delta); + + /* A previous cycle is past, compute the new zone count. */ + + /* + * Figure out how many generations we have to decay the historical + * count, since multiple cycles may have elapsed since our last IO. + * We depend on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_cycle_time); + + /* If more than 5 cycles since last the IO, reset count. */ + if (gen_cnt > 5) { + cp->zone_avg_cnt = 0; + } else { + /* Update the count. 
*/ + int i; + + /* + * If the zone did more than 1 IO, just use its current count + * as the historical value, otherwise decay the historical + * count and factor that into the new historical count. We + * pick a threshold > 1 so that we don't lose track of IO due + * to int rounding. + */ + if (cp->cycle_cnt > 1) + cp->zone_avg_cnt = cp->cycle_cnt; + else + cp->zone_avg_cnt = cp->cycle_cnt + + (cp->zone_avg_cnt / 2); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->zone_avg_cnt = cp->zone_avg_cnt / 2; + } + + /* A new cycle begins. */ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + + return (0); +} + +/* + * Add IO op data to the zone. + */ +static void +add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op) +{ + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops); + zonep->zone_rd_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops); + zonep->zone_wr_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_LOGICAL_WRITE: + (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops); + zonep->zone_lwr_ops.cycle_cnt++; + break; + } +} + +/* + * Use a decaying average to keep track of the overall system latency. + * + * We want to have the recent activity heavily weighted, but if the + * activity decreases or stops, then the average should quickly decay + * down to the new value. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average. + * However, since this calculation is driven by IO activity and since IO does + * not happen + * + * at fixed intervals, we use a timestamp to see when the last update was made. + * If it was more than one cycle ago, then we need to decay the average by the + * proper number of additional cycles in which no IO was performed. + * + * Return true if we actually computed a new system average. + * If we're still within an active cycle there is nothing to do, return false. + */ +static int +compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new average. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_sys_avg_cycle) + return (0); + + /* A previous cycle is past, compute a new system average. */ + + /* + * Figure out how many generations we have to decay, since multiple + * cycles may have elapsed since our last IO. + * We count on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle); + + /* If more than 5 cycles since last the IO, reset average. */ + if (gen_cnt > 5) { + cp->sys_avg_lat = 0; + } else { + /* Update the average. */ + int i; + + cp->sys_avg_lat = + (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->sys_avg_lat = cp->sys_avg_lat / 2; + } + + /* A new cycle begins. 
*/ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + cp->cycle_lat = 0; + + return (1); +} + +static void +add_sys_iop(hrtime_t unow, int op, int lat) +{ + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_new_sys_avg(unow, &rd_lat); + rd_lat.cycle_cnt++; + rd_lat.cycle_lat += lat; + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_new_sys_avg(unow, &wr_lat); + wr_lat.cycle_cnt++; + wr_lat.cycle_lat += lat; + break; + } +} + +/* + * Get the zone IO counts. + */ +static uint_t +calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + uint_t cnt; + + if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + cnt = cp->zone_avg_cnt; + } else { + /* + * If we're less than half way through the cycle then use + * the current count plus half the historical count, otherwise + * just use the current count. + */ + if (delta < (zfs_zone_cycle_time / 2)) + cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2); + else + cnt = cp->cycle_cnt; + } + + return (cnt); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static uint_t +calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp) +{ + if (compute_new_sys_avg(unow, cp)) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + return (cp->sys_avg_lat); + } else { + /* + * We're within a cycle; weight the current activity higher + * compared to the historical data and use that. + */ + extern void __dtrace_probe_zfs__zone__calc__wt__avg(uintptr_t, + uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__calc__wt__avg( + (uintptr_t)cp->sys_avg_lat, + (uintptr_t)cp->cycle_lat, + (uintptr_t)cp->cycle_cnt); + + return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) / + (1 + (cp->cycle_cnt * 8))); + } +} + +/* + * Account for the current IOP on the zone and for the system as a whole. + * The latency parameter is in usecs. + */ +static void +add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat) +{ + /* Add op to zone */ + add_zone_iop(zonep, unow, op); + + /* Track system latency */ + if (op != ZFS_ZONE_IOP_LOGICAL_WRITE) + add_sys_iop(unow, op, lat); +} + +/* + * Calculate and return the total number of read ops, write ops and logical + * write ops for the given zone. If the zone has issued operations of any type + * return a non-zero value, otherwise return 0. + */ +static int +get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops, + uint_t *lwops) +{ + *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops); + *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops); + *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops); + + extern void __dtrace_probe_zfs__zone__io__cnt(uintptr_t, + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__io__cnt((uintptr_t)zonep->zone_id, + (uintptr_t)(*rops), (uintptr_t)*wops, (uintptr_t)*lwops); + + return (*rops | *wops | *lwops); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static void +get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat) +{ + *rlat = calc_avg_lat(unow, &rd_lat); + *wlat = calc_avg_lat(unow, &wr_lat); + + /* + * In an attempt to improve the accuracy of the throttling algorithm, + * assume that IO operations can't have zero latency. Instead, assume + * a reasonable lower bound for each operation type. If the actual + * observed latencies are non-zero, use those latency values instead. 
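The in-cycle weighting used by calc_avg_lat() above is easy to sanity-check outside the kernel. A minimal sketch with illustrative numbers (the helper name and the sample values are invented; the 8x weighting is the formula from the code):

#include <stdint.h>
#include <stdio.h>

/*
 * While a cycle is still open, recent samples are weighted 8x against the
 * decayed historical average so the estimate tracks bursts quickly.
 * Latencies are in microseconds.
 */
static uint64_t
weighted_avg(uint64_t hist_avg, uint64_t cycle_lat_sum, uint64_t cycle_cnt)
{
	return ((hist_avg + cycle_lat_sum * 8) / (1 + cycle_cnt * 8));
}

int
main(void)
{
	uint64_t hist = 500;		/* prior average: 500 us */

	/* A burst of 10 slow ops (5 ms each) pulls the estimate up fast. */
	printf("burst: %llu us\n",
	    (unsigned long long)weighted_avg(hist, 10 * 5000, 10));

	/* A quiet cycle (no samples yet) just reports the history. */
	printf("quiet: %llu us\n",
	    (unsigned long long)weighted_avg(hist, 0, 0));
	return (0);
}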
+ */ + if (*rlat == 0) + *rlat = 1000; + if (*wlat == 0) + *wlat = 1000; + + extern void __dtrace_probe_zfs__zone__sys__avg__lat(uintptr_t, + uintptr_t); + + __dtrace_probe_zfs__zone__sys__avg__lat((uintptr_t)(*rlat), + (uintptr_t)*wlat); +} + +/* + * Find disk utilization for each zone and average utilization for all active + * zones. + */ +static int +zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint_t rops, wops, lwops; + + if (zonep->zone_id == GLOBAL_ZONEID || + get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) { + zonep->zone_io_util = 0; + return (0); + } + + zonep->zone_io_util = (rops * sp->zi_avgrlat) + + (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat); + sp->zi_totutil += zonep->zone_io_util; + + if (zonep->zone_io_util > 0) { + sp->zi_active++; + sp->zi_totpri += zonep->zone_zfs_io_pri; + } + + /* + * sdt:::zfs-zone-utilization + * + * arg0: zone ID + * arg1: read operations observed during time window + * arg2: physical write operations observed during time window + * arg3: logical write ops observed during time window + * arg4: calculated utilization given read and write ops + * arg5: I/O priority assigned to this zone + */ + extern void __dtrace_probe_zfs__zone__utilization( + uint_t, uint_t, uint_t, uint_t, uint_t, uint_t); + + __dtrace_probe_zfs__zone__utilization((uint_t)(zonep->zone_id), + (uint_t)rops, (uint_t)wops, (uint_t)lwops, + (uint_t)zonep->zone_io_util, (uint_t)zonep->zone_zfs_io_pri); + + return (0); +} + +static void +zfs_zone_delay_inc(zone_t *zonep) +{ + if (zonep->zone_io_delay < zfs_zone_delay_ceiling) + zonep->zone_io_delay += zfs_zone_delay_step; +} + +static void +zfs_zone_delay_dec(zone_t *zonep) +{ + if (zonep->zone_io_delay > 0) + zonep->zone_io_delay -= zfs_zone_delay_step; +} + +/* + * For all zones "far enough" away from the average utilization, increase that + * zones delay. Otherwise, reduce its delay. + */ +static int +zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint16_t delay = zonep->zone_io_delay; + uint_t fairutil = 0; + + zonep->zone_io_util_above_avg = B_FALSE; + + /* + * Given the calculated total utilitzation for all zones, calculate the + * fair share of I/O for this zone. + */ + if (zfs_zone_priority_enable && sp->zi_totpri > 0) { + fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) / + sp->zi_totpri; + } else if (sp->zi_active > 0) { + fairutil = sp->zi_totutil / sp->zi_active; + } + + /* + * Adjust each IO's delay. If the overall delay becomes too high, avoid + * increasing beyond the ceiling value. + */ + if (zonep->zone_io_util > fairutil && + sp->zi_diskutil > zfs_zone_util_threshold) { + zonep->zone_io_util_above_avg = B_TRUE; + + if (sp->zi_active > 1) + zfs_zone_delay_inc(zonep); + } else if (zonep->zone_io_util < fairutil || sp->zi_active <= 1) { + zfs_zone_delay_dec(zonep); + } + + /* + * sdt:::zfs-zone-throttle + * + * arg0: zone ID + * arg1: old delay for this zone + * arg2: new delay for this zone + * arg3: calculated fair I/O utilization + * arg4: actual I/O utilization + */ + extern void __dtrace_probe_zfs__zone__throttle( + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__throttle( + (uintptr_t)zonep->zone_id, (uintptr_t)delay, + (uintptr_t)zonep->zone_io_delay, (uintptr_t)fairutil, + (uintptr_t)zonep->zone_io_util); + + return (0); +} + +/* + * Examine the utilization between different zones, and adjust the delay for + * each zone appropriately. 
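Putting the fair-share calculation and the delay step/ceiling together, here is a rough user-land model of one adjustment pass over two zones, in the spirit of zfs_zone_wait_adjust_delay_cb() above. The struct and driver are illustrative; the formula and the tunable defaults mirror the code.

#include <stdint.h>
#include <stdio.h>

#define DELAY_STEP	5	/* mirrors zfs_zone_delay_step */
#define DELAY_CEILING	100	/* mirrors zfs_zone_delay_ceiling */
#define UTIL_THRESHOLD	80	/* mirrors zfs_zone_util_threshold */

typedef struct {
	uint64_t io_util;	/* rops*rlat + (wops+lwops)*wlat */
	uint64_t io_pri;	/* configured zfs-io-priority */
	unsigned io_delay;	/* current per-op delay, usecs */
} zone_model_t;

/*
 * One adjustment pass: a zone whose utilization exceeds its
 * priority-weighted fair share is slowed down (only while the disk itself
 * is busy and other zones are active); otherwise its delay decays.
 */
static void
adjust(zone_model_t *z, int nzones, unsigned diskutil)
{
	uint64_t totutil = 0, totpri = 0, fair;
	int i, active = 0;

	for (i = 0; i < nzones; i++) {
		totutil += z[i].io_util;
		if (z[i].io_util > 0) {
			active++;
			totpri += z[i].io_pri;
		}
	}

	for (i = 0; i < nzones; i++) {
		fair = (totpri > 0) ? totutil * z[i].io_pri / totpri :
		    (active > 0 ? totutil / active : 0);

		if (z[i].io_util > fair && diskutil > UTIL_THRESHOLD) {
			if (active > 1 && z[i].io_delay < DELAY_CEILING)
				z[i].io_delay += DELAY_STEP;
		} else if (z[i].io_util < fair || active <= 1) {
			if (z[i].io_delay > 0)
				z[i].io_delay -= DELAY_STEP;
		}
	}
}

int
main(void)
{
	zone_model_t z[2] = {
		{ 90000, 100, 0 },	/* busy zone */
		{ 10000, 100, 0 },	/* light zone */
	};
	int pass;

	for (pass = 0; pass < 3; pass++) {
		adjust(z, 2, 95);	/* disk 95% busy */
		printf("pass %d: delays %u / %u\n", pass,
		    z[0].io_delay, z[1].io_delay);
	}
	return (0);
}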
+ */ +static void +zfs_zone_wait_adjust(hrtime_t unow) +{ + zoneio_stats_t stats; + + (void) bzero(&stats, sizeof (stats)); + + stats.zi_now = unow; + get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat); + + if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit) + stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit; + else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat) + stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit; + + if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0) + return; + + /* + * Calculate disk utilization for the most recent period. + */ + if (zfs_disk_last_rtime == 0 || unow - zfs_zone_last_checked <= 0) { + stats.zi_diskutil = 0; + } else { + stats.zi_diskutil = + ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) / + ((unow - zfs_zone_last_checked) * 1000); + } + zfs_disk_last_rtime = zfs_disk_rtime; + + /* + * sdt:::zfs-zone-stats + * + * Statistics observed over the last period: + * + * arg0: average system read latency + * arg1: average system write latency + * arg2: number of active zones + * arg3: total I/O 'utilization' for all zones + * arg4: total I/O priority of all active zones + * arg5: calculated disk utilization + */ + extern void __dtrace_probe_zfs__zone__stats( + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__stats((uintptr_t)(stats.zi_avgrlat), + (uintptr_t)(stats.zi_avgwlat), + (uintptr_t)(stats.zi_active), + (uintptr_t)(stats.zi_totutil), + (uintptr_t)(stats.zi_totpri), + (uintptr_t)(stats.zi_diskutil)); + + (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats); +} + +/* + * Callback used to calculate a zone's IO schedule priority. + * + * We scan the zones looking for ones with ops in the queue. Out of those, + * we pick the one that calculates to the highest schedule priority. + */ +static int +get_sched_pri_cb(zone_t *zonep, void *arg) +{ + int pri; + zone_q_bump_t *qbp = arg; + + extern void __dtrace_probe_zfs__zone__enqueued(uintptr_t, uintptr_t); + __dtrace_probe_zfs__zone__enqueued((uintptr_t)(zonep->zone_id), + (uintptr_t)(zonep->zone_zfs_queued)); + + if (zonep->zone_zfs_queued == 0) { + zonep->zone_zfs_weight = 0; + return (0); + } + + /* + * On each pass, increment the zone's weight. We use this as input + * to the calculation to prevent starvation. The value is reset + * each time we issue an IO for this zone so zones which haven't + * done any IO over several iterations will see their weight max + * out. + */ + if (zonep->zone_zfs_weight < 20) + zonep->zone_zfs_weight++; + + /* + * This zone's IO priority is the inverse of the number of IOs + * the zone has enqueued * zone's configured priority * weight. + * The queue depth has already been scaled by 10 to avoid problems + * with int rounding. + * + * This means that zones with fewer IOs in the queue will get + * preference unless other zone's assigned priority pulls them + * ahead. The weight is factored in to help ensure that zones + * which haven't done IO in a while aren't getting starved. + */ + pri = (qbp->zq_qdepth / zonep->zone_zfs_queued) * + zonep->zone_zfs_io_pri * zonep->zone_zfs_weight; + + /* + * If this zone has a higher priority than what we found so far, + * schedule it next. + */ + if (pri > qbp->zq_priority) { + qbp->zq_zoneid = zonep->zone_id; + qbp->zq_priority = pri; + qbp->zq_wt = zonep->zone_zfs_weight; + } + return (0); +} + +/* + * See if we need to bump a zone's zio to the head of the queue. 
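The priority formula in get_sched_pri_cb() above can be modeled the same way. In the sketch below (illustrative names and a standalone driver, not kernel code) a zone with a single queued op out-scores a zone with a deep backlog, which is exactly the interactive-workload case the surrounding comments describe.

#include <stdio.h>

typedef struct {
	int	zoneid;
	int	queued;		/* ops this zone has in the vdev queue */
	int	io_pri;		/* configured zfs-io-priority */
	int	weight;		/* anti-starvation weight, capped at 20 */
} zq_model_t;

/*
 * Pick the zone whose next zio should be bumped to the head of the queue:
 * (scaled queue depth / ops queued by the zone) * priority * weight.
 * Zones with little in the queue win unless priority or the starvation
 * weight pulls a busier zone ahead.
 */
static int
pick_zone(zq_model_t *zs, int nzones, int qdepth)
{
	int i, pri, best_pri = 0, best_zone = -1;

	for (i = 0; i < nzones; i++) {
		if (zs[i].queued == 0)
			continue;
		if (zs[i].weight < 20)
			zs[i].weight++;
		pri = (qdepth * 10 / zs[i].queued) *
		    zs[i].io_pri * zs[i].weight;
		if (pri > best_pri) {
			best_pri = pri;
			best_zone = zs[i].zoneid;
		}
	}
	return (best_zone);
}

int
main(void)
{
	zq_model_t zs[] = {
		{ 1, 40, 100, 1 },	/* parallel workload, deep backlog */
		{ 2, 1, 100, 1 },	/* interactive, single op queued */
	};

	printf("bump zone %d\n", pick_zone(zs, 2, 41));
	return (0);
}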
+ * + * For single-threaded synchronous workloads a zone cannot get more than + * 1 op into the queue at a time unless the zone is running multiple workloads + * in parallel. This can cause an imbalance in performance if there are zones + * with many parallel workloads (and ops in the queue) vs. other zones which + * are doing simple single-threaded workloads, such as interactive tasks in the + * shell. These zones can get backed up behind a deep queue and their IO + * performance will appear to be very poor as a result. This can make the + * zone work badly for interactive behavior. + * + * The scheduling algorithm kicks in once we start to get a deeper queue. + * Once that occurs, we look at all of the zones to see which one calculates + * to the highest priority. We bump that zone's first zio to the head of the + * queue. + * + * We use a counter on the zone so that we can quickly find how many ops each + * zone has in the queue without having to search the entire queue itself. + * This scales better since the number of zones is expected to be on the + * order of 10-100 whereas the queue depth can be in the range of 50-2000. + * In addition, since the zio's in the queue only have the zoneid, we would + * have to look up the zone for each zio enqueued and that means the overhead + * for scanning the queue each time would be much higher. + * + * In all cases, we fall back to simply pulling the next op off the queue + * if something should go wrong. + */ +static zio_t * +get_next_zio(vdev_queue_t *vq, int qdepth) +{ + zone_q_bump_t qbump; + zio_t *zp = NULL, *zphead; + int cnt = 0; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + /* To avoid problems with int rounding, scale the queue depth by 10 */ + qbump.zq_qdepth = qdepth * 10; + qbump.zq_priority = 0; + qbump.zq_zoneid = 0; + (void) zone_walk(get_sched_pri_cb, &qbump); + + zphead = avl_first(&vq->vq_deadline_tree); + + /* Check if the scheduler didn't pick a zone for some reason!? */ + if (qbump.zq_zoneid != 0) { + for (zp = avl_first(&vq->vq_deadline_tree); zp != NULL; + zp = avl_walk(&vq->vq_deadline_tree, zp, AVL_AFTER)) { + if (zp->io_zoneid == qbump.zq_zoneid) + break; + cnt++; + } + } + + if (zp == NULL) { + zp = zphead; + } else if (zp != zphead) { + /* + * Only fire the probe if we actually picked a different zio + * than the one already at the head of the queue. + */ + extern void __dtrace_probe_zfs__zone__sched__bump(uintptr_t, + uintptr_t, uintptr_t, uintptr_t); + __dtrace_probe_zfs__zone__sched__bump( + (uintptr_t)(zp->io_zoneid), (uintptr_t)(cnt), + (uintptr_t)(qbump.zq_priority), (uintptr_t)(qbump.zq_wt)); + } + + return (zp); +} + +/* + * Add our zone ID to the zio so we can keep track of which zones are doing + * what, even when the current thread processing the zio is not associated + * with the zone (e.g. the kernel taskq which pushes out RX groups). + */ +void +zfs_zone_zio_init(zio_t *zp) +{ + zone_t *zonep = curzone; + + zp->io_zoneid = zonep->zone_id; +} + +/* + * Track IO operations per zone. Called from dmu_tx_count_write for write ops + * and dmu_read_uio for read ops. For each operation, increment that zone's + * counter based on the type of operation. + * + * There are three basic ways that we can see write ops: + * 1) An application does write syscalls. Those ops go into a TXG which + * we'll count here. Sometime later a kernel taskq thread (we'll see the + * vdev IO as zone 0) will perform some number of physical writes to commit + * the TXG to disk. 
Those writes are not associated with the zone which + * made the write syscalls and the number of operations is not correlated + * between the taskq and the zone. + * 2) An application opens a file with O_SYNC. Each write will result in + * an operation which we'll see here plus a low-level vdev write from + * that zone. + * 3) An application does write syscalls followed by an fsync(). We'll + * count the writes going into a TXG here. We'll also see some number + * (usually much smaller, maybe only 1) of low-level vdev writes from this + * zone when the fsync is performed, plus some other low-level vdev writes + * from the taskq in zone 0 (are these metadata writes?). + * + * 4) In addition to the above, there are misc. system-level writes, such as + * writing out dirty pages to swap, or sync(2) calls, which will be handled + * by the global zone and which we count but don't generally worry about. + * + * Because of the above, we can see writes twice because this is called + * at a high level by a zone thread, but we also will count the phys. writes + * that are performed at a low level via zfs_zone_zio_start. + * + * Without this, it can look like a non-global zone never writes (case 1). + * Depending on when the TXG is flushed, the counts may be in the same sample + * bucket or in a different one. + * + * Tracking read operations is simpler due to their synchronous semantics. The + * zfs_read function -- called as a result of a read(2) syscall -- will always + * retrieve the data to be read through dmu_read_uio. + */ +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ + zone_t *zonep = curzone; + hrtime_t unow; + uint16_t wait; + + unow = GET_USEC_TIME; + + /* + * Only bump the counters for logical operations here. The counters for + * tracking physical IO operations are handled in zfs_zone_zio_done. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, type, 0); + mutex_exit(&zonep->zone_stg_io_lock); + } + + if (!zfs_zone_delay_enable) + return; + + /* + * XXX There's a potential race here in that more than one thread may + * update the zone delays concurrently. The worst outcome is corruption + * of our data to track each zone's IO, so the algorithm may make + * incorrect throttling decisions until the data is refreshed. + */ + if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) { + zfs_zone_wait_adjust(unow); + zfs_zone_last_checked = unow; + } + + if ((wait = zonep->zone_io_delay) > 0) { + /* + * If this is a write and we're doing above normal TxG + * flushing, then throttle for longer than normal. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE && + (txg_cnt > 1 || txg_flush_rate > 1)) + wait *= zfs_zone_txg_throttle_scale; + + /* + * sdt:::zfs-zone-wait + * + * arg0: zone ID + * arg1: type of IO operation + * arg2: time to delay (in us) + */ + extern void __dtrace_probe_zfs__zone__wait( + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__wait((uintptr_t)(zonep->zone_id), + (uintptr_t)type, (uintptr_t)wait); + + drv_usecwait(wait); + + if (zonep->zone_vfs_stats != NULL) { + atomic_inc_64(&zonep->zone_vfs_stats-> + zv_delay_cnt.value.ui64); + atomic_add_64(&zonep->zone_vfs_stats-> + zv_delay_time.value.ui64, wait); + } + } +} + +/* + * XXX Ignore the pool pointer parameter for now. + * + * Keep track to see if the TxG flush rate is running above the expected rate. + * If so, this implies that we are filling TxG's at a high rate due to a heavy + * write workload. 
We use this as input into the zone throttle. + * + * This function is called every 5 seconds (zfs_txg_timeout) under a normal + * write load. In this case, the flush rate is going to be 1. When there + * is a heavy write load, TxG's fill up fast and the sync thread will write + * the TxG more frequently (perhaps once a second). In this case the rate + * will be > 1. The flush rate is a lagging indicator since it can be up + * to 5 seconds old. We use the txg_cnt to keep track of the rate in the + * current 5 second interval and txg_flush_rate to keep track of the previous + * 5 second interval. In that way we don't have a period (1 or more seconds) + * where the txg_cnt == 0 and we cut back on throttling even though the rate + * is still high. + */ +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ + uint_t now; + + txg_cnt++; + now = (uint_t)(gethrtime() / NANOSEC); + if ((now - txg_last_check) >= zfs_txg_timeout) { + txg_flush_rate = txg_cnt / 2; + txg_cnt = 0; + txg_last_check = now; + } +} + +int +zfs_zone_txg_delay() +{ + zone_t *zonep = curzone; + int delay = 1; + + if (zonep->zone_io_util_above_avg) + delay = zfs_zone_txg_delay_ticks; + + extern void __dtrace_probe_zfs__zone__txg__delay(uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__txg__delay((uintptr_t)(zonep->zone_id), + (uintptr_t)delay); + + return (delay); +} + +/* + * Called from zio_vdev_io_start when an IO hits the end of the zio pipeline + * and is issued. + * Keep track of start time for latency calculation in zfs_zone_zio_done. + */ +void +zfs_zone_zio_start(zio_t *zp) +{ + zone_t *zonep; + + /* + * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for + * an actual I/O operation. Ignore those operations as they relate to + * throttling and scheduling. + */ + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_zfs_lock); + if (zp->io_type == ZIO_TYPE_READ) + kstat_runq_enter(&zonep->zone_zfs_rwstats); + zonep->zone_zfs_weight = 0; + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zp->io_dispatched = gethrtime(); + + if (zfs_disk_rcnt++ != 0) + zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = zp->io_dispatched; + mutex_exit(&zfs_disk_lock); + + zone_rele(zonep); +} + +/* + * Called from vdev_queue_io_done when an IO completes. + * Increment our counter for zone ops. + * Calculate the IO latency avg. for this zone. + */ +void +zfs_zone_zio_done(zio_t *zp) +{ + zone_t *zonep; + hrtime_t now, unow, udelta; + + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + now = gethrtime(); + unow = NANO_TO_MICRO(now); + udelta = unow - NANO_TO_MICRO(zp->io_dispatched); + + mutex_enter(&zonep->zone_zfs_lock); + + /* + * To calculate the wsvc_t average, keep a cumulative sum of all the + * wait time before each I/O was dispatched. Since most writes are + * asynchronous, only track the wait time for read I/Os. 
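The zfs_disk_rcnt/zfs_disk_rtime bookkeeping in zfs_zone_zio_start() and zfs_zone_zio_done() above amounts to a busy-time integral that is later reported as a percentage (zi_diskutil), much like iostat's %b. A user-land model follows; timestamps are hand-fed nanoseconds and the names are illustrative.

#include <stdint.h>
#include <stdio.h>

typedef struct {
	int	  rcnt;		/* I/Os currently outstanding */
	uint64_t  rtime;	/* accumulated busy time, ns */
	uint64_t  rlastupdate;	/* last time rtime was advanced */
} disk_model_t;

/* Advance the busy clock only while at least one I/O was outstanding. */
static void
io_start(disk_model_t *d, uint64_t now)
{
	if (d->rcnt++ != 0)
		d->rtime += now - d->rlastupdate;
	d->rlastupdate = now;
}

static void
io_done(disk_model_t *d, uint64_t now)
{
	d->rcnt--;
	d->rtime += now - d->rlastupdate;
	d->rlastupdate = now;
}

int
main(void)
{
	disk_model_t d = { 0, 0, 0 };
	uint64_t t0 = 0, t1 = 2000000000ULL;	/* a 2 s observation window */

	io_start(&d, 100000000ULL);		/* busy from 0.1 s .. 1.6 s */
	io_done(&d, 1600000000ULL);

	/* (busy-time delta * 100) / window length ~= %busy; prints 75%. */
	printf("disk util: %llu%%\n",
	    (unsigned long long)(d.rtime * 100 / (t1 - t0)));
	return (0);
}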
+ */ + if (zp->io_type == ZIO_TYPE_READ) { + zonep->zone_zfs_rwstats.reads++; + zonep->zone_zfs_rwstats.nread += zp->io_size; + + zonep->zone_zfs_stats->zz_waittime.value.ui64 += + zp->io_dispatched - zp->io_start; + + kstat_runq_exit(&zonep->zone_zfs_rwstats); + } else { + zonep->zone_zfs_rwstats.writes++; + zonep->zone_zfs_rwstats.nwritten += zp->io_size; + } + + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zfs_disk_rcnt--; + zfs_disk_rtime += (now - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = now; + mutex_exit(&zfs_disk_lock); + + if (zfs_zone_delay_enable) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ? + ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); + mutex_exit(&zonep->zone_stg_io_lock); + } + + zone_rele(zonep); + + /* + * sdt:::zfs-zone-latency + * + * arg0: zone ID + * arg1: type of I/O operation + * arg2: I/O latency (in us) + */ + extern void __dtrace_probe_zfs__zone__latency( + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__latency((uintptr_t)(zp->io_zoneid), + (uintptr_t)(zp->io_type), (uintptr_t)(udelta)); +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ + zone_t *zonep; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + ASSERT(zonep->zone_zfs_queued > 0); + if (zonep->zone_zfs_queued == 0) + cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); + else + zonep->zone_zfs_queued--; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ + zone_t *zonep; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + zonep->zone_zfs_queued++; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +/* + * Called from vdev_queue_io_to_issue. This function is where zio's are found + * at the head of the queue (by avl_first), then pulled off (by + * vdev_queue_io_remove) and issued. We do our scheduling here to find the + * next zio to issue. + * + * The vq->vq_lock mutex is held when we're executing this function so we + * can safely access the "last zone" variable on the queue. + */ +zio_t * +zfs_zone_schedule(vdev_queue_t *vq) +{ + int cnt; + zoneid_t last_zone; + zio_t *zp; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + cnt = avl_numnodes(&vq->vq_deadline_tree); + last_zone = vq->vq_last_zone_id; + + /* + * If there are only a few ops in the queue then just issue the head. + * If there are more than a few ops already queued up, then use + * scheduling to get the next zio. + */ + if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh) + zp = avl_first(&vq->vq_deadline_tree); + else + zp = get_next_zio(vq, cnt); + + vq->vq_last_zone_id = zp->io_zoneid; + + /* + * Probe with 3 args; the number of IOs in the queue, the zone that + * was last scheduled off this queue, and the zone that was associated + * with the next IO that is scheduled. 
+ */ + extern void __dtrace_probe_zfs__zone__sched(uintptr_t, uintptr_t, + uintptr_t); + + __dtrace_probe_zfs__zone__sched((uintptr_t)(cnt), + (uintptr_t)(last_zone), (uintptr_t)(zp->io_zoneid)); + + return (zp); +} + +#endif diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index d1bed63f30..00964aa83f 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -36,6 +36,7 @@ #include <sys/dmu_objset.h> #include <sys/arc.h> #include <sys/ddt.h> +#include <sys/zfs_zone.h> /* * ========================================================================== @@ -511,6 +512,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); + zio->io_start = gethrtime(); + mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); @@ -562,11 +565,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { + zio->io_zoneid = pio->io_zoneid; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); + } else { + zfs_zone_zio_init(zio); } return (zio); @@ -904,6 +910,8 @@ zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; + zio->io_start = gethrtime(); + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW)) { @@ -2289,6 +2297,9 @@ zio_vdev_io_start(zio_t *zio) ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); + if (zio->io_type == ZIO_TYPE_WRITE) + zio->io_start = gethrtime(); + if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c index c8fe20f2eb..7af4644cbf 100644 --- a/usr/src/uts/common/fs/zfs/zio_checksum.c +++ b/usr/src/uts/common/fs/zfs/zio_checksum.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/zfs_context.h> @@ -66,6 +67,13 @@ zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } +/* + * The sha256_mac checksum algorithm was added to try to maintain on-disk + * compatibility with ZFS on other platforms. That effort didn't work for other + * reasons. As a result, the sha256_mac algorithm is unused except in the rare + * case of an older platform interpreting noparity as sha256_mac -- which is why + * they both are no-ops. 
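The zio_checksum_table rows added below map both new names to zio_checksum_off. The shape of such a table, an enum-indexed array of function pointers where several names deliberately share a no-op, can be shown with a toy standalone example; the enum values and the byte-sum function here are invented for illustration only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef void (*cksum_fn_t)(const void *, size_t, uint64_t *);

/* "No parity": nothing is computed or maintained. */
static void
cksum_off(const void *buf, size_t len, uint64_t *out)
{
	(void) buf;
	(void) len;
	*out = 0;
}

/* A trivial stand-in for a real checksum. */
static void
cksum_sum64(const void *buf, size_t len, uint64_t *out)
{
	const uint8_t *p = buf;
	uint64_t sum = 0;
	size_t i;

	for (i = 0; i < len; i++)
		sum += p[i];
	*out = sum;
}

enum { CK_OFF, CK_SUM64, CK_NOPARITY, CK_NFUNCS };

static const struct {
	cksum_fn_t	fn;
	const char	*name;
} cksum_table[CK_NFUNCS] = {
	[CK_OFF]	= { cksum_off,	 "off" },
	[CK_SUM64]	= { cksum_sum64, "sum64" },
	[CK_NOPARITY]	= { cksum_off,	 "noparity" },	/* same no-op */
};

int
main(void)
{
	const char *data = "dump device payload";
	uint64_t c;
	int i;

	for (i = 0; i < CK_NFUNCS; i++) {
		cksum_table[i].fn(data, strlen(data), &c);
		printf("%-8s -> %llu\n", cksum_table[i].name,
		    (unsigned long long)c);
	}
	return (0);
}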
+ */ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, 0, 0, 0, "inherit"}, {{NULL, NULL}, 0, 0, 0, "on"}, @@ -77,6 +85,8 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"}, {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, + {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "sha256_mac"}, + {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "noparity"}, }; enum zio_checksum diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index fa26629c6e..ef96b1c401 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -77,9 +77,11 @@ #include <sys/zfs_rlock.h> #include <sys/vdev_disk.h> #include <sys/vdev_impl.h> +#include <sys/vdev_raidz.h> #include <sys/zvol.h> #include <sys/dumphdr.h> #include <sys/zil_impl.h> +#include <sys/sdt.h> #include "zfs_namecheck.h" @@ -1070,27 +1072,28 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, } static int -zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size, - boolean_t doread, boolean_t isdump) +zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset, + uint64_t size, boolean_t doread, boolean_t isdump) { vdev_disk_t *dvd; int c; int numerrors = 0; - for (c = 0; c < vd->vdev_children; c++) { - ASSERT(vd->vdev_ops == &vdev_mirror_ops || - vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops); - int err = zvol_dumpio_vdev(vd->vdev_child[c], - addr, offset, size, doread, isdump); - if (err != 0) { - numerrors++; - } else if (doread) { - break; + if (vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops) { + for (c = 0; c < vd->vdev_children; c++) { + int err = zvol_dumpio_vdev(vd->vdev_child[c], + addr, offset, origoffset, size, doread, isdump); + if (err != 0) { + numerrors++; + } else if (doread) { + break; + } } } - if (!vd->vdev_ops->vdev_op_leaf) + if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops) return (numerrors < vd->vdev_children ? 0 : EIO); if (doread && !vdev_readable(vd)) @@ -1098,19 +1101,27 @@ zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size, else if (!doread && !vdev_writeable(vd)) return (EIO); - dvd = vd->vdev_tsd; - ASSERT3P(dvd, !=, NULL); + if (vd->vdev_ops == &vdev_raidz_ops) { + return (vdev_raidz_physio(vd, + addr, size, offset, origoffset, doread)); + } + offset += VDEV_LABEL_START_SIZE; if (ddi_in_panic() || isdump) { ASSERT(!doread); if (doread) return (EIO); + dvd = vd->vdev_tsd; + ASSERT3P(dvd, !=, NULL); return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset), lbtodb(size))); } else { - return (vdev_disk_physio(dvd->vd_lh, addr, size, offset, - doread ? B_READ : B_WRITE)); + dvd = vd->vdev_tsd; + ASSERT3P(dvd, !=, NULL); + + return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size, + offset, doread ? 
B_READ : B_WRITE)); } } @@ -1142,7 +1153,8 @@ zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size, vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva)); offset += DVA_GET_OFFSET(&ze->ze_dva); - error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump); + error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva), + size, doread, isdump); if (!ddi_in_panic()) spa_config_exit(spa, SCL_STATE, FTAG); @@ -1333,6 +1345,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0); + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1351,6 +1365,10 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) } } zfs_range_unlock(rl); + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int, + error); + return (error); } @@ -1380,6 +1398,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1); + sync = !(zv->zv_flags & ZVOL_WCE) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); @@ -1410,6 +1430,10 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) zfs_range_unlock(rl); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int, + error); + return (error); } @@ -1863,7 +1887,7 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) ZIO_COMPRESS_OFF) == 0); VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_CHECKSUM), - ZIO_CHECKSUM_OFF) == 0); + ZIO_CHECKSUM_NOPARITY) == 0); if (version >= SPA_VERSION_DEDUP) { VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_DEDUP), diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index 42adb4c451..bd50364310 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 1990 Mentat Inc. */ @@ -2195,6 +2196,8 @@ struct ip_xmit_attr_s { */ ixa_notify_t ixa_notify; /* Registered upcall notify function */ void *ixa_notify_cookie; /* ULP cookie for ixa_notify */ + + uint_t ixa_tcpcleanup; /* Used by conn_ixa_cleanup */ }; /* @@ -2266,6 +2269,14 @@ struct ip_xmit_attr_s { #define IXA_FREE_TSL 0x00000002 /* ixa_tsl needs to be rele */ /* + * Trivial state machine used to synchronize IXA cleanup for TCP connections. + * See conn_ixa_cleanup(). + */ +#define IXATC_IDLE 0x00000000 +#define IXATC_INPROGRESS 0x00000001 +#define IXATC_COMPLETE 0x00000002 + +/* * Simplistic way to set the ixa_xmit_hint for locally generated traffic * and forwarded traffic. 
The shift amount are based on the size of the * structs to discard the low order bits which don't have much if any variation @@ -3030,6 +3041,7 @@ extern vmem_t *ip_minor_arena_la; #define ips_ip_strict_src_multihoming ips_propinfo_tbl[80].prop_cur_uval #define ips_ipv6_strict_src_multihoming ips_propinfo_tbl[81].prop_cur_uval #define ips_ipv6_drop_inbound_icmpv6 ips_propinfo_tbl[82].prop_cur_bval +#define ips_ip_dce_reclaim_threshold ips_propinfo_tbl[83].prop_cur_uval extern int dohwcksum; /* use h/w cksum if supported by the h/w */ #ifdef ZC_TEST diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c index 3197858f8e..e040af14ba 100644 --- a/usr/src/uts/common/inet/ip/ip_attr.c +++ b/usr/src/uts/common/inet/ip/ip_attr.c @@ -909,6 +909,11 @@ ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa) */ if (ixa->ixa_free_flags & IXA_FREE_CRED) crhold(ixa->ixa_cred); + + /* + * There is no cleanup in progress on this new copy. + */ + ixa->ixa_tcpcleanup = IXATC_IDLE; } /* @@ -1176,6 +1181,59 @@ ixa_cleanup_stale(ip_xmit_attr_t *ixa) } } +static mblk_t * +tcp_ixa_cleanup_getmblk(conn_t *connp) +{ + tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp; + int need_retry; + mblk_t *mp; + + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + + /* + * It's possible that someone else came in and started cleaning up + * another connection between the time we verified this one is not being + * cleaned up and the time we actually get the shared mblk. If that's + * the case, we've dropped the lock, and some other thread may have + * cleaned up this connection again, and is still waiting for + * notification of that cleanup's completion. Therefore we need to + * recheck. + */ + do { + need_retry = 0; + while (connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE) { + cv_wait(&tcps->tcps_ixa_cleanup_done_cv, + &tcps->tcps_ixa_cleanup_lock); + } + + while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) { + /* + * Multiple concurrent cleanups; need to have the last + * one run since it could be an unplumb. + */ + need_retry = 1; + cv_wait(&tcps->tcps_ixa_cleanup_ready_cv, + &tcps->tcps_ixa_cleanup_lock); + } + } while (need_retry); + + /* + * We now have the lock and the mblk; now make sure that no one else can + * try to clean up this connection or enqueue it for cleanup, clear the + * mblk pointer for this stack, drop the lock, and return the mblk. + */ + ASSERT(MUTEX_HELD(&tcps->tcps_ixa_cleanup_lock)); + ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_IDLE); + ASSERT(tcps->tcps_ixa_cleanup_mp == mp); + ASSERT(mp != NULL); + + connp->conn_ixa->ixa_tcpcleanup = IXATC_INPROGRESS; + tcps->tcps_ixa_cleanup_mp = NULL; + mutex_exit(&tcps->tcps_ixa_cleanup_lock); + + return (mp); +} + /* * Used to run ixa_cleanup_stale inside the tcp squeue. * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp @@ -1195,11 +1253,39 @@ tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2, mutex_enter(&tcps->tcps_ixa_cleanup_lock); ASSERT(tcps->tcps_ixa_cleanup_mp == NULL); + connp->conn_ixa->ixa_tcpcleanup = IXATC_COMPLETE; tcps->tcps_ixa_cleanup_mp = mp; - cv_signal(&tcps->tcps_ixa_cleanup_cv); + cv_signal(&tcps->tcps_ixa_cleanup_ready_cv); + /* + * It is possible for any number of threads to be waiting for cleanup of + * different connections. Absent a per-connection (or per-IXA) CV, we + * need to wake them all up even though only one can be waiting on this + * particular cleanup. 
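The IXATC handshake above amounts to a small monitor built from one mutex and two condition variables: "ready" gates the single shared work token, and "done" wakes every waiter when any cleanup finishes. A user-land pthread sketch of the same IDLE -> INPROGRESS -> COMPLETE cycle (single waiter, illustrative names, not the kernel code):

#include <pthread.h>
#include <stdio.h>

enum { IDLE, INPROGRESS, COMPLETE };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t ready_cv = PTHREAD_COND_INITIALIZER;
static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;
static int token_available = 1;	/* stands in for tcps_ixa_cleanup_mp */
static int conn_state = IDLE;	/* stands in for ixa_tcpcleanup */

/* The cleanup itself runs elsewhere (the squeue, in the patch). */
static void *
cleanup_worker(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&lock);
	conn_state = COMPLETE;		/* cleanup has run */
	token_available = 1;		/* hand the token back */
	pthread_cond_signal(&ready_cv);
	pthread_cond_broadcast(&done_cv);
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	/* Acquire the token; mirrors tcp_ixa_cleanup_getmblk(). */
	pthread_mutex_lock(&lock);
	while (conn_state != IDLE)
		pthread_cond_wait(&done_cv, &lock);
	while (!token_available)
		pthread_cond_wait(&ready_cv, &lock);
	token_available = 0;
	conn_state = INPROGRESS;
	pthread_mutex_unlock(&lock);

	pthread_create(&tid, NULL, cleanup_worker, NULL);

	/* Wait and finish; mirrors tcp_ixa_cleanup_wait_and_finish(). */
	pthread_mutex_lock(&lock);
	while (conn_state != COMPLETE)
		pthread_cond_wait(&done_cv, &lock);
	conn_state = IDLE;
	pthread_cond_broadcast(&done_cv);
	pthread_mutex_unlock(&lock);

	pthread_join(tid, NULL);
	printf("cleanup handshake complete\n");
	return (0);
}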
+ */ + cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv); mutex_exit(&tcps->tcps_ixa_cleanup_lock); } +static void +tcp_ixa_cleanup_wait_and_finish(conn_t *connp) +{ + tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp; + + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + + ASSERT(connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE); + + while (connp->conn_ixa->ixa_tcpcleanup == IXATC_INPROGRESS) { + cv_wait(&tcps->tcps_ixa_cleanup_done_cv, + &tcps->tcps_ixa_cleanup_lock); + } + + ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_COMPLETE); + connp->conn_ixa->ixa_tcpcleanup = IXATC_IDLE; + cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv); + + mutex_exit(&tcps->tcps_ixa_cleanup_lock); +} /* * ipcl_walk() function to help release any IRE, NCE, or DCEs that @@ -1214,21 +1300,8 @@ conn_ixa_cleanup(conn_t *connp, void *arg) if (IPCL_IS_TCP(connp)) { mblk_t *mp; - tcp_stack_t *tcps; - - tcps = connp->conn_netstack->netstack_tcp; - mutex_enter(&tcps->tcps_ixa_cleanup_lock); - while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) { - /* - * Multiple concurrent cleanups; need to have the last - * one run since it could be an unplumb. - */ - cv_wait(&tcps->tcps_ixa_cleanup_cv, - &tcps->tcps_ixa_cleanup_lock); - } - tcps->tcps_ixa_cleanup_mp = NULL; - mutex_exit(&tcps->tcps_ixa_cleanup_lock); + mp = tcp_ixa_cleanup_getmblk(connp); if (connp->conn_sqp->sq_run == curthread) { /* Already on squeue */ @@ -1237,15 +1310,8 @@ conn_ixa_cleanup(conn_t *connp, void *arg) CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup, connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP); - - /* Wait until tcp_ixa_cleanup has run */ - mutex_enter(&tcps->tcps_ixa_cleanup_lock); - while (tcps->tcps_ixa_cleanup_mp == NULL) { - cv_wait(&tcps->tcps_ixa_cleanup_cv, - &tcps->tcps_ixa_cleanup_lock); - } - mutex_exit(&tcps->tcps_ixa_cleanup_lock); } + tcp_ixa_cleanup_wait_and_finish(connp); } else if (IPCL_IS_SCTP(connp)) { sctp_t *sctp; sctp_faddr_t *fp; diff --git a/usr/src/uts/common/inet/ip/ip_dce.c b/usr/src/uts/common/inet/ip/ip_dce.c index 215bc4675f..502ee8a735 100644 --- a/usr/src/uts/common/inet/ip/ip_dce.c +++ b/usr/src/uts/common/inet/ip/ip_dce.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -32,6 +33,7 @@ #include <sys/cmn_err.h> #include <sys/debug.h> #include <sys/atomic.h> +#include <sys/callb.h> #define _SUN_TPI_VERSION 2 #include <sys/tihdr.h> @@ -102,7 +104,19 @@ static void dce_delete_locked(dcb_t *, dce_t *); static void dce_make_condemned(dce_t *); static kmem_cache_t *dce_cache; +static kthread_t *dce_reclaim_thread; +static kmutex_t dce_reclaim_lock; +static kcondvar_t dce_reclaim_cv; +static int dce_reclaim_shutdown; +/* Global so it can be tuned in /etc/system. This must be a power of two. */ +uint_t ip_dce_hash_size = 1024; + +/* The time in seconds between executions of the IP DCE reclaim worker. 
*/ +uint_t ip_dce_reclaim_interval = 60; + +/* The factor of the DCE threshold at which to start hard reclaims */ +uint_t ip_dce_reclaim_threshold_hard = 2; /* Operates on a uint64_t */ #define RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48)) @@ -117,6 +131,11 @@ dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction) uint_t fraction_pmtu = fraction*4; uint_t hash; dce_t *dce, *nextdce; + hrtime_t seed = gethrtime(); + uint_t retained = 0; + uint_t max = ipst->ips_ip_dce_reclaim_threshold; + + max *= ip_dce_reclaim_threshold_hard; rw_enter(&dcb->dcb_lock, RW_WRITER); for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) { @@ -132,13 +151,21 @@ dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction) } else { mutex_exit(&dce->dce_lock); } - hash = RANDOM_HASH((uint64_t)(uintptr_t)dce); - if (dce->dce_flags & DCEF_PMTU) { - if (hash % fraction_pmtu != 0) - continue; - } else { - if (hash % fraction != 0) - continue; + + if (max == 0 || retained < max) { + hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed)); + + if (dce->dce_flags & DCEF_PMTU) { + if (hash % fraction_pmtu != 0) { + retained++; + continue; + } + } else { + if (hash % fraction != 0) { + retained++; + continue; + } + } } IP_STAT(ipst, ip_dce_reclaim_deleted); @@ -175,17 +202,19 @@ ip_dce_reclaim_stack(ip_stack_t *ipst) } /* - * Called by the memory allocator subsystem directly, when the system - * is running low on memory. + * Called by dce_reclaim_worker() below, and no one else. Typically this will + * mean that the number of entries in the hash buckets has exceeded a tunable + * threshold. */ -/* ARGSUSED */ -void -ip_dce_reclaim(void *args) +static void +ip_dce_reclaim(void) { netstack_handle_t nh; netstack_t *ns; ip_stack_t *ipst; + ASSERT(curthread == dce_reclaim_thread); + netstack_next_init(&nh); while ((ns = netstack_next(&nh)) != NULL) { /* @@ -196,26 +225,75 @@ ip_dce_reclaim(void *args) netstack_rele(ns); continue; } - ip_dce_reclaim_stack(ipst); + if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0) + ip_dce_reclaim_stack(ipst); netstack_rele(ns); } netstack_next_fini(&nh); } +/* ARGSUSED */ +static void +dce_reclaim_worker(void *arg) +{ + callb_cpr_t cprinfo; + + CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr, + "dce_reclaim_worker"); + + mutex_enter(&dce_reclaim_lock); + while (!dce_reclaim_shutdown) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + (void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock, + ddi_get_lbolt() + ip_dce_reclaim_interval * hz); + CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock); + + if (dce_reclaim_shutdown) + break; + + mutex_exit(&dce_reclaim_lock); + ip_dce_reclaim(); + mutex_enter(&dce_reclaim_lock); + } + + ASSERT(MUTEX_HELD(&dce_reclaim_lock)); + dce_reclaim_thread = NULL; + dce_reclaim_shutdown = 0; + cv_broadcast(&dce_reclaim_cv); + CALLB_CPR_EXIT(&cprinfo); /* drops the lock */ + + thread_exit(); +} + void dce_g_init(void) { dce_cache = kmem_cache_create("dce_cache", - sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0); + sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL); + + dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker, + NULL, 0, &p0, TS_RUN, minclsyspri); } void dce_g_destroy(void) { + mutex_enter(&dce_reclaim_lock); + dce_reclaim_shutdown = 1; + cv_signal(&dce_reclaim_cv); + while (dce_reclaim_thread != NULL) + cv_wait(&dce_reclaim_cv, &dce_reclaim_lock); + mutex_exit(&dce_reclaim_lock); + + 
cv_destroy(&dce_reclaim_cv); + mutex_destroy(&dce_reclaim_lock); + kmem_cache_destroy(dce_cache); } - /* * Allocate a default DCE and a hash table for per-IP address DCEs */ @@ -234,7 +312,7 @@ dce_stack_init(ip_stack_t *ipst) ipst->ips_dce_default->dce_ipst = ipst; /* This must be a power of two since we are using IRE_ADDR_HASH macro */ - ipst->ips_dce_hashsize = 256; + ipst->ips_dce_hashsize = ip_dce_hash_size; ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize * sizeof (dcb_t), KM_SLEEP); ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize * @@ -414,6 +492,12 @@ dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst) hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize); dcb = &ipst->ips_dce_hash_v4[hash]; + /* + * Assuming that we get fairly even distribution across all of the + * buckets, once one bucket is overly full, prune the whole cache. + */ + if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold) + atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1); rw_enter(&dcb->dcb_lock, RW_WRITER); for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { if (dce->dce_v4addr == dst) { @@ -447,6 +531,7 @@ dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst) dce->dce_ptpn = &dcb->dcb_dce; dcb->dcb_dce = dce; dce->dce_bucket = dcb; + atomic_add_32(&dcb->dcb_cnt, 1); dce_refhold(dce); /* For the caller */ rw_exit(&dcb->dcb_lock); @@ -476,6 +561,12 @@ dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst) hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize); dcb = &ipst->ips_dce_hash_v6[hash]; + /* + * Assuming that we get fairly even distribution across all of the + * buckets, once one bucket is overly full, prune the whole cache. + */ + if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold) + atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1); rw_enter(&dcb->dcb_lock, RW_WRITER); for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) && diff --git a/usr/src/uts/common/inet/ip/ip_tunables.c b/usr/src/uts/common/inet/ip/ip_tunables.c index 516d6c1a21..1e249b493e 100644 --- a/usr/src/uts/common/inet/ip/ip_tunables.c +++ b/usr/src/uts/common/inet/ip/ip_tunables.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -908,6 +909,11 @@ mod_prop_info_t ip_propinfo_tbl[] = { #else { "", 0, NULL, NULL, {0}, {0} }, #endif + + { "_dce_reclaim_threshold", MOD_PROTO_IP, + mod_set_uint32, mod_get_uint32, + {1, 100000, 32}, {32} }, + { "mtu", MOD_PROTO_IPV4, NULL, ip_get_mtu, {0}, {0} }, { "mtu", MOD_PROTO_IPV6, NULL, ip_get_mtu, {0}, {0} }, diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c index 47972a8c1a..96a0457678 100644 --- a/usr/src/uts/common/inet/ip/ipsecesp.c +++ b/usr/src/uts/common/inet/ip/ipsecesp.c @@ -234,8 +234,7 @@ esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid) { espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat", "net", KSTAT_TYPE_NAMED, - sizeof (esp_kstats_t) / sizeof (kstat_named_t), - KSTAT_FLAG_PERSISTENT, stackid); + sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid); if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL) return (B_FALSE); diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h index a564376cfb..706752b236 100644 --- a/usr/src/uts/common/inet/ip_stack.h +++ b/usr/src/uts/common/inet/ip_stack.h @@ -269,6 +269,7 @@ struct ip_stack { uint_t ips_dce_hashsize; struct dcb_s *ips_dce_hash_v4; struct dcb_s *ips_dce_hash_v6; + uint_t ips_dce_reclaim_needed; /* pending binds */ mblk_t *ips_ip6_asp_pending_ops; diff --git a/usr/src/uts/common/inet/ipd/ipd.c b/usr/src/uts/common/inet/ipd/ipd.c new file mode 100644 index 0000000000..5670e5afaa --- /dev/null +++ b/usr/src/uts/common/inet/ipd/ipd.c @@ -0,0 +1,1192 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* + * ipd: Internet packet disturber + * + * The purpose of ipd is to simulate congested and lossy networks when they + * don't actually exist. The features of these congested and lossy networks are + * events that end up leading to retransmits and thus kicking us out of the + * TCP/IP fastpath. Since normally this would require us to have an actually + * congested network, which can be problematic, we instead simulate this + * behavior. + * + * 1. ipd's operations and restrictions + * + * ipd currently has facilities to cause IP traffic to be: + * + * - Corrupted with some probability. + * - Delayed for a set number of microseconds. + * - Dropped with some probability. + * + * Each of these features are enabled on a per-zone basic. The current + * implementation restricts this specifically to exclusive stack zones. + * Enabling ipd on a given zone causes pfhooks to be installed for that zone's + * netstack. 
Because of the nature of ipd, it currently only supports exclusive + * stack zones and as a further restriction, it only allows the global zone + * administrative access. ipd can be enabled for the global zone, but doing so + * will cause all shared-stack zones to also be affected. + * + * 2. General architecture and Locking + * + * ipd consists of a few components. There is a per netstack data structure that + * is created and destroyed with the creation and destruction of each exclusive + * stack zone. Each of these netstacks is stored in a global list which is + * accessed for control of ipd via ioctls. The following diagram touches on the + * data structures that are used throughout ipd. + * + * ADMINISTRATIVE DATA PATH + * + * +--------+ +------+ +------+ + * | ipdadm | | ip | | nics | + * +--------+ +------+ +------+ + * | ^ | | + * | | ioctl(2) | | + * V | V V + * +----------+ +-------------------------+ + * | /dev/ipd | | pfhooks packet callback | == ipd_hook() + * +----------+ +-------------------------+ + * | | + * | | + * V | + * +----------------+ | + * | list_t ipd_nsl |------+ | + * +----------------+ | | + * | | + * V per netstack V + * +----------------------------+ + * | ipd_nestack_t | + * +----------------------------+ + * + * ipd has two different entry points, one is administrative, the other is the + * data path. The administrative path is accessed by a userland component called + * ipdadm(1M). It communicates to the kernel component via ioctls to /dev/ipd. + * If the administrative path enables a specific zone, then the data path will + * become active for that zone. Any packet that leaves that zone's IP stack or + * is going to enter it, comes through the callback specified in the hook_t(9S) + * structure. This will cause each packet to go through ipd_hook(). + * + * While the locking inside of ipd should be straightforward, unfortunately, the + * pfhooks subsystem necessarily complicates this a little bit. There are + * currently three different sets of locks in ipd. + * + * - Global lock N on the netstack list. + * - Global lock A on the active count. + * - Per-netstack data structure lock Z. + * + * # Locking rules + * + * L.1a N must always be acquired first and released last + * + * If you need to acquire the netstack list lock, either for reading or writing, + * then N must be acquired first and before any other locks. It may not be + * dropped before any other lock. + * + * L.1b N must only be acquired from the administrative path and zone creation, + * shutdown, and destruct callbacks. + * + * The data path, e.g. receiving the per-packet callbacks, should never be + * grabbing the list lock. If it is, then the architecture here needs to be + * reconsidered. + * + * L.2 Z cannot be held across calls to the pfhooks subsystem if packet hooks + * are active. + * + * The way the pfhooks subsystem is designed is that a reference count is + * present on the hook_t while it is active. As long as that reference count is + * non-zero, a call to net_hook_unregister will block until it is lowered. + * Because the callbacks want the same lock for the netstack that is held by the + * administrative path calling into net_hook_unregister, we deadlock. 
+ * + * ioctl from ipdadm remove hook_t cb (from nic) hook_t cb (from IP) + * ----------------------- -------------------- ------------------- + * | | | + * | bump hook_t refcount | + * mutex_enter(ipd_nsl_lock); enter ipd_hook() bump hook_t refcount + * mutex acquired mutex_enter(ins->ipdn_lock); | + * | mutex acquired enter ipd_hook() + * mutex_enter(ins->ipdn_lock); | mutex_enter(ins->ipdn_lock); + * | | | + * | | | + * | mutex_exit(ins->ipdn_lock); | + * | | | + * mutex acquired leave ipd_hook() | + * | decrement hook_t refcount | + * | | | + * ipd_teardown_hooks() | | + * net_hook_unregister() | | + * cv_wait() if recount | | + * | | | + * --------------------------------------------------------------------------- + * + * At this point, we can see that the second hook callback still doesn't have + * the mutex, but it has bumped the hook_t refcount. However, it will never + * acquire the mutex that it needs to finish its operation and decrement the + * refcount. + * + * Obviously, deadlocking is not acceptable, thus the following corollary to the + * second locking rule: + * + * L.2 Corollary: If Z is being released across a call to the pfhooks subsystem, + * N must be held. + * + * There is currently only one path where we have to worry about this. That is + * when we are removing a hook, but the zone is not being shutdown, then hooks + * are currently active. The only place that this currently happens is in + * ipd_check_hooks(). + * + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/cmn_err.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/modctl.h> +#include <sys/kstat.h> +#include <sys/neti.h> +#include <sys/list.h> +#include <sys/ksynch.h> +#include <sys/sysmacros.h> +#include <sys/policy.h> +#include <sys/atomic.h> +#include <sys/model.h> +#include <sys/strsun.h> + +#include <sys/netstack.h> +#include <sys/hook.h> +#include <sys/hook_event.h> + +#include <sys/ipd.h> + +#define IPDN_STATUS_DISABLED 0x1 +#define IPDN_STATUS_ENABLED 0x2 +#define IPDN_STATUS_CONDEMNED 0x4 + +/* + * These flags are used to determine whether or not the hooks are registered. + */ +#define IPDN_HOOK_NONE 0x0 +#define IPDN_HOOK_V4IN 0x1 +#define IPDN_HOOK_V4OUT 0x2 +#define IPDN_HOOK_V6IN 0x4 +#define IPDN_HOOK_V6OUT 0x8 +#define IPDN_HOOK_ALL 0xf + +/* + * Per-netstack kstats. + */ +typedef struct ipd_nskstat { + kstat_named_t ink_ndrops; + kstat_named_t ink_ncorrupts; + kstat_named_t ink_ndelays; +} ipd_nskstat_t; + +/* + * Different parts of this structure have different locking semantics. The list + * node is not normally referenced, if it is, one has to hold the ipd_nsl_lock. + * The following members are read only: ipdn_netid and ipdn_zoneid. The members + * of the kstat structure are always accessible in the data path, but the + * counters must be bumped with atomic operations. The ipdn_lock protects every + * other aspect of this structure. Please see the big theory statement on the + * requirements for lock ordering. 
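Because the counters in ipd_nskstat_t are exported as named kstats (ipd_nin_create() below registers them under module "ipd", name "ipd", class "net", with entries "corrupts", "drops" and "delays"), they can be read from userland with libkstat. A minimal sketch, assuming those module/name strings and linking with -lkstat:

#include <kstat.h>
#include <stdio.h>

int
main(void)
{
    kstat_ctl_t *kc;
    kstat_t *ksp;
    kstat_named_t *kn;

    if ((kc = kstat_open()) == NULL) {
        perror("kstat_open");
        return (1);
    }

    /* -1 matches any instance; module and name are both "ipd". */
    if ((ksp = kstat_lookup(kc, "ipd", -1, "ipd")) == NULL ||
        kstat_read(kc, ksp, NULL) == -1) {
        fprintf(stderr, "ipd kstat not found\n");
        (void) kstat_close(kc);
        return (1);
    }

    if ((kn = kstat_data_lookup(ksp, "drops")) != NULL)
        printf("drops:    %llu\n", (unsigned long long)kn->value.ui64);
    if ((kn = kstat_data_lookup(ksp, "corrupts")) != NULL)
        printf("corrupts: %llu\n", (unsigned long long)kn->value.ui64);
    if ((kn = kstat_data_lookup(ksp, "delays")) != NULL)
        printf("delays:   %llu\n", (unsigned long long)kn->value.ui64);

    (void) kstat_close(kc);
    return (0);
}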
+ */ +typedef struct ipd_netstack { + list_node_t ipdn_link; /* link on ipd_nsl */ + netid_t ipdn_netid; /* netstack id */ + zoneid_t ipdn_zoneid; /* zone id */ + kstat_t *ipdn_kstat; /* kstat_t ptr */ + ipd_nskstat_t ipdn_ksdata; /* kstat data */ + kmutex_t ipdn_lock; /* protects following members */ + int ipdn_status; /* status flags */ + net_handle_t ipdn_v4hdl; /* IPv4 net handle */ + net_handle_t ipdn_v6hdl; /* IPv6 net handle */ + int ipdn_hooked; /* are hooks registered */ + hook_t *ipdn_v4in; /* IPv4 traffic in hook */ + hook_t *ipdn_v4out; /* IPv4 traffic out hook */ + hook_t *ipdn_v6in; /* IPv6 traffic in hook */ + hook_t *ipdn_v6out; /* IPv6 traffic out hook */ + int ipdn_enabled; /* which perturbs are on */ + int ipdn_corrupt; /* corrupt percentage */ + int ipdn_drop; /* drop percentage */ + uint_t ipdn_delay; /* delay us */ + long ipdn_rand; /* random seed */ +} ipd_netstack_t; + +/* + * ipd internal variables + */ +static dev_info_t *ipd_devi; /* device info */ +static net_instance_t *ipd_neti; /* net_instance for hooks */ +static unsigned int ipd_max_delay = IPD_MAX_DELAY; /* max delay in us */ +static kmutex_t ipd_nsl_lock; /* lock for the netstack list */ +static list_t ipd_nsl; /* list of netstacks */ +static kmutex_t ipd_nactive_lock; /* lock for nactive */ +static unsigned int ipd_nactive; /* number of active netstacks */ +static int ipd_nactive_fudge = 4; /* amount to fudge by in list */ + +/* + * Note that this random number implementation is based upon the old BSD 4.1 + * rand. It's good enough for us! + */ +static int +ipd_nextrand(ipd_netstack_t *ins) +{ + ins->ipdn_rand = ins->ipdn_rand * 1103515245L + 12345; + return (ins->ipdn_rand & 0x7fffffff); +} + +static void +ipd_ksbump(kstat_named_t *nkp) +{ + atomic_inc_64(&nkp->value.ui64); +} + +/* + * This is where all the magic actually happens. The way that this works is we + * grab the ins lock to basically get a copy of all the data that we need to do + * our job and then let it go to minimize contention. In terms of actual work on + * the packet, we apply the perturbations in the following order: + * + * - drop + * - delay + * - corrupt + */ +/*ARGSUSED*/ +static int +ipd_hook(hook_event_token_t event, hook_data_t data, void *arg) +{ + unsigned char *crp; + int dwait, corrupt, drop, rand, off, status; + mblk_t *mbp; + ipd_netstack_t *ins = arg; + hook_pkt_event_t *pkt = (hook_pkt_event_t *)data; + + mutex_enter(&ins->ipdn_lock); + status = ins->ipdn_status; + dwait = ins->ipdn_delay; + corrupt = ins->ipdn_corrupt; + drop = ins->ipdn_drop; + rand = ipd_nextrand(ins); + mutex_exit(&ins->ipdn_lock); + + /* + * This probably cannot happen, but we'll do an extra guard just in + * case. + */ + if (status & IPDN_STATUS_CONDEMNED) + return (0); + + if (drop != 0 && rand % 100 < drop) { + freemsg(*pkt->hpe_mp); + *pkt->hpe_mp = NULL; + pkt->hpe_mb = NULL; + pkt->hpe_hdr = NULL; + ipd_ksbump(&ins->ipdn_ksdata.ink_ndrops); + + return (1); + } + + if (dwait != 0) { + if (dwait < TICK_TO_USEC(1)) + drv_usecwait(dwait); + else + delay(drv_usectohz(dwait)); + ipd_ksbump(&ins->ipdn_ksdata.ink_ndelays); + } + + if (corrupt != 0 && rand % 100 < corrupt) { + /* + * Since we're corrupting the mblk, just corrupt everything in + * the chain. While we could corrupt the entire packet, that's a + * little strong. Instead we're going to just change one of the + * bytes in each mblock.
+ */ + mbp = *pkt->hpe_mp; + while (mbp != NULL) { + if (mbp->b_wptr == mbp->b_rptr) + continue; + + /* + * While pfhooks probably won't send us anything else, + * let's just be extra careful. The stack probably isn't + * as resiliant to corruption of control messages. + */ + if (DB_TYPE(mbp) != M_DATA) + continue; + + off = rand % ((uintptr_t)mbp->b_wptr - + (uintptr_t)mbp->b_rptr); + crp = mbp->b_rptr + off; + off = rand % 8; + *crp = *crp ^ (1 << off); + + mbp = mbp->b_cont; + } + ipd_ksbump(&ins->ipdn_ksdata.ink_ncorrupts); + } + + return (0); +} + +/* + * Sets up and registers all the proper hooks needed for the netstack to capture + * packets. Callers are assumed to already be holding the ipd_netstack_t's lock. + * If there is a failure in setting something up, it is the responsibility of + * this function to clean it up. Once this function has been called, it should + * not be called until a corresponding call to tear down the hooks has been + * done. + */ +static int +ipd_setup_hooks(ipd_netstack_t *ins) +{ + ASSERT(MUTEX_HELD(&ins->ipdn_lock)); + ins->ipdn_v4hdl = net_protocol_lookup(ins->ipdn_netid, NHF_INET); + if (ins->ipdn_v4hdl == NULL) + goto cleanup; + + ins->ipdn_v6hdl = net_protocol_lookup(ins->ipdn_netid, NHF_INET6); + if (ins->ipdn_v6hdl == NULL) + goto cleanup; + + ins->ipdn_v4in = hook_alloc(HOOK_VERSION); + if (ins->ipdn_v4in == NULL) + goto cleanup; + + ins->ipdn_v4in->h_flags = 0; + ins->ipdn_v4in->h_hint = HH_NONE; + ins->ipdn_v4in->h_hintvalue = 0; + ins->ipdn_v4in->h_func = ipd_hook; + ins->ipdn_v4in->h_arg = ins; + ins->ipdn_v4in->h_name = "ipd IPv4 in"; + + if (net_hook_register(ins->ipdn_v4hdl, NH_PHYSICAL_IN, + ins->ipdn_v4in) != 0) + goto cleanup; + ins->ipdn_hooked |= IPDN_HOOK_V4IN; + + ins->ipdn_v4out = hook_alloc(HOOK_VERSION); + if (ins->ipdn_v4out == NULL) + goto cleanup; + ins->ipdn_v4out->h_flags = 0; + ins->ipdn_v4out->h_hint = HH_NONE; + ins->ipdn_v4out->h_hintvalue = 0; + ins->ipdn_v4out->h_func = ipd_hook; + ins->ipdn_v4out->h_arg = ins; + ins->ipdn_v4out->h_name = "ipd IPv4 out"; + + if (net_hook_register(ins->ipdn_v4hdl, NH_PHYSICAL_OUT, + ins->ipdn_v4out) != 0) + goto cleanup; + ins->ipdn_hooked |= IPDN_HOOK_V4OUT; + + ins->ipdn_v6in = hook_alloc(HOOK_VERSION); + if (ins->ipdn_v6in == NULL) + goto cleanup; + ins->ipdn_v6in->h_flags = 0; + ins->ipdn_v6in->h_hint = HH_NONE; + ins->ipdn_v6in->h_hintvalue = 0; + ins->ipdn_v6in->h_func = ipd_hook; + ins->ipdn_v6in->h_arg = ins; + ins->ipdn_v6in->h_name = "ipd IPv6 in"; + + if (net_hook_register(ins->ipdn_v6hdl, NH_PHYSICAL_IN, + ins->ipdn_v6in) != 0) + goto cleanup; + ins->ipdn_hooked |= IPDN_HOOK_V6IN; + + ins->ipdn_v6out = hook_alloc(HOOK_VERSION); + if (ins->ipdn_v6out == NULL) + goto cleanup; + ins->ipdn_v6out->h_flags = 0; + ins->ipdn_v6out->h_hint = HH_NONE; + ins->ipdn_v6out->h_hintvalue = 0; + ins->ipdn_v6out->h_func = ipd_hook; + ins->ipdn_v6out->h_arg = ins; + ins->ipdn_v6out->h_name = "ipd IPv6 out"; + + if (net_hook_register(ins->ipdn_v6hdl, NH_PHYSICAL_OUT, + ins->ipdn_v6out) != 0) + goto cleanup; + ins->ipdn_hooked |= IPDN_HOOK_V6OUT; + mutex_enter(&ipd_nactive_lock); + ipd_nactive++; + mutex_exit(&ipd_nactive_lock); + + return (0); + +cleanup: + if (ins->ipdn_hooked & IPDN_HOOK_V6OUT) + (void) net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_OUT, + ins->ipdn_v6out); + + if (ins->ipdn_hooked & IPDN_HOOK_V6IN) + (void) net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_IN, + ins->ipdn_v6in); + + if (ins->ipdn_hooked & IPDN_HOOK_V4OUT) + (void) net_hook_unregister(ins->ipdn_v4hdl, 
NH_PHYSICAL_OUT, + ins->ipdn_v4out); + + if (ins->ipdn_hooked & IPDN_HOOK_V4IN) + (void) net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_IN, + ins->ipdn_v4in); + + ins->ipdn_hooked = IPDN_HOOK_NONE; + + if (ins->ipdn_v6out != NULL) + hook_free(ins->ipdn_v6out); + + if (ins->ipdn_v6in != NULL) + hook_free(ins->ipdn_v6in); + + if (ins->ipdn_v4out != NULL) + hook_free(ins->ipdn_v4out); + + if (ins->ipdn_v4in != NULL) + hook_free(ins->ipdn_v4in); + + if (ins->ipdn_v6hdl != NULL) + (void) net_protocol_release(ins->ipdn_v6hdl); + + if (ins->ipdn_v4hdl != NULL) + (void) net_protocol_release(ins->ipdn_v4hdl); + + return (1); +} + +static void +ipd_teardown_hooks(ipd_netstack_t *ins) +{ + ASSERT(ins->ipdn_hooked == IPDN_HOOK_ALL); + VERIFY(net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_OUT, + ins->ipdn_v6out) == 0); + VERIFY(net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_IN, + ins->ipdn_v6in) == 0); + VERIFY(net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_OUT, + ins->ipdn_v4out) == 0); + VERIFY(net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_IN, + ins->ipdn_v4in) == 0); + + ins->ipdn_hooked = IPDN_HOOK_NONE; + + hook_free(ins->ipdn_v6out); + hook_free(ins->ipdn_v6in); + hook_free(ins->ipdn_v4out); + hook_free(ins->ipdn_v4in); + + VERIFY(net_protocol_release(ins->ipdn_v6hdl) == 0); + VERIFY(net_protocol_release(ins->ipdn_v4hdl) == 0); + + mutex_enter(&ipd_nactive_lock); + ipd_nactive--; + mutex_exit(&ipd_nactive_lock); +} + +static int +ipd_check_hooks(ipd_netstack_t *ins, int type, boolean_t enable) +{ + int olden, rval; + olden = ins->ipdn_enabled; + + if (enable) + ins->ipdn_enabled |= type; + else + ins->ipdn_enabled &= ~type; + + /* + * If no perturbation was enabled before and one is now, the hooks need + * to be installed. + */ + if (olden == 0 && ins->ipdn_enabled != 0) { + rval = ipd_setup_hooks(ins); + if (rval != 0) { + ins->ipdn_enabled &= ~type; + ASSERT(ins->ipdn_enabled == 0); + return (rval); + } + + return (0); + } + + if (olden != 0 && ins->ipdn_enabled == 0) { + ASSERT(olden != 0); + + /* + * We have to drop the lock here, lest we cause a deadlock. + * Unfortunately, there may be hooks that are running and are + * actively in flight and we have to call the unregister + * function. Due to the hooks framework, if there is an inflight + * hook (most likely right now), and we are holding the + * netstack's lock, those hooks will never return. This is + * unfortunate. + * + * Because we only come into this path holding the list lock, we + * know that the only way that someone else can come in and get + * to this structure is via the hook callbacks which are going to + * only be doing reads. They'll also see that everything has + * been disabled and return. So while this is unfortunate, it + * should be relatively safe. + */ + mutex_exit(&ins->ipdn_lock); + ipd_teardown_hooks(ins); + mutex_enter(&ins->ipdn_lock); + return (0); + } + + /* + * Otherwise, nothing should have changed here. + */ + ASSERT((olden == 0) == (ins->ipdn_enabled == 0)); + return (0); +} + +static int +ipd_toggle_corrupt(ipd_netstack_t *ins, int percent) +{ + int rval; + + ASSERT(MUTEX_HELD(&ins->ipdn_lock)); + + if (percent < 0 || percent > 100) + return (ERANGE); + + /* + * If we've been asked to set the value to a value that we already have, + * great, then we're done.
+ */ + if (percent == ins->ipdn_corrupt) + return (0); + + ins->ipdn_corrupt = percent; + rval = ipd_check_hooks(ins, IPD_CORRUPT, percent != 0); + + /* + * If ipd_check_hooks failed, that must mean that we failed to set up + * the hooks, so we are going to effectively zero out and fail the + * request to enable corruption. + */ + if (rval != 0) + ins->ipdn_corrupt = 0; + + return (rval); +} + +static int +ipd_toggle_delay(ipd_netstack_t *ins, uint32_t delay) +{ + int rval; + + ASSERT(MUTEX_HELD(&ins->ipdn_lock)); + + if (delay > ipd_max_delay) + return (ERANGE); + + /* + * If we've been asked to set the value to a value that we already have, + * great, then we're done. + */ + if (delay == ins->ipdn_delay) + return (0); + + ins->ipdn_delay = delay; + rval = ipd_check_hooks(ins, IPD_DELAY, delay != 0); + + /* + * If ipd_check_hooks failed, that must mean that we failed to set up + * the hooks, so we are going to effectively zero out and fail the + * request to enable the delay. + */ + if (rval != 0) + ins->ipdn_delay = 0; + + return (rval); +} + +static int +ipd_toggle_drop(ipd_netstack_t *ins, int percent) +{ + int rval; + + ASSERT(MUTEX_HELD(&ins->ipdn_lock)); + + if (percent < 0 || percent > 100) + return (ERANGE); + + /* + * If we've been asked to set the value to a value that we already have, + * great, then we're done. + */ + if (percent == ins->ipdn_drop) + return (0); + + ins->ipdn_drop = percent; + rval = ipd_check_hooks(ins, IPD_DROP, percent != 0); + + /* + * If ipd_check_hooks failed, that must mean that we failed to set up + * the hooks, so we are going to effectively zero out and fail the + * request to enable dropping. + */ + if (rval != 0) + ins->ipdn_drop = 0; + + return (rval); +} + +static int +ipd_ioctl_perturb(ipd_ioc_perturb_t *ipi, cred_t *cr, intptr_t cmd) +{ + zoneid_t zid; + ipd_netstack_t *ins; + int rval = 0; + + /* + * If the zone that we're coming from is not the GZ, then we ignore it + * completely and then instead just set the zoneid to be that of the + * caller. If the zoneid is that of the GZ, then we don't touch this + * value. + */ + zid = crgetzoneid(cr); + if (zid != GLOBAL_ZONEID) + ipi->ipip_zoneid = zid; + + if (zoneid_to_netstackid(ipi->ipip_zoneid) == GLOBAL_NETSTACKID && + zid != GLOBAL_ZONEID) + return (EPERM); + + /* + * We need to hold the ipd_nsl_lock throughout the entire operation, + * otherwise someone else could come in and remove us from the list and + * free us, e.g. the netstack destroy handler. By holding the lock, we + * stop it from being able to do anything wrong. + */ + mutex_enter(&ipd_nsl_lock); + for (ins = list_head(&ipd_nsl); ins != NULL; + ins = list_next(&ipd_nsl, ins)) { + if (ins->ipdn_zoneid == ipi->ipip_zoneid) + break; + } + + if (ins == NULL) { + mutex_exit(&ipd_nsl_lock); + return (EINVAL); + } + + mutex_enter(&ins->ipdn_lock); + + if (ins->ipdn_status & IPDN_STATUS_CONDEMNED) { + rval = ESHUTDOWN; + goto cleanup; + } + + switch (cmd) { + case IPDIOC_CORRUPT: + rval = ipd_toggle_corrupt(ins, ipi->ipip_arg); + break; + case IPDIOC_DELAY: + rval = ipd_toggle_delay(ins, ipi->ipip_arg); + break; + case IPDIOC_DROP: + rval = ipd_toggle_drop(ins, ipi->ipip_arg); + break; + } + +cleanup: + mutex_exit(&ins->ipdn_lock); + mutex_exit(&ipd_nsl_lock); + return (rval); +} + +static int +ipd_ioctl_remove(ipd_ioc_perturb_t *ipi, cred_t *cr) +{ + zoneid_t zid; + ipd_netstack_t *ins; + int rval = 0; + + /* + * See ipd_ioctl_perturb for the rationale here.
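For illustration, the administrative path that ends up in ipd_ioctl_perturb() above is driven from userland roughly as follows. The full ipd_ioc_perturb_t definition lives in <sys/ipd.h> and is not part of this hunk, so the sketch assumes only the two fields the handler uses (ipip_zoneid, ipip_arg); it is not the actual ipdadm(1M) source, and the open(2) is gated by secpolicy_ip_config(), so it needs suitable privilege.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/ipd.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <zone.h>

int
main(void)
{
    ipd_ioc_perturb_t ipip;
    int fd;

    /* ipd_open() insists on FREAD|FWRITE, so open read-write. */
    if ((fd = open("/dev/ipd", O_RDWR)) == -1) {
        perror("open /dev/ipd");
        return (1);
    }

    /*
     * A non-global caller has its zone id forced by the driver, so the
     * value only matters when this runs in the global zone.
     */
    ipip.ipip_zoneid = getzoneid();
    ipip.ipip_arg = 10;         /* corrupt 10% of packets */

    if (ioctl(fd, IPDIOC_CORRUPT, &ipip) == -1) {
        perror("IPDIOC_CORRUPT");
        (void) close(fd);
        return (1);
    }

    (void) close(fd);
    return (0);
}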
+ */ + zid = crgetzoneid(cr); + if (zid != GLOBAL_ZONEID) + ipi->ipip_zoneid = zid; + + if (zoneid_to_netstackid(ipi->ipip_zoneid) == GLOBAL_NETSTACKID && + zid != GLOBAL_ZONEID) + return (EPERM); + + mutex_enter(&ipd_nsl_lock); + for (ins = list_head(&ipd_nsl); ins != NULL; + ins = list_next(&ipd_nsl, ins)) { + if (ins->ipdn_zoneid == ipi->ipip_zoneid) + break; + } + + if (ins == NULL) { + mutex_exit(&ipd_nsl_lock); + return (EINVAL); + } + + mutex_enter(&ins->ipdn_lock); + + /* + * If this is condemned, that means it's very shortly going to be torn + * down. In that case, there's no reason to actually do anything here, + * as it will all be done rather shortly in the destroy function. + * Furthermore, because condemned corresponds with it having hit + * shutdown, we know that no more packets can be received by this + * netstack. All this translates to a no-op. + */ + if (ins->ipdn_status & IPDN_STATUS_CONDEMNED) { + rval = 0; + goto cleanup; + } + + rval = EINVAL; + /* + * Go through and disable the requested pieces. We can safely ignore the + * return value of ipd_check_hooks because the removal case should never + * fail, we verify that in the hook teardown case. + */ + if (ipi->ipip_arg & IPD_CORRUPT) { + ins->ipdn_corrupt = 0; + (void) ipd_check_hooks(ins, IPD_CORRUPT, B_FALSE); + rval = 0; + } + + if (ipi->ipip_arg & IPD_DELAY) { + ins->ipdn_delay = 0; + (void) ipd_check_hooks(ins, IPD_DELAY, B_FALSE); + rval = 0; + } + + if (ipi->ipip_arg & IPD_DROP) { + ins->ipdn_drop = 0; + (void) ipd_check_hooks(ins, IPD_DROP, B_FALSE); + rval = 0; + } + +cleanup: + mutex_exit(&ins->ipdn_lock); + mutex_exit(&ipd_nsl_lock); + return (rval); +} + +/* + * When this function is called, the value of the ipil_nzones argument controls + * how this function works. When called with a value of zero, then we treat that + * as the caller asking us what's a reasonable number of entries for me to + * allocate memory for. If the zone is the global zone, then we tell them how + * many folks are currently active and add a fudge factor. Otherwise the answer + * is always one. + * + * In the non-zero case, we give them that number of zone ids. While this isn't + * quite ideal as it might mean that someone misses something, this generally + * won't be an issue, as it involves a rather tight race condition in the + * current ipdadm implementation. 
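The zero/non-zero ipil_nzones convention described above is a standard two-call sizing protocol. A hedged consumer sketch follows; the ipd_ioc_list and ipd_ioc_info_t definitions come from <sys/ipd.h>, which this change does not show, so the typedef name ipd_ioc_list_t and the exact layout are assumptions, with only ipil_nzones, ipil_info and the ipii_* members taken from the handler below.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/ipd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
    ipd_ioc_list_t ipil;
    ipd_ioc_info_t *info;
    uint_t hint, n, i;
    int fd;

    if ((fd = open("/dev/ipd", O_RDWR)) == -1) {
        perror("open /dev/ipd");
        return (1);
    }

    /* First call: ipil_nzones == 0 asks the driver for a sizing hint. */
    ipil.ipil_nzones = 0;
    if (ioctl(fd, IPDIOC_LIST, &ipil) == -1) {
        perror("IPDIOC_LIST (sizing)");
        return (1);
    }

    if ((hint = ipil.ipil_nzones) == 0)
        return (0);

    if ((info = calloc(hint, sizeof (ipd_ioc_info_t))) == NULL)
        return (1);

    /* Second call: pass the buffer and how many entries it can hold. */
    ipil.ipil_nzones = hint;
    ipil.ipil_info = info;
    if (ioctl(fd, IPDIOC_LIST, &ipil) == -1) {
        perror("IPDIOC_LIST");
        return (1);
    }

    /* The driver reports how many zones are enabled; it copied out at most hint. */
    n = (ipil.ipil_nzones < hint) ? ipil.ipil_nzones : hint;
    for (i = 0; i < n; i++) {
        printf("zone %d: corrupt=%d%% drop=%d%% delay=%uus\n",
            (int)info[i].ipii_zoneid, info[i].ipii_corrupt,
            info[i].ipii_drop, info[i].ipii_delay);
    }

    free(info);
    (void) close(fd);
    return (0);
}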
+ */ +static int +ipd_ioctl_list(intptr_t arg, cred_t *cr) +{ + zoneid_t zid; + ipd_ioc_info_t *configs; + ipd_netstack_t *ins; + uint_t azones, rzones, nzones, cur; + int rval = 0; + STRUCT_DECL(ipd_ioc_list, h); + + STRUCT_INIT(h, get_udatamodel()); + if (ddi_copyin((void *)arg, STRUCT_BUF(h), + STRUCT_SIZE(h), 0) != 0) + return (EFAULT); + + zid = crgetzoneid(cr); + + rzones = STRUCT_FGET(h, ipil_nzones); + if (rzones == 0) { + if (zid == GLOBAL_ZONEID) { + mutex_enter(&ipd_nactive_lock); + rzones = ipd_nactive + ipd_nactive_fudge; + mutex_exit(&ipd_nactive_lock); + } else { + rzones = 1; + } + STRUCT_FSET(h, ipil_nzones, rzones); + if (ddi_copyout(STRUCT_BUF(h), (void *)arg, + STRUCT_SIZE(h), 0) != 0) + return (EFAULT); + + return (0); + } + + mutex_enter(&ipd_nsl_lock); + if (zid == GLOBAL_ZONEID) { + azones = ipd_nactive; + } else { + azones = 1; + } + + configs = kmem_alloc(sizeof (ipd_ioc_info_t) * azones, KM_SLEEP); + cur = 0; + for (ins = list_head(&ipd_nsl); ins != NULL; + ins = list_next(&ipd_nsl, ins)) { + if (ins->ipdn_enabled == 0) + continue; + + ASSERT(cur < azones); + + if (zid == GLOBAL_ZONEID || zid == ins->ipdn_zoneid) { + configs[cur].ipii_zoneid = ins->ipdn_zoneid; + + mutex_enter(&ins->ipdn_lock); + configs[cur].ipii_corrupt = ins->ipdn_corrupt; + configs[cur].ipii_delay = ins->ipdn_delay; + configs[cur].ipii_drop = ins->ipdn_drop; + mutex_exit(&ins->ipdn_lock); + + ++cur; + } + + if (zid != GLOBAL_ZONEID && zid == ins->ipdn_zoneid) + break; + } + mutex_exit(&ipd_nsl_lock); + + ASSERT(zid != GLOBAL_ZONEID || cur == azones); + + if (cur == 0) + STRUCT_FSET(h, ipil_nzones, 0); + else + STRUCT_FSET(h, ipil_nzones, cur); + + nzones = MIN(cur, rzones); + if (nzones > 0) { + if (ddi_copyout(configs, STRUCT_FGETP(h, ipil_info), + nzones * sizeof (ipd_ioc_info_t), NULL) != 0) + rval = EFAULT; + } + + kmem_free(configs, sizeof (ipd_ioc_info_t) * azones); + if (ddi_copyout(STRUCT_BUF(h), (void *)arg, STRUCT_SIZE(h), 0) != 0) + return (EFAULT); + + return (rval); +} + +static void * +ipd_nin_create(const netid_t id) +{ + ipd_netstack_t *ins; + ipd_nskstat_t *ink; + + ins = kmem_zalloc(sizeof (ipd_netstack_t), KM_SLEEP); + ins->ipdn_status = IPDN_STATUS_DISABLED; + ins->ipdn_netid = id; + ins->ipdn_zoneid = netstackid_to_zoneid(id); + ins->ipdn_rand = gethrtime(); + mutex_init(&ins->ipdn_lock, NULL, MUTEX_DRIVER, NULL); + + ins->ipdn_kstat = net_kstat_create(id, "ipd", ins->ipdn_zoneid, + "ipd", "net", KSTAT_TYPE_NAMED, + sizeof (ipd_nskstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ins->ipdn_kstat != NULL) { + if (ins->ipdn_zoneid != GLOBAL_ZONEID) + kstat_zone_add(ins->ipdn_kstat, GLOBAL_ZONEID); + + ink = &ins->ipdn_ksdata; + ins->ipdn_kstat->ks_data = ink; + kstat_named_init(&ink->ink_ncorrupts, "corrupts", + KSTAT_DATA_UINT64); + kstat_named_init(&ink->ink_ndrops, "drops", KSTAT_DATA_UINT64); + kstat_named_init(&ink->ink_ndelays, "delays", + KSTAT_DATA_UINT64); + kstat_install(ins->ipdn_kstat); + } + + mutex_enter(&ipd_nsl_lock); + list_insert_tail(&ipd_nsl, ins); + mutex_exit(&ipd_nsl_lock); + + return (ins); +} + +static void +ipd_nin_shutdown(const netid_t id, void *arg) +{ + ipd_netstack_t *ins = arg; + + VERIFY(id == ins->ipdn_netid); + mutex_enter(&ins->ipdn_lock); + ASSERT(ins->ipdn_status == IPDN_STATUS_DISABLED || + ins->ipdn_status == IPDN_STATUS_ENABLED); + ins->ipdn_status |= IPDN_STATUS_CONDEMNED; + if (ins->ipdn_kstat != NULL) + net_kstat_delete(id, ins->ipdn_kstat); + mutex_exit(&ins->ipdn_lock); +} + +/*ARGSUSED*/ +static void 
+ipd_nin_destroy(const netid_t id, void *arg) +{ + ipd_netstack_t *ins = arg; + + /* + * At this point none of the hooks should be able to fire because the + * zone has been shutdown and we are in the process of destroying it. + * Thus it should not be possible for someone else to come in and grab + * our ipd_netstack_t for this zone. Because of that, we know that we + * are the only ones who could be running here. + */ + mutex_enter(&ipd_nsl_lock); + list_remove(&ipd_nsl, ins); + mutex_exit(&ipd_nsl_lock); + + if (ins->ipdn_hooked) + ipd_teardown_hooks(ins); + mutex_destroy(&ins->ipdn_lock); + kmem_free(ins, sizeof (ipd_netstack_t)); +} + +/*ARGSUSED*/ +static int +ipd_open(dev_t *devp, int flag, int otype, cred_t *credp) +{ + if (flag & FEXCL || flag & FNDELAY) + return (EINVAL); + + if (otype != OTYP_CHR) + return (EINVAL); + + if (!(flag & FREAD && flag & FWRITE)) + return (EINVAL); + + if (secpolicy_ip_config(credp, B_FALSE) != 0) + return (EPERM); + + return (0); +} + +/*ARGSUSED*/ +static int +ipd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + int rval; + ipd_ioc_perturb_t ipip; + ipd_ioc_info_t ipii; + + switch (cmd) { + case IPDIOC_CORRUPT: + case IPDIOC_DELAY: + case IPDIOC_DROP: + if (ddi_copyin((void *)arg, &ipip, sizeof (ipd_ioc_perturb_t), + 0) != 0) + return (EFAULT); + rval = ipd_ioctl_perturb(&ipip, cr, cmd); + return (rval); + case IPDIOC_REMOVE: + if (ddi_copyin((void *)arg, &ipip, sizeof (ipd_ioc_perturb_t), + 0) != 0) + return (EFAULT); + rval = ipd_ioctl_remove(&ipip, cr); + return (rval); + case IPDIOC_LIST: + /* + * Because the list ioctl doesn't have a fixed-size struct due + * to needing to pass around a pointer, we instead delegate the + * copyin logic to the list code. + */ + return (ipd_ioctl_list(arg, cr)); + default: + break; + } + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +ipd_close(dev_t dev, int flag, int otype, cred_t *credp) +{ + return (0); +} + +static int +ipd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + minor_t instance; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ipd_devi != NULL) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if (ddi_create_minor_node(dip, "ipd", S_IFCHR, instance, + DDI_PSEUDO, 0) == DDI_FAILURE) + return (DDI_FAILURE); + + ipd_neti = net_instance_alloc(NETINFO_VERSION); + if (ipd_neti == NULL) { + ddi_remove_minor_node(dip, NULL); + return (DDI_FAILURE); + } + + /* + * Note that these global structures MUST be initialized before we call + * net_instance_register, as that will instantly cause us to drive into + * the ipd_nin_create callbacks. + */ + list_create(&ipd_nsl, sizeof (ipd_netstack_t), + offsetof(ipd_netstack_t, ipdn_link)); + mutex_init(&ipd_nsl_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ipd_nactive_lock, NULL, MUTEX_DRIVER, NULL); + + /* Note, net_instance_alloc sets the version. 
*/ + ipd_neti->nin_name = "ipd"; + ipd_neti->nin_create = ipd_nin_create; + ipd_neti->nin_destroy = ipd_nin_destroy; + ipd_neti->nin_shutdown = ipd_nin_shutdown; + if (net_instance_register(ipd_neti) == DDI_FAILURE) { + net_instance_free(ipd_neti); + ddi_remove_minor_node(dip, NULL); + } + + ddi_report_dev(dip); + ipd_devi = dip; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +ipd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + int error; + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = ipd_devi; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)getminor((dev_t)arg); + error = DDI_SUCCESS; + default: + error = DDI_FAILURE; + break; + } + + return (error); +} + +static int +ipd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + ASSERT(dip == ipd_devi); + ddi_remove_minor_node(dip, NULL); + ipd_devi = NULL; + + if (ipd_neti != NULL) { + VERIFY(net_instance_unregister(ipd_neti) == 0); + net_instance_free(ipd_neti); + } + + mutex_destroy(&ipd_nsl_lock); + mutex_destroy(&ipd_nactive_lock); + list_destroy(&ipd_nsl); + + return (DDI_SUCCESS); +} + +static struct cb_ops ipd_cb_ops = { + ipd_open, /* open */ + ipd_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + ipd_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* streamtab */ + D_NEW | D_MP, /* Driver compatibility flag */ + CB_REV, /* rev */ + nodev, /* aread */ + nodev /* awrite */ +}; + +static struct dev_ops ipd_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + ipd_getinfo, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + ipd_attach, /* attach */ + ipd_detach, /* detach */ + nodev, /* reset */ + &ipd_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, + "Internet packet disturber", + &ipd_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + { (void *)&modldrv, NULL } +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/inet/ipd/ipd.conf b/usr/src/uts/common/inet/ipd/ipd.conf new file mode 100644 index 0000000000..83b9b685f4 --- /dev/null +++ b/usr/src/uts/common/inet/ipd/ipd.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2012 Joyent, Inc. 
All rights reserved. +# Use is subject to license terms. +# + +name="ipd" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index 98cda0b7cc..75bac21ae4 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -141,11 +141,13 @@ ipf_stack_t *ifs; #define UNDO_HOOK(_f, _b, _e, _h) \ do { \ + int tmp; \ if (ifs->_f != NULL) { \ if (ifs->_b) { \ - ifs->_b = (net_hook_unregister(ifs->_f, \ - _e, ifs->_h) != 0); \ - if (!ifs->_b) { \ + tmp = net_hook_unregister(ifs->_f, \ + _e, ifs->_h); \ + ifs->_b = (tmp != 0 && tmp != ENXIO); \ + if (!ifs->_b && ifs->_h != NULL) { \ hook_free(ifs->_h); \ ifs->_h = NULL; \ } \ diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index 6d0bf70b2a..2e08dc359b 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -23,6 +23,10 @@ */ /* + * Copyright 2012 Joyent, Inc. All rights reserved. + */ + +/* * Squeues: General purpose serialization mechanism * ------------------------------------------------ * @@ -120,6 +124,8 @@ #include <sys/sdt.h> #include <sys/ddi.h> #include <sys/sunddi.h> +#include <sys/stack.h> +#include <sys/archsystm.h> #include <inet/ipclassifier.h> #include <inet/udp_impl.h> @@ -142,6 +148,9 @@ int squeue_workerwait_ms = 0; static int squeue_drain_ns = 0; static int squeue_workerwait_tick = 0; +uintptr_t squeue_drain_stack_needed = 10240; +uint_t squeue_drain_stack_toodeep; + #define MAX_BYTES_TO_PICKUP 150000 #define ENQUEUE_CHAIN(sqp, mp, tail, cnt) { \ @@ -546,6 +555,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, ASSERT(MUTEX_HELD(&sqp->sq_lock)); ASSERT(sqp->sq_first != NULL); now = gethrtime(); + sqp->sq_run = curthread; sqp->sq_drain(sqp, SQS_ENTER, now + squeue_drain_ns); /* @@ -711,6 +721,20 @@ squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire) boolean_t sq_poll_capable; ip_recv_attr_t *ira, iras; + /* + * Before doing any work, check our stack depth; if we're not a + * worker thread for this squeue and we're beginning to get tight on + * on stack, kick the worker, bump a counter and return. 
+ */ + if (proc_type != SQS_WORKER && STACK_BIAS + (uintptr_t)getfp() - + (uintptr_t)curthread->t_stkbase < squeue_drain_stack_needed) { + ASSERT(mutex_owned(&sqp->sq_lock)); + sqp->sq_awaken = ddi_get_lbolt(); + cv_signal(&sqp->sq_worker_cv); + squeue_drain_stack_toodeep++; + return; + } + sq_poll_capable = (sqp->sq_state & SQS_POLL_CAPAB) != 0; again: ASSERT(mutex_owned(&sqp->sq_lock)); diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 1bb87e5c56..f79427e766 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -3792,7 +3792,8 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) ASSERT(error == 0); tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); ASSERT(tcps->tcps_ixa_cleanup_mp != NULL); - cv_init(&tcps->tcps_ixa_cleanup_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tcps->tcps_ixa_cleanup_ready_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tcps->tcps_ixa_cleanup_done_cv, NULL, CV_DEFAULT, NULL); mutex_init(&tcps->tcps_ixa_cleanup_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&tcps->tcps_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); @@ -3857,7 +3858,8 @@ tcp_stack_fini(netstackid_t stackid, void *arg) freeb(tcps->tcps_ixa_cleanup_mp); tcps->tcps_ixa_cleanup_mp = NULL; - cv_destroy(&tcps->tcps_ixa_cleanup_cv); + cv_destroy(&tcps->tcps_ixa_cleanup_ready_cv); + cv_destroy(&tcps->tcps_ixa_cleanup_done_cv); mutex_destroy(&tcps->tcps_ixa_cleanup_lock); /* diff --git a/usr/src/uts/common/inet/tcp/tcp_stats.c b/usr/src/uts/common/inet/tcp/tcp_stats.c index 6e729ff461..e6b13fe6c9 100644 --- a/usr/src/uts/common/inet/tcp/tcp_stats.c +++ b/usr/src/uts/common/inet/tcp/tcp_stats.c @@ -21,12 +21,14 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent Inc. All rights reserved. */ #include <sys/types.h> #include <sys/tihdr.h> #include <sys/policy.h> #include <sys/tsol/tnet.h> +#include <sys/kstat.h> #include <inet/common.h> #include <inet/ip.h> @@ -505,7 +507,7 @@ tcp_kstat_init(netstackid_t stackid) { "connTableSize6", KSTAT_DATA_INT32, 0 } }; - ksp = kstat_create_netstack(TCP_MOD_NAME, 0, TCP_MOD_NAME, "mib2", + ksp = kstat_create_netstack(TCP_MOD_NAME, stackid, TCP_MOD_NAME, "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0, stackid); if (ksp == NULL) @@ -518,6 +520,13 @@ tcp_kstat_init(netstackid_t stackid) ksp->ks_update = tcp_kstat_update; ksp->ks_private = (void *)(uintptr_t)stackid; + /* + * If this is an exclusive netstack for a local zone, the global zone + * should still be able to read the kstat. + */ + if (stackid != GLOBAL_NETSTACKID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + kstat_install(ksp); return (ksp); } @@ -733,7 +742,7 @@ tcp_kstat2_init(netstackid_t stackid) #endif }; - ksp = kstat_create_netstack(TCP_MOD_NAME, 0, "tcpstat", "net", + ksp = kstat_create_netstack(TCP_MOD_NAME, stackid, "tcpstat", "net", KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), 0, stackid); @@ -744,6 +753,13 @@ tcp_kstat2_init(netstackid_t stackid) ksp->ks_private = (void *)(uintptr_t)stackid; ksp->ks_update = tcp_kstat2_update; + /* + * If this is an exclusive netstack for a local zone, the global zone + * should still be able to read the kstat. 
+ */ + if (stackid != GLOBAL_NETSTACKID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + kstat_install(ksp); return (ksp); } diff --git a/usr/src/uts/common/inet/tcp/tcp_time_wait.c b/usr/src/uts/common/inet/tcp/tcp_time_wait.c index 6f896fa740..adde51e745 100644 --- a/usr/src/uts/common/inet/tcp/tcp_time_wait.c +++ b/usr/src/uts/common/inet/tcp/tcp_time_wait.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright (c) 2012, Joyent Inc. All rights reserved. */ /* @@ -111,6 +111,21 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) return (B_TRUE); } +/* Constants used for fast checking of a localhost address */ +#if defined(_BIG_ENDIAN) +#define IPv4_LOCALHOST 0x7f000000U +#define IPv4_LH_MASK 0xffffff00U +#else +#define IPv4_LOCALHOST 0x0000007fU +#define IPv4_LH_MASK 0x00ffffffU +#endif + +#define IS_LOCAL_HOST(x) ( \ + ((x)->tcp_connp->conn_ipversion == IPV4_VERSION && \ + ((x)->tcp_connp->conn_laddr_v4 & IPv4_LH_MASK) == IPv4_LOCALHOST) || \ + ((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \ + IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6))) + /* * Add a connection to the list of detached TIME_WAIT connections * and set its time to expire. @@ -122,6 +137,7 @@ tcp_time_wait_append(tcp_t *tcp) squeue_t *sqp = tcp->tcp_connp->conn_sqp; tcp_squeue_priv_t *tcp_time_wait = *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); + hrtime_t firetime = 0; tcp_timers_stop(tcp); @@ -138,13 +154,37 @@ tcp_time_wait_append(tcp_t *tcp) ASSERT(tcp->tcp_listener == NULL); tcp->tcp_time_wait_expire = ddi_get_lbolt64(); - /* - * Since tcp_time_wait_expire is lbolt64, it should not wrap around - * in practice. Hence it cannot be 0. Note that zero means that the - * tcp_t is not in the TIME_WAIT list. - */ - tcp->tcp_time_wait_expire += MSEC_TO_TICK( - tcps->tcps_time_wait_interval); + if (IS_LOCAL_HOST(tcp)) { + /* + * This is the fastpath for handling localhost connections. + * Since we don't have to worry about packets on the localhost + * showing up after a long network delay, we want to expire + * these quickly so the port range on the localhost doesn't + * get starved by short-running, local apps. + * + * Leave tcp_time_wait_expire at the current time. This + * essentially means the connection is expired now and it will + * clean up the next time tcp_time_wait_collector runs. We set + * firetime to use a short delay so that if we have to start a + * tcp_time_wait_collector thread below, it runs soon instead + * of after a delay of time_wait_interval. firetime being set + * to a non-0 value is also our indicator that we should add + * this connection to the head of the time wait list (since we + * are already expired) so that its sure to get cleaned up on + * the next run of tcp_time_wait_collector (which expects the + * entries to appear in time-order and stops when it hits the + * first non-expired entry). + */ + firetime = TCP_TIME_WAIT_DELAY; + } else { + /* + * Since tcp_time_wait_expire is lbolt64, it should not wrap + * around in practice. Hence it cannot be 0. Note that zero + * means that the tcp_t is not in the TIME_WAIT list. + */ + tcp->tcp_time_wait_expire += MSEC_TO_TICK( + tcps->tcps_time_wait_interval); + } ASSERT(TCP_IS_DETACHED(tcp)); ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); @@ -164,13 +204,17 @@ tcp_time_wait_append(tcp_t *tcp) * a timer is needed. 
*/ if (tcp_time_wait->tcp_time_wait_tid == 0) { + if (firetime == 0) + firetime = (hrtime_t) + (tcps->tcps_time_wait_interval + 1) * + MICROSEC; + tcp_time_wait->tcp_time_wait_tid = timeout_generic(CALLOUT_NORMAL, - tcp_time_wait_collector, sqp, - (hrtime_t)(tcps->tcps_time_wait_interval + 1) * - MICROSEC, CALLOUT_TCP_RESOLUTION, - CALLOUT_FLAG_ROUNDUP); + tcp_time_wait_collector, sqp, firetime, + CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); } + tcp_time_wait->tcp_time_wait_tail = tcp; } else { /* * The list is not empty, so a timer must be running. If not, @@ -182,11 +226,23 @@ tcp_time_wait_append(tcp_t *tcp) ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == TCPS_TIME_WAIT); - tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp; - tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail; + if (firetime == 0) { + /* add at end */ + tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = + tcp; + tcp->tcp_time_wait_prev = + tcp_time_wait->tcp_time_wait_tail; + tcp_time_wait->tcp_time_wait_tail = tcp; + } else { + /* add at head */ + tcp->tcp_time_wait_next = + tcp_time_wait->tcp_time_wait_head; + tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = + tcp; + tcp_time_wait->tcp_time_wait_head = tcp; + } } - tcp_time_wait->tcp_time_wait_tail = tcp; mutex_exit(&tcp_time_wait->tcp_time_wait_lock); } @@ -416,6 +472,10 @@ tcp_time_wait_collector(void *arg) tcp_time_wait->tcp_time_wait_tid == 0) { hrtime_t firetime; + /* shouldn't be necessary, but just in case */ + if (tcp->tcp_time_wait_expire < now) + tcp->tcp_time_wait_expire = now; + firetime = TICK_TO_NSEC(tcp->tcp_time_wait_expire - now); /* This ensures that we won't wake up too often. */ firetime = MAX(TCP_TIME_WAIT_DELAY, firetime); diff --git a/usr/src/uts/common/inet/tcp_stack.h b/usr/src/uts/common/inet/tcp_stack.h index 2dccf6b78c..e46ebe08da 100644 --- a/usr/src/uts/common/inet/tcp_stack.h +++ b/usr/src/uts/common/inet/tcp_stack.h @@ -101,7 +101,8 @@ struct tcp_stack { /* Used to synchronize access when reclaiming memory */ mblk_t *tcps_ixa_cleanup_mp; kmutex_t tcps_ixa_cleanup_lock; - kcondvar_t tcps_ixa_cleanup_cv; + kcondvar_t tcps_ixa_cleanup_ready_cv; + kcondvar_t tcps_ixa_cleanup_done_cv; /* Variables for handling kmem reclaim call back. */ kmutex_t tcps_reclaim_lock; diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c index 00545d2c03..a39110255a 100644 --- a/usr/src/uts/common/io/aggr/aggr_port.c +++ b/usr/src/uts/common/io/aggr/aggr_port.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. 
*/ /* @@ -528,8 +529,13 @@ aggr_port_promisc(aggr_port_t *port, boolean_t on) if (on) { mac_rx_clear(port->lp_mch); + /* We use the promisc callback because without hardware + * rings, we deliver through flows that will cause duplicate + * delivery of packets when we've flipped into this mode + * to compensate for the lack of hardware MAC matching + */ rc = mac_promisc_add(port->lp_mch, MAC_CLIENT_PROMISC_ALL, - aggr_recv_cb, port, &port->lp_mphp, + aggr_recv_promisc_cb, port, &port->lp_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP); if (rc != 0) { mac_rx_set(port->lp_mch, aggr_recv_cb, port); diff --git a/usr/src/uts/common/io/aggr/aggr_recv.c b/usr/src/uts/common/io/aggr/aggr_recv.c index 2bdb7872e3..0dfe234b70 100644 --- a/usr/src/uts/common/io/aggr/aggr_recv.c +++ b/usr/src/uts/common/io/aggr/aggr_recv.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. */ /* @@ -68,16 +69,27 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp) /* * Callback function invoked by MAC service module when packets are - * made available by a MAC port. + * made available by a MAC port, both in promisc_on mode and not. */ /* ARGSUSED */ -void -aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t loopback) +static void +aggr_recv_path_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback, boolean_t promisc_path) { aggr_port_t *port = (aggr_port_t *)arg; aggr_grp_t *grp = port->lp_grp; + /* In the case where lp_promisc_on has been turned on to + * compensate for insufficient hardware MAC matching and + * hardware rings are not in use we will fall back to + * using flows for delivery which can result in duplicates + * pushed up the stack. Only respect the chosen path. + */ + if (port->lp_promisc_on != promisc_path) { + freemsgchain(mp); + return; + } + if (grp->lg_lacp_mode == AGGR_LACP_OFF) { aggr_mac_rx(grp->lg_mh, mrh, mp); } else { @@ -161,3 +173,19 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, } } } + +/* ARGSUSED */ +void +aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) +{ + aggr_recv_path_cb(arg, mrh, mp, loopback, B_FALSE); +} + +/* ARGSUSED */ +void +aggr_recv_promisc_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) +{ + aggr_recv_path_cb(arg, mrh, mp, loopback, B_TRUE); +} diff --git a/usr/src/uts/common/io/bge/bge_chip2.c b/usr/src/uts/common/io/bge/bge_chip2.c index f687ce4892..a459f867f3 100644 --- a/usr/src/uts/common/io/bge/bge_chip2.c +++ b/usr/src/uts/common/io/bge/bge_chip2.c @@ -24,7 +24,7 @@ */ /* - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2011, 2012 Nexenta Systems, Inc. All rights reserved. 
*/ #include "bge_impl.h" @@ -363,7 +363,34 @@ bge_chip_cfg_init(bge_t *bgep, chip_id_t *cidp, boolean_t enable_dma) if (DEVICE_5717_SERIES_CHIPSETS(bgep)) pci_config_put32(handle, PCI_CONF_BGE_MHCR, 0); mhcr = pci_config_get32(handle, PCI_CONF_BGE_MHCR); - cidp->asic_rev = mhcr & MHCR_CHIP_REV_MASK; + cidp->asic_rev = (mhcr & MHCR_CHIP_REV_MASK) >> MHCR_CHIP_REV_SHIFT; + if (MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_PRODID) { + uint32_t reg; + switch (cidp->device) { + case DEVICE_ID_5717: + case DEVICE_ID_5718: + case DEVICE_ID_5719: + case DEVICE_ID_5720: + reg = PCI_CONF_GEN2_PRODID_ASICREV; + break; + case DEVICE_ID_57781: + case DEVICE_ID_57785: + case DEVICE_ID_57761: + case DEVICE_ID_57765: + case DEVICE_ID_57791: + case DEVICE_ID_57795: + case DEVICE_ID_57762: + case DEVICE_ID_57766: + case DEVICE_ID_57782: + case DEVICE_ID_57786: + reg = PCI_CONF_GEN15_PRODID_ASICREV; + break; + default: + reg = PCI_CONF_PRODID_ASICREV; + break; + } + cidp->asic_rev = pci_config_get32(handle, reg); + } cidp->businfo = pci_config_get32(handle, PCI_CONF_BGE_PCISTATE); cidp->command = pci_config_get16(handle, PCI_CONF_COMM); @@ -386,6 +413,45 @@ bge_chip_cfg_init(bge_t *bgep, chip_id_t *cidp, boolean_t enable_dma) BGE_DEBUG(("bge_chip_cfg_init: clsize %d latency %d command 0x%x", cidp->clsize, cidp->latency, cidp->command)); + cidp->chip_type = 0; + if (MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5717 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5719 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5720) + cidp->chip_type |= CHIP_TYPE_5717_PLUS; + + if (MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_57765 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_57766) + cidp->chip_type |= CHIP_TYPE_57765_CLASS; + + if (cidp->chip_type & CHIP_TYPE_57765_CLASS || + cidp->chip_type & CHIP_TYPE_5717_PLUS) + cidp->chip_type |= CHIP_TYPE_57765_PLUS; + + /* Intentionally exclude ASIC_REV_5906 */ + if (MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5755 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5787 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5784 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5761 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5785 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_57780 || + cidp->chip_type & CHIP_TYPE_57765_PLUS) + cidp->chip_type |= CHIP_TYPE_5755_PLUS; + + if (MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5780 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5714) + cidp->chip_type |= CHIP_TYPE_5780_CLASS; + + if (MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5750 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5752 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5906 || + cidp->chip_type & CHIP_TYPE_5755_PLUS || + cidp->chip_type & CHIP_TYPE_5780_CLASS) + cidp->chip_type |= CHIP_TYPE_5750_PLUS; + + if (MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5705 || + cidp->chip_type & CHIP_TYPE_5750_PLUS) + cidp->chip_type |= CHIP_TYPE_5705_PLUS; + /* * Step 2 (also step 6): disable and clear interrupts. * Steps 11-13: configure PIO endianness options, and enable @@ -445,8 +511,9 @@ bge_chip_cfg_init(bge_t *bgep, chip_id_t *cidp, boolean_t enable_dma) * see whether the host is truly up to date, and regenerate * its interrupt if not. 
*/ - mhcr = MHCR_ENABLE_INDIRECT_ACCESS | + mhcr = MHCR_ENABLE_INDIRECT_ACCESS | MHCR_ENABLE_TAGGED_STATUS_MODE | + MHCR_ENABLE_PCI_STATE_WRITE | MHCR_MASK_INTERRUPT_MODE | MHCR_CLEAR_INTERRUPT_INTA; @@ -1896,10 +1963,16 @@ bge_nvmem_id(bge_t *bgep) case DEVICE_ID_5705_2: case DEVICE_ID_5717: case DEVICE_ID_5718: + case DEVICE_ID_5719: + case DEVICE_ID_5720: case DEVICE_ID_5724: + case DEVICE_ID_57760: case DEVICE_ID_57780: + case DEVICE_ID_57788: + case DEVICE_ID_57790: case DEVICE_ID_5780: case DEVICE_ID_5782: + case DEVICE_ID_5784M: case DEVICE_ID_5785: case DEVICE_ID_5787: case DEVICE_ID_5787M: @@ -1918,6 +1991,8 @@ bge_nvmem_id(bge_t *bgep) case DEVICE_ID_5723: case DEVICE_ID_5761: case DEVICE_ID_5761E: + case DEVICE_ID_5761S: + case DEVICE_ID_5761SE: case DEVICE_ID_5764: case DEVICE_ID_5714C: case DEVICE_ID_5714S: @@ -2023,14 +2098,35 @@ bge_chip_id_init(bge_t *bgep) cidp->msi_enabled = B_FALSE; + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) > + MHCR_CHIP_ASIC_REV_PRODID || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5906 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5700 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5701 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5750) + /* + * Just a plain reset; the "check" code breaks these chips + */ + cidp->flags |= CHIP_FLAG_NO_CHECK_RESET; + switch (cidp->device) { case DEVICE_ID_5717: case DEVICE_ID_5718: + case DEVICE_ID_5719: + case DEVICE_ID_5720: case DEVICE_ID_5724: if (cidp->device == DEVICE_ID_5717) cidp->chip_label = 5717; else if (cidp->device == DEVICE_ID_5718) cidp->chip_label = 5718; + else if (cidp->device == DEVICE_ID_5719) + cidp->chip_label = 5719; + else if (cidp->device == DEVICE_ID_5720) + cidp->chip_label = 5720; else cidp->chip_label = 5724; cidp->msi_enabled = bge_enable_msi; @@ -2044,7 +2140,7 @@ bge_chip_id_init(bge_t *bgep) cidp->mbuf_hi_water = MBUF_HIWAT_5717; cidp->mbuf_base = bge_mbuf_pool_base_5705; cidp->mbuf_length = bge_mbuf_pool_len_5705; - cidp->recv_slots = BGE_RECV_SLOTS_5705; + cidp->recv_slots = BGE_RECV_SLOTS_5717; cidp->bge_mlcr_default = MLCR_DEFAULT_5717; cidp->rx_rings = BGE_RECV_RINGS_MAX_5705; cidp->tx_rings = BGE_SEND_RINGS_MAX_5705; @@ -2220,7 +2316,13 @@ bge_chip_id_init(bge_t *bgep) case DEVICE_ID_5723: case DEVICE_ID_5761: case DEVICE_ID_5761E: + case DEVICE_ID_5761S: + case DEVICE_ID_5761SE: + case DEVICE_ID_5784M: + case DEVICE_ID_57760: case DEVICE_ID_57780: + case DEVICE_ID_57788: + case DEVICE_ID_57790: cidp->msi_enabled = bge_enable_msi; /* * We don't use MSI for BCM5764 and BCM5785, as the @@ -2234,10 +2336,18 @@ bge_chip_id_init(bge_t *bgep) cidp->chip_label = 5723; else if (cidp->device == DEVICE_ID_5764) cidp->chip_label = 5764; + else if (cidp->device == DEVICE_ID_5784M) + cidp->chip_label = 5784; else if (cidp->device == DEVICE_ID_5785) cidp->chip_label = 5785; + else if (cidp->device == DEVICE_ID_57760) + cidp->chip_label = 57760; else if (cidp->device == DEVICE_ID_57780) cidp->chip_label = 57780; + else if (cidp->device == DEVICE_ID_57788) + cidp->chip_label = 57788; + else if (cidp->device == DEVICE_ID_57790) + cidp->chip_label = 57790; else cidp->chip_label = 5761; cidp->bge_dma_rwctrl = bge_dma_rwctrl_5721; @@ -3401,18 +3511,27 @@ bge_chip_reset(bge_t *bgep, boolean_t enable_dma) mhcr = MHCR_ENABLE_INDIRECT_ACCESS | MHCR_ENABLE_TAGGED_STATUS_MODE | MHCR_MASK_INTERRUPT_MODE | - MHCR_MASK_PCI_INT_OUTPUT | MHCR_CLEAR_INTERRUPT_INTA | MHCR_ENABLE_ENDIAN_WORD_SWAP | 
MHCR_ENABLE_ENDIAN_BYTE_SWAP; + + if (bgep->intr_type == DDI_INTR_TYPE_FIXED) + mhcr |= MHCR_MASK_PCI_INT_OUTPUT; + if (DEVICE_5717_SERIES_CHIPSETS(bgep)) pci_config_put32(bgep->cfg_handle, PCI_CONF_BGE_MHCR, 0); +#else + mhcr = MHCR_ENABLE_INDIRECT_ACCESS | + MHCR_ENABLE_TAGGED_STATUS_MODE | + MHCR_MASK_INTERRUPT_MODE | + MHCR_MASK_PCI_INT_OUTPUT | + MHCR_CLEAR_INTERRUPT_INTA; +#endif pci_config_put32(bgep->cfg_handle, PCI_CONF_BGE_MHCR, mhcr); bge_reg_put32(bgep, MEMORY_ARBITER_MODE_REG, bge_reg_get32(bgep, MEMORY_ARBITER_MODE_REG) | MEMORY_ARBITER_ENABLE); -#endif if (asf_mode == ASF_MODE_INIT) { bge_asf_pre_reset_operations(bgep, BGE_INIT_RESET); } else if (asf_mode == ASF_MODE_SHUTDOWN) { @@ -3436,9 +3555,13 @@ bge_chip_reset(bge_t *bgep, boolean_t enable_dma) mhcr = MHCR_ENABLE_INDIRECT_ACCESS | MHCR_ENABLE_TAGGED_STATUS_MODE | + MHCR_ENABLE_PCI_STATE_WRITE | MHCR_MASK_INTERRUPT_MODE | - MHCR_MASK_PCI_INT_OUTPUT | MHCR_CLEAR_INTERRUPT_INTA; + + if (bgep->intr_type == DDI_INTR_TYPE_FIXED) + mhcr |= MHCR_MASK_PCI_INT_OUTPUT; + #ifdef _BIG_ENDIAN mhcr |= MHCR_ENABLE_ENDIAN_WORD_SWAP | MHCR_ENABLE_ENDIAN_BYTE_SWAP; #endif /* _BIG_ENDIAN */ @@ -3449,6 +3572,12 @@ bge_chip_reset(bge_t *bgep, boolean_t enable_dma) if (bgep->asf_enabled) bgep->asf_wordswapped = B_FALSE; #endif + + if (DEVICE_IS_5755_PLUS(bgep) || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5752) + bge_reg_put32(bgep, GRC_FASTBOOT_PC, 0); + /* * NVRAM Corruption Workaround */ @@ -3508,6 +3637,11 @@ bge_chip_reset(bge_t *bgep, boolean_t enable_dma) #else modeflags = MODE_WORD_SWAP_FRAME | MODE_BYTE_SWAP_FRAME; #endif /* _BIG_ENDIAN */ + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5720) + modeflags |= + MODE_BYTE_SWAP_B2HRX_DATA | MODE_WORD_SWAP_B2HRX_DATA | + MODE_B2HRX_ENABLE | MODE_HTX2B_ENABLE; #ifdef BGE_IPMI_ASF if (bgep->asf_enabled) modeflags |= MODE_HOST_STACK_UP; @@ -3592,6 +3726,13 @@ bge_chip_reset(bge_t *bgep, boolean_t enable_dma) */ bge_reg_put32(bgep, ETHERNET_MAC_MODE_REG, 0); + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5720) { + uint32_t regval = bge_reg_get32(bgep, CPMU_CLCK_ORIDE_REG); + bge_reg_put32(bgep, CPMU_CLCK_ORIDE_REG, + regval & ~CPMU_CLCK_ORIDE_MAC_ORIDE_EN); + } + /* * Step 21: restore cache-line-size, latency timer, and * subsystem ID registers to their original values (not @@ -3818,8 +3959,17 @@ bge_chip_start(bge_t *bgep, boolean_t reset_phys) /* * Steps 34-36: enable buffer manager & internal h/w queues */ - if (!bge_chip_enable_engine(bgep, BUFFER_MANAGER_MODE_REG, - STATE_MACHINE_ATTN_ENABLE_BIT)) + + regval = STATE_MACHINE_ATTN_ENABLE_BIT; + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5719) + regval |= BUFF_MGR_NO_TX_UNDERRUN; + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5717 || + bgep->chipid.asic_rev == MHCR_CHIP_REV_5719_A0 || + bgep->chipid.asic_rev == MHCR_CHIP_REV_5720_A0) + regval |= BUFF_MGR_MBUF_LOW_ATTN_ENABLE; + if (!bge_chip_enable_engine(bgep, BUFFER_MANAGER_MODE_REG, regval)) retval = DDI_FAILURE; if (!bge_chip_enable_engine(bgep, FTQ_RESET_REG, 0)) retval = DDI_FAILURE; @@ -3913,7 +4063,13 @@ bge_chip_start(bge_t *bgep, boolean_t reset_phys) /* * Step 50: configure the IPG et al */ - bge_reg_put32(bgep, MAC_TX_LENGTHS_REG, MAC_TX_LENGTHS_DEFAULT); + regval = MAC_TX_LENGTHS_DEFAULT; + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) + == MHCR_CHIP_ASIC_REV_5720) + regval |= bge_reg_get32(bgep, MAC_TX_LENGTHS_REG) & + (MAC_TX_LENGTHS_JMB_FRM_LEN_MSK | 
+ MAC_TX_LENGTHS_CNT_DWN_VAL_MSK); + bge_reg_put32(bgep, MAC_TX_LENGTHS_REG, regval); /* * Step 51: configure the default Rx Return Ring @@ -4068,22 +4224,45 @@ bge_chip_start(bge_t *bgep, boolean_t reset_phys) retval = DDI_FAILURE; dma_wrprio = (bge_dma_wrprio << DMA_PRIORITY_SHIFT) | ALL_DMA_ATTN_BITS; - if ((MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == - MHCR_CHIP_ASIC_REV_5755) || - (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == - MHCR_CHIP_ASIC_REV_5723) || - (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == - MHCR_CHIP_ASIC_REV_5906)) { + if (DEVICE_IS_5755_PLUS(bgep)) dma_wrprio |= DMA_STATUS_TAG_FIX_CQ12384; - } if (!bge_chip_enable_engine(bgep, WRITE_DMA_MODE_REG, dma_wrprio)) retval = DDI_FAILURE; + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5761 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5784 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5785 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_57780 || + DEVICE_IS_57765_PLUS(bgep)) { + regval = bge_reg_get32(bgep, READ_DMA_RESERVED_CONTROL_REG); + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5719 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5720) { + regval &= ~(RDMA_RSRVCTRL_TXMRGN_MASK | + RDMA_RSRVCTRL_FIFO_LWM_MASK | + RDMA_RSRVCTRL_FIFO_HWM_MASK); + regval |= RDMA_RSRVCTRL_TXMRGN_320B | + RDMA_RSRVCTRL_FIFO_LWM_1_5K | + RDMA_RSRVCTRL_FIFO_HWM_1_5K; + } + bge_reg_put32(bgep, READ_DMA_RESERVED_CONTROL_REG, + regval | RDMA_RSRVCTRL_FIFO_OFLW_FIX); + } if (DEVICE_5723_SERIES_CHIPSETS(bgep) || DEVICE_5717_SERIES_CHIPSETS(bgep)) bge_dma_rdprio = 0; + regval = bge_dma_rdprio << DMA_PRIORITY_SHIFT; + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5720) + regval |= bge_reg_get32(bgep, READ_DMA_MODE_REG) & + DMA_H2BNC_VLAN_DET; if (!bge_chip_enable_engine(bgep, READ_DMA_MODE_REG, - (bge_dma_rdprio << DMA_PRIORITY_SHIFT) | ALL_DMA_ATTN_BITS)) + regval | ALL_DMA_ATTN_BITS)) retval = DDI_FAILURE; if (!bge_chip_enable_engine(bgep, RCV_DATA_COMPLETION_MODE_REG, STATE_MACHINE_ATTN_ENABLE_BIT)) @@ -4116,7 +4295,23 @@ bge_chip_start(bge_t *bgep, boolean_t reset_phys) * Step 88: download firmware -- doesn't apply * Steps 89-90: enable Transmit & Receive MAC Engines */ - if (!bge_chip_enable_engine(bgep, TRANSMIT_MAC_MODE_REG, 0)) + if (DEVICE_IS_5755_PLUS(bgep) || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5906) { + regval = bge_reg_get32(bgep, TRANSMIT_MAC_MODE_REG); + regval |= TRANSMIT_MODE_MBUF_LOCKUP_FIX; + } else { + regval = 0; + } + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5720) { + regval &= ~(TRANSMIT_MODE_HTX2B_JMB_FRM_LEN | + TRANSMIT_MODE_HTX2B_CNT_DN_MODE); + regval |= bge_reg_get32(bgep, TRANSMIT_MAC_MODE_REG) & + (TRANSMIT_MODE_HTX2B_JMB_FRM_LEN | + TRANSMIT_MODE_HTX2B_CNT_DN_MODE); + } + if (!bge_chip_enable_engine(bgep, TRANSMIT_MAC_MODE_REG, regval)) retval = DDI_FAILURE; #ifdef BGE_IPMI_ASF if (!bgep->asf_enabled) { @@ -4219,7 +4414,6 @@ bge_chip_start(bge_t *bgep, boolean_t reset_phys) if (bgep->intr_type == DDI_INTR_TYPE_FIXED) bge_cfg_clr32(bgep, PCI_CONF_BGE_MHCR, bgep->chipid.mask_pci_int); - /* * All done! */ diff --git a/usr/src/uts/common/io/bge/bge_hw.h b/usr/src/uts/common/io/bge/bge_hw.h index f8e6c4d09a..cfcae929dd 100644 --- a/usr/src/uts/common/io/bge/bge_hw.h +++ b/usr/src/uts/common/io/bge/bge_hw.h @@ -23,6 +23,10 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. 
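/*
 * A minimal illustrative sketch (not part of the patch): on the BCM5720
 * several registers programmed during bge_chip_start() -- MAC_TX_LENGTHS_REG,
 * READ_DMA_MODE_REG and TRANSMIT_MAC_MODE_REG -- carry fields owned by the
 * firmware/hardware, so the change folds the current value of those fields
 * back into whatever the driver writes.  The helper below is a hypothetical
 * distillation of that read-modify-write pattern.
 */
static void
bge_put32_preserve_sketch(bge_t *bgep, bge_regno_t regno, uint32_t value,
    uint32_t preserve_mask)
{
	uint32_t old = bge_reg_get32(bgep, regno);

	/* keep the hardware-owned bits, overwrite everything else */
	bge_reg_put32(bgep, regno,
	    (old & preserve_mask) | (value & ~preserve_mask));
}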
All rights reserved. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ + #ifndef _BGE_HW_H #define _BGE_HW_H @@ -68,9 +72,12 @@ extern "C" { #define DEVICE_ID_5724 0x165c #define DEVICE_ID_5705M 0x165d #define DEVICE_ID_5705MA3 0x165e +#define DEVICE_ID_5719 0x1657 +#define DEVICE_ID_5720 0x165f #define DEVICE_ID_5705F 0x166e #define DEVICE_ID_5780 0x166a #define DEVICE_ID_5782 0x1696 +#define DEVICE_ID_5784M 0x1698 #define DEVICE_ID_5785 0x1699 #define DEVICE_ID_5787 0x169b #define DEVICE_ID_5787M 0x1693 @@ -92,12 +99,27 @@ extern "C" { #define DEVICE_ID_5714S 0x1669 #define DEVICE_ID_5715C 0x1678 #define DEVICE_ID_5715S 0x1679 -#define DEVICE_ID_5761E 0x1680 #define DEVICE_ID_5761 0x1681 +#define DEVICE_ID_5761E 0x1680 +#define DEVICE_ID_5761S 0x1688 +#define DEVICE_ID_5761SE 0x1689 #define DEVICE_ID_5764 0x1684 #define DEVICE_ID_5906 0x1712 #define DEVICE_ID_5906M 0x1713 +#define DEVICE_ID_57760 0x1690 #define DEVICE_ID_57780 0x1692 +#define DEVICE_ID_57788 0x1691 +#define DEVICE_ID_57790 0x1694 +#define DEVICE_ID_57781 0x16b1 +#define DEVICE_ID_57785 0x16b5 +#define DEVICE_ID_57761 0x16b0 +#define DEVICE_ID_57765 0x16b4 +#define DEVICE_ID_57791 0x16b2 +#define DEVICE_ID_57795 0x16b6 +#define DEVICE_ID_57762 0x1682 +#define DEVICE_ID_57766 0x1686 +#define DEVICE_ID_57786 0x16b3 +#define DEVICE_ID_57782 0x16b7 #define REVISION_ID_5700_B0 0x10 #define REVISION_ID_5700_B2 0x12 @@ -189,15 +211,23 @@ extern "C" { #define DEVICE_5717_SERIES_CHIPSETS(bgep) \ (bgep->chipid.device == DEVICE_ID_5717) ||\ (bgep->chipid.device == DEVICE_ID_5718) ||\ + (bgep->chipid.device == DEVICE_ID_5719) ||\ + (bgep->chipid.device == DEVICE_ID_5720) ||\ (bgep->chipid.device == DEVICE_ID_5724) #define DEVICE_5723_SERIES_CHIPSETS(bgep) \ ((bgep->chipid.device == DEVICE_ID_5723) ||\ (bgep->chipid.device == DEVICE_ID_5761) ||\ (bgep->chipid.device == DEVICE_ID_5761E) ||\ + (bgep->chipid.device == DEVICE_ID_5761S) ||\ + (bgep->chipid.device == DEVICE_ID_5761SE) ||\ (bgep->chipid.device == DEVICE_ID_5764) ||\ + (bgep->chipid.device == DEVICE_ID_5784M) ||\ (bgep->chipid.device == DEVICE_ID_5785) ||\ - (bgep->chipid.device == DEVICE_ID_57780)) + (bgep->chipid.device == DEVICE_ID_57760) ||\ + (bgep->chipid.device == DEVICE_ID_57780) ||\ + (bgep->chipid.device == DEVICE_ID_57788) ||\ + (bgep->chipid.device == DEVICE_ID_57790)) #define DEVICE_5714_SERIES_CHIPSETS(bgep) \ ((bgep->chipid.device == DEVICE_ID_5714C) ||\ @@ -209,6 +239,20 @@ extern "C" { ((bgep->chipid.device == DEVICE_ID_5906) ||\ (bgep->chipid.device == DEVICE_ID_5906M)) + +#define CHIP_TYPE_5705_PLUS (1 << 0) +#define CHIP_TYPE_5750_PLUS (1 << 1) +#define CHIP_TYPE_5780_CLASS (1 << 2) +#define CHIP_TYPE_5755_PLUS (1 << 3) +#define CHIP_TYPE_57765_CLASS (1 << 4) +#define CHIP_TYPE_57765_PLUS (1 << 5) +#define CHIP_TYPE_5717_PLUS (1 << 6) + +#define DEVICE_IS_57765_PLUS(bgep) \ + (bgep->chipid.chip_type & CHIP_TYPE_57765_PLUS) +#define DEVICE_IS_5755_PLUS(bgep) \ + (bgep->chipid.chip_type & CHIP_TYPE_5755_PLUS) + /* * Second section: * Offsets of important registers & definitions for bits therein @@ -225,6 +269,7 @@ extern "C" { */ #define PCI_CONF_BGE_MHCR 0x68 #define MHCR_CHIP_REV_MASK 0xffff0000 +#define MHCR_CHIP_REV_SHIFT 16 #define MHCR_ENABLE_TAGGED_STATUS_MODE 0x00000200 #define MHCR_MASK_INTERRUPT_MODE 0x00000100 #define MHCR_ENABLE_INDIRECT_ACCESS 0x00000080 @@ -236,95 +281,38 @@ extern "C" { #define MHCR_MASK_PCI_INT_OUTPUT 0x00000002 #define MHCR_CLEAR_INTERRUPT_INTA 0x00000001 -#define MHCR_CHIP_REV_5700_B0 0x71000000 
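/*
 * A minimal illustrative sketch (not part of the patch): the new CHIP_TYPE_*
 * bits form a cumulative family hierarchy, so one mask test covers every
 * newer generation:
 *
 *   5717_PLUS ---+
 *   57765_CLASS -+-> 57765_PLUS -> 5755_PLUS -+
 *   5780_CLASS --------------------------------+-> 5750_PLUS -> 5705_PLUS
 *
 * Per-device checks then reduce to a single macro, for example:
 */
static boolean_t
bge_needs_dma_tag_fix_sketch(bge_t *bgep)
{
	/* every 5755-or-newer device gets the status-tag DMA fix */
	return (DEVICE_IS_5755_PLUS(bgep) ? B_TRUE : B_FALSE);
}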
-#define MHCR_CHIP_REV_5700_B2 0x71020000 -#define MHCR_CHIP_REV_5700_B3 0x71030000 -#define MHCR_CHIP_REV_5700_C0 0x72000000 -#define MHCR_CHIP_REV_5700_C1 0x72010000 -#define MHCR_CHIP_REV_5700_C2 0x72020000 - -#define MHCR_CHIP_REV_5701_A0 0x00000000 -#define MHCR_CHIP_REV_5701_A2 0x00020000 -#define MHCR_CHIP_REV_5701_A3 0x00030000 -#define MHCR_CHIP_REV_5701_A5 0x01050000 - -#define MHCR_CHIP_REV_5702_A0 0x10000000 -#define MHCR_CHIP_REV_5702_A1 0x10010000 -#define MHCR_CHIP_REV_5702_A2 0x10020000 - -#define MHCR_CHIP_REV_5703_A0 0x10000000 -#define MHCR_CHIP_REV_5703_A1 0x10010000 -#define MHCR_CHIP_REV_5703_A2 0x10020000 -#define MHCR_CHIP_REV_5703_B0 0x11000000 -#define MHCR_CHIP_REV_5703_B1 0x11010000 - -#define MHCR_CHIP_REV_5704_A0 0x20000000 -#define MHCR_CHIP_REV_5704_A1 0x20010000 -#define MHCR_CHIP_REV_5704_A2 0x20020000 -#define MHCR_CHIP_REV_5704_A3 0x20030000 -#define MHCR_CHIP_REV_5704_B0 0x21000000 - -#define MHCR_CHIP_REV_5705_A0 0x30000000 -#define MHCR_CHIP_REV_5705_A1 0x30010000 -#define MHCR_CHIP_REV_5705_A2 0x30020000 -#define MHCR_CHIP_REV_5705_A3 0x30030000 -#define MHCR_CHIP_REV_5705_A5 0x30050000 - -#define MHCR_CHIP_REV_5782_A0 0x30030000 -#define MHCR_CHIP_REV_5782_A1 0x30030088 - -#define MHCR_CHIP_REV_5788_A1 0x30050000 - -#define MHCR_CHIP_REV_5751_A0 0x40000000 -#define MHCR_CHIP_REV_5751_A1 0x40010000 - -#define MHCR_CHIP_REV_5721_A0 0x41000000 -#define MHCR_CHIP_REV_5721_A1 0x41010000 - -#define MHCR_CHIP_REV_5714_A0 0x50000000 -#define MHCR_CHIP_REV_5714_A1 0x90010000 - -#define MHCR_CHIP_REV_5715_A0 0x50000000 -#define MHCR_CHIP_REV_5715_A1 0x90010000 - -#define MHCR_CHIP_REV_5715S_A0 0x50000000 -#define MHCR_CHIP_REV_5715S_A1 0x90010000 - -#define MHCR_CHIP_REV_5754_A0 0xb0000000 -#define MHCR_CHIP_REV_5754_A1 0xb0010000 - -#define MHCR_CHIP_REV_5787_A0 0xb0000000 -#define MHCR_CHIP_REV_5787_A1 0xb0010000 -#define MHCR_CHIP_REV_5787_A2 0xb0020000 - -#define MHCR_CHIP_REV_5755_A0 0xa0000000 -#define MHCR_CHIP_REV_5755_A1 0xa0010000 - -#define MHCR_CHIP_REV_5906_A0 0xc0000000 -#define MHCR_CHIP_REV_5906_A1 0xc0010000 -#define MHCR_CHIP_REV_5906_A2 0xc0020000 - -#define MHCR_CHIP_REV_5723_A0 0xf0000000 -#define MHCR_CHIP_REV_5723_A1 0xf0010000 -#define MHCR_CHIP_REV_5723_A2 0xf0020000 -#define MHCR_CHIP_REV_5723_B0 0xf1000000 - -#define MHCR_CHIP_ASIC_REV(ChipRevId) ((ChipRevId) & 0xf0000000) -#define MHCR_CHIP_ASIC_REV_5700 (0x7 << 28) -#define MHCR_CHIP_ASIC_REV_5701 (0x0 << 28) -#define MHCR_CHIP_ASIC_REV_5703 (0x1 << 28) -#define MHCR_CHIP_ASIC_REV_5704 (0x2 << 28) -#define MHCR_CHIP_ASIC_REV_5705 (0x3 << 28) -#define MHCR_CHIP_ASIC_REV_5721_5751 (0x4 << 28) -#define MHCR_CHIP_ASIC_REV_5714 (0x5 << 28) -#define MHCR_CHIP_ASIC_REV_5752 (0x6 << 28) -#define MHCR_CHIP_ASIC_REV_5754 (0xb << 28) -#define MHCR_CHIP_ASIC_REV_5787 ((uint32_t)0xb << 28) -#define MHCR_CHIP_ASIC_REV_5755 ((uint32_t)0xa << 28) -#define MHCR_CHIP_ASIC_REV_5715 ((uint32_t)0x9 << 28) -#define MHCR_CHIP_ASIC_REV_5906 ((uint32_t)0xc << 28) -#define MHCR_CHIP_ASIC_REV_5723 ((uint32_t)0xf << 28) - +#define MHCR_CHIP_REV_5703_A0 0x1000 +#define MHCR_CHIP_REV_5704_A0 0x2000 +#define MHCR_CHIP_REV_5751_A0 0x4000 +#define MHCR_CHIP_REV_5721_A0 0x4100 +#define MHCR_CHIP_REV_5755_A0 0xa000 +#define MHCR_CHIP_REV_5755_A1 0xa001 +#define MHCR_CHIP_REV_5719_A0 0x05719000 +#define MHCR_CHIP_REV_5720_A0 0x05720000 + +#define MHCR_CHIP_ASIC_REV(ChipRevId) ((ChipRevId) >> 12) +#define MHCR_CHIP_ASIC_REV_5700 0x07 +#define MHCR_CHIP_ASIC_REV_5701 0x00 +#define MHCR_CHIP_ASIC_REV_5703 0x01 +#define 
MHCR_CHIP_ASIC_REV_5704 0x02 +#define MHCR_CHIP_ASIC_REV_5705 0x03 +#define MHCR_CHIP_ASIC_REV_5750 0x04 +#define MHCR_CHIP_ASIC_REV_5752 0x06 +#define MHCR_CHIP_ASIC_REV_5780 0x08 +#define MHCR_CHIP_ASIC_REV_5714 0x09 +#define MHCR_CHIP_ASIC_REV_5755 0x0a +#define MHCR_CHIP_ASIC_REV_5787 0x0b +#define MHCR_CHIP_ASIC_REV_5906 0x0c +#define MHCR_CHIP_ASIC_REV_PRODID 0x0f +#define MHCR_CHIP_ASIC_REV_5784 0x5784 +#define MHCR_CHIP_ASIC_REV_5761 0x5761 +#define MHCR_CHIP_ASIC_REV_5785 0x5785 +#define MHCR_CHIP_ASIC_REV_5717 0x5717 +#define MHCR_CHIP_ASIC_REV_5719 0x5719 +#define MHCR_CHIP_ASIC_REV_5720 0x5720 +#define MHCR_CHIP_ASIC_REV_57780 0x57780 +#define MHCR_CHIP_ASIC_REV_57765 0x57785 +#define MHCR_CHIP_ASIC_REV_57766 0x57766 /* * PCI DMA read/write Control Register, in PCI config space @@ -466,6 +454,10 @@ extern "C" { #define PCI_CONF_DEV_STUS_5723 0xd6 #define DEVICE_ERROR_STUS 0xf +#define PCI_CONF_PRODID_ASICREV 0x000000bc +#define PCI_CONF_GEN2_PRODID_ASICREV 0x000000f4 +#define PCI_CONF_GEN15_PRODID_ASICREV 0x000000fc + #define NIC_MEM_WINDOW_OFFSET 0x00008000 /* 32k */ /* @@ -541,6 +533,7 @@ extern "C" { #define MEMORY_ARBITER_MODE_REG 0x4000 #define BUFFER_MANAGER_MODE_REG 0x4400 #define READ_DMA_MODE_REG 0x4800 +#define READ_DMA_RESERVED_CONTROL_REG 0x4900 #define WRITE_DMA_MODE_REG 0x4c00 #define DMA_COMPLETION_MODE_REG 0x6400 @@ -552,6 +545,9 @@ extern "C" { * Transmit MAC Mode Register * (TRANSMIT_MAC_MODE_REG, 0x045c) */ +#define TRANSMIT_MODE_HTX2B_CNT_DN_MODE 0x00800000 +#define TRANSMIT_MODE_HTX2B_JMB_FRM_LEN 0x00400000 +#define TRANSMIT_MODE_MBUF_LOCKUP_FIX 0x00000100 #define TRANSMIT_MODE_LONG_PAUSE 0x00000040 #define TRANSMIT_MODE_BIG_BACKOFF 0x00000020 #define TRANSMIT_MODE_FLOW_CONTROL 0x00000010 @@ -619,12 +615,14 @@ extern "C" { */ #define BUFF_MGR_TEST_MODE 0x00000008 #define BUFF_MGR_MBUF_LOW_ATTN_ENABLE 0x00000010 +#define BUFF_MGR_NO_TX_UNDERRUN 0x80000000 #define BUFF_MGR_ALL_ATTN_BITS 0x00000014 /* * Read and Write DMA Mode Registers (READ_DMA_MODE_REG, - * 0x4800 and WRITE_DMA_MODE_REG, 0x4c00) + * 0x4800, READ_DMA_RESERVED_CONTROL_REG, 0x4900, + * WRITE_DMA_MODE_REG, 0x4c00) * * These registers each contain a 2-bit priority field, which controls * the relative priority of that type of DMA (read vs. write vs. MSI), @@ -635,6 +633,15 @@ extern "C" { #define DMA_PRIORITY_SHIFT 30 #define ALL_DMA_ATTN_BITS 0x000003fc +#define RDMA_RSRVCTRL_FIFO_OFLW_FIX 0x00000004 +#define RDMA_RSRVCTRL_FIFO_LWM_1_5K 0x00000c00 +#define RDMA_RSRVCTRL_FIFO_LWM_MASK 0x00000ff0 +#define RDMA_RSRVCTRL_FIFO_HWM_1_5K 0x000c0000 +#define RDMA_RSRVCTRL_FIFO_HWM_MASK 0x000ff000 +#define RDMA_RSRVCTRL_TXMRGN_320B 0x28000000 +#define RDMA_RSRVCTRL_TXMRGN_MASK 0xffe00000 + + /* * BCM5755, 5755M, 5906, 5906M only * 1 - Enable Fix. 
Device will send out the status block before @@ -644,6 +651,10 @@ extern "C" { */ #define DMA_STATUS_TAG_FIX_CQ12384 0x20000000 +/* 5720 only */ +#define DMA_H2BNC_VLAN_DET 0x20000000 + + /* * End of state machine control register definitions */ @@ -781,6 +792,8 @@ extern "C" { #define MAC_RX_MTU_DEFAULT 0x000005f2 /* 1522 */ #define MAC_TX_LENGTHS_REG 0x0464 #define MAC_TX_LENGTHS_DEFAULT 0x00002620 +#define MAC_TX_LENGTHS_JMB_FRM_LEN_MSK 0x00ff0000 +#define MAC_TX_LENGTHS_CNT_DWN_VAL_MSK 0xff000000 /* * MII access registers @@ -1069,10 +1082,16 @@ extern "C" { #define JUMBO_RCV_BD_REPLENISH_DEFAULT 0x00000020 /* 32 */ /* - * CPMU registers (5717/5718 only) + * CPMU registers (5717/5718/5719/5720 only) */ -#define CPMU_STATUS_REG 0x362c -#define CPMU_STATUS_FUN_NUM 0x20000000 +#define CPMU_CLCK_ORIDE_REG 0x3624 +#define CPMU_CLCK_ORIDE_MAC_ORIDE_EN 0x80000000 + +#define CPMU_STATUS_REG 0x362c +#define CPMU_STATUS_FUN_NUM_5717 0x20000000 +#define CPMU_STATUS_FUN_NUM_5719 0xc0000000 +#define CPMU_STATUS_FUN_NUM_5719_SHIFT 30 + /* * Host Coalescing Engine Control Registers @@ -1191,6 +1210,8 @@ extern "C" { #define VCPU_EXT_CTL 0x6890 #define VCPU_EXT_CTL_HALF 0x00400000 +#define GRC_FASTBOOT_PC 0x6894 + #define FTQ_RESET_REG 0x5c00 #define MSI_MODE_REG 0x6000 @@ -1210,14 +1231,18 @@ extern "C" { #define MODE_INT_ON_TXRISC_ATTN 0x01000000 #define MODE_RECV_NO_PSEUDO_HDR_CSUM 0x00800000 #define MODE_SEND_NO_PSEUDO_HDR_CSUM 0x00100000 +#define MODE_HTX2B_ENABLE 0x00040000 #define MODE_HOST_SEND_BDS 0x00020000 #define MODE_HOST_STACK_UP 0x00010000 #define MODE_FORCE_32_BIT_PCI 0x00008000 +#define MODE_B2HRX_ENABLE 0x00008000 #define MODE_NO_INT_ON_RECV 0x00004000 #define MODE_NO_INT_ON_SEND 0x00002000 #define MODE_ALLOW_BAD_FRAMES 0x00000800 #define MODE_NO_CRC 0x00000400 #define MODE_NO_FRAME_CRACKING 0x00000200 +#define MODE_WORD_SWAP_B2HRX_DATA 0x00000080 +#define MODE_BYTE_SWAP_B2HRX_DATA 0x00000040 #define MODE_WORD_SWAP_FRAME 0x00000020 #define MODE_BYTE_SWAP_FRAME 0x00000010 #define MODE_WORD_SWAP_NONFRAME 0x00000004 @@ -1246,7 +1271,7 @@ extern "C" { */ #define CORE_CLOCK_MHZ 66 #define MISC_CONFIG_REG 0x6804 -#define MISC_CONFIG_GRC_RESET_DISABLE 0x20000000 +#define MISC_CONFIG_GRC_RESET_DISABLE 0x20000000 #define MISC_CONFIG_GPHY_POWERDOWN_OVERRIDE 0x04000000 #define MISC_CONFIG_POWERDOWN 0x00100000 #define MISC_CONFIG_POWER_STATE 0x00060000 @@ -1567,6 +1592,7 @@ extern "C" { #define BGE_MINI_SLOTS_MAX 1024 #define BGE_RECV_SLOTS_MAX 2048 #define BGE_RECV_SLOTS_5705 512 +#define BGE_RECV_SLOTS_5717 1024 #define BGE_RECV_SLOTS_5782 512 #define BGE_RECV_SLOTS_5721 512 diff --git a/usr/src/uts/common/io/bge/bge_impl.h b/usr/src/uts/common/io/bge/bge_impl.h index 772c989092..0c51c2bc8e 100644 --- a/usr/src/uts/common/io/bge/bge_impl.h +++ b/usr/src/uts/common/io/bge/bge_impl.h @@ -23,6 +23,10 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ + #ifndef _BGE_IMPL_H #define _BGE_IMPL_H @@ -605,6 +609,7 @@ typedef struct { uint8_t latency; /* latency-timer */ uint8_t flags; + uint32_t chip_type; /* see CHIP_TYPE_ in bge_hw.h */ uint16_t chip_label; /* numeric part only */ /* (e.g. 
5703/5794/etc) */ uint32_t mbuf_base; /* Mbuf pool parameters */ @@ -640,10 +645,11 @@ typedef struct { uint32_t mask_pci_int; } chip_id_t; -#define CHIP_FLAG_SUPPORTED 0x80 -#define CHIP_FLAG_SERDES 0x40 -#define CHIP_FLAG_PARTIAL_CSUM 0x20 -#define CHIP_FLAG_NO_JUMBO 0x1 +#define CHIP_FLAG_SUPPORTED 0x80 +#define CHIP_FLAG_SERDES 0x40 +#define CHIP_FLAG_PARTIAL_CSUM 0x20 +#define CHIP_FLAG_NO_CHECK_RESET 0x2 +#define CHIP_FLAG_NO_JUMBO 0x1 /* * Collection of physical-layer functions to: diff --git a/usr/src/uts/common/io/bge/bge_main2.c b/usr/src/uts/common/io/bge/bge_main2.c index f191f313c0..d0f309730d 100644 --- a/usr/src/uts/common/io/bge/bge_main2.c +++ b/usr/src/uts/common/io/bge/bge_main2.c @@ -23,6 +23,10 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ + #include "bge_impl.h" #include <sys/sdt.h> #include <sys/mac_provider.h> @@ -3211,13 +3215,17 @@ bge_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) */ if (DEVICE_5717_SERIES_CHIPSETS(bgep)) pci_config_put32(bgep->cfg_handle, PCI_CONF_BGE_MHCR, 0); +#else + mhcrValue = MHCR_ENABLE_INDIRECT_ACCESS | + MHCR_ENABLE_TAGGED_STATUS_MODE | + MHCR_MASK_INTERRUPT_MODE | + MHCR_MASK_PCI_INT_OUTPUT | + MHCR_CLEAR_INTERRUPT_INTA; +#endif pci_config_put32(bgep->cfg_handle, PCI_CONF_BGE_MHCR, mhcrValue); bge_ind_put32(bgep, MEMORY_ARBITER_MODE_REG, bge_ind_get32(bgep, MEMORY_ARBITER_MODE_REG) | MEMORY_ARBITER_ENABLE); -#else - mhcrValue = pci_config_get32(bgep->cfg_handle, PCI_CONF_BGE_MHCR); -#endif if (mhcrValue & MHCR_ENABLE_ENDIAN_WORD_SWAP) { bgep->asf_wordswapped = B_TRUE; } else { diff --git a/usr/src/uts/common/io/bge/bge_mii.c b/usr/src/uts/common/io/bge/bge_mii.c index f24b6a3f16..b47c043d8c 100644 --- a/usr/src/uts/common/io/bge/bge_mii.c +++ b/usr/src/uts/common/io/bge/bge_mii.c @@ -23,6 +23,10 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ + #include "bge_impl.h" /* @@ -207,6 +211,7 @@ bge_phy_reset(bge_t *bgep) { uint16_t control; uint_t count; + boolean_t ret = B_FALSE; BGE_TRACE(("bge_phy_reset($%p)", (void *)bgep)); @@ -221,22 +226,26 @@ bge_phy_reset(bge_t *bgep) } /* - * Set the PHY RESET bit, then wait up to 5 ms for it to self-clear + * Set the PHY RESET bit, then wait up to 50 ms for it to self-clear */ bge_mii_put16(bgep, MII_CONTROL, MII_CONTROL_RESET); - for (count = 0; ++count < 1000; ) { - drv_usecwait(5); + for (count = 0; ++count < 5000; ) { control = bge_mii_get16(bgep, MII_CONTROL); - if (BIC(control, MII_CONTROL_RESET)) - return (B_TRUE); + if (BIC(control, MII_CONTROL_RESET)) { + drv_usecwait(40); + ret = B_TRUE; + break; + } + drv_usecwait(10); } - if (DEVICE_5906_SERIES_CHIPSETS(bgep)) + if (ret == B_TRUE && DEVICE_5906_SERIES_CHIPSETS(bgep)) (void) bge_adj_volt_5906(bgep); - BGE_DEBUG(("bge_phy_reset: FAILED, control now 0x%x", control)); + if (ret == B_FALSE) + BGE_DEBUG(("bge_phy_reset: FAILED, control now 0x%x", control)); - return (B_FALSE); + return (ret); } /* @@ -541,34 +550,14 @@ bge_restart_copper(bge_t *bgep, boolean_t powerdown) ASSERT(mutex_owned(bgep->genlock)); - switch (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev)) { - default: - /* - * Shouldn't happen; it means we don't recognise this chip. - * It's probably a new one, so we'll try our best anyway ... 
- */ - case MHCR_CHIP_ASIC_REV_5703: - case MHCR_CHIP_ASIC_REV_5704: - case MHCR_CHIP_ASIC_REV_5705: - case MHCR_CHIP_ASIC_REV_5752: - case MHCR_CHIP_ASIC_REV_5714: - case MHCR_CHIP_ASIC_REV_5715: - reset_ok = bge_phy_reset_and_check(bgep); - break; - - case MHCR_CHIP_ASIC_REV_5906: - case MHCR_CHIP_ASIC_REV_5700: - case MHCR_CHIP_ASIC_REV_5701: - case MHCR_CHIP_ASIC_REV_5723: - case MHCR_CHIP_ASIC_REV_5721_5751: - /* - * Just a plain reset; the "check" code breaks these chips - */ + if (bgep->chipid.flags & CHIP_FLAG_NO_CHECK_RESET) { reset_ok = bge_phy_reset(bgep); if (!reset_ok) bge_fm_ereport(bgep, DDI_FM_DEVICE_NO_RESPONSE); - break; + } else { + reset_ok = bge_phy_reset_and_check(bgep); } + if (!reset_ok) { BGE_REPORT((bgep, "PHY failed to reset correctly")); return (DDI_FAILURE); @@ -590,7 +579,7 @@ bge_restart_copper(bge_t *bgep, boolean_t powerdown) switch (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev)) { case MHCR_CHIP_ASIC_REV_5705: - case MHCR_CHIP_ASIC_REV_5721_5751: + case MHCR_CHIP_ASIC_REV_5750: bge_phy_bit_err_fix(bgep); break; } @@ -1507,14 +1496,22 @@ bge_phys_init(bge_t *bgep) */ bgep->phy_mii_addr = 1; if (DEVICE_5717_SERIES_CHIPSETS(bgep)) { - int regval = bge_reg_get32(bgep, CPMU_STATUS_REG); - if (regval & CPMU_STATUS_FUN_NUM) - bgep->phy_mii_addr += 1; + uint32_t regval = bge_reg_get32(bgep, CPMU_STATUS_REG); + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5719 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5720) { + bgep->phy_mii_addr += + (regval & CPMU_STATUS_FUN_NUM_5719) >> + CPMU_STATUS_FUN_NUM_5719_SHIFT; + } else { + bgep->phy_mii_addr += + (regval & CPMU_STATUS_FUN_NUM_5717) ? 1 : 0; + } regval = bge_reg_get32(bgep, SGMII_STATUS_REG); if (regval & MEDIA_SELECTION_MODE) bgep->phy_mii_addr += 7; } - if (bge_phy_probe(bgep)) { bgep->chipid.flags &= ~CHIP_FLAG_SERDES; bgep->physops = &copper_ops; diff --git a/usr/src/uts/common/io/blkdev/blkdev.c b/usr/src/uts/common/io/blkdev/blkdev.c index 8af4d1d6a5..20e3a5737e 100644 --- a/usr/src/uts/common/io/blkdev/blkdev.c +++ b/usr/src/uts/common/io/blkdev/blkdev.c @@ -85,6 +85,7 @@ struct bd { kstat_io_t *d_kiop; boolean_t d_rdonly; + boolean_t d_ssd; boolean_t d_removable; boolean_t d_hotpluggable; boolean_t d_use_dma; @@ -1103,6 +1104,14 @@ bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp) } return (0); } + case DKIOCSOLIDSTATE: { + int i; + i = bd->d_ssd ? 1 : 0; + if (ddi_copyout(&i, ptr, sizeof (i), flag)) { + return (EFAULT); + } + return (0); + } case DKIOCSTATE: { enum dkio_state state; if (ddi_copyin(ptr, &state, sizeof (state), flag)) { @@ -1246,6 +1255,7 @@ bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie) bd_update_state(bd); ((tg_attribute_t *)arg)->media_is_writable = bd->d_rdonly ? B_FALSE : B_TRUE; + ((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd; return (0); default: @@ -1361,6 +1371,7 @@ bd_update_state(bd_t *bd) bd->d_blkshift = ddi_ffs(media.m_blksize) - 1; bd->d_numblks = media.m_nblks; bd->d_rdonly = media.m_readonly; + bd->d_ssd = media.m_solidstate; state = DKIO_INSERTED; } diff --git a/usr/src/uts/common/io/cmlb.c b/usr/src/uts/common/io/cmlb.c index 0d174501f5..d7d6cb5ab5 100644 --- a/usr/src/uts/common/io/cmlb.c +++ b/usr/src/uts/common/io/cmlb.c @@ -20,6 +20,7 @@ */ /* + * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
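/*
 * A minimal illustrative sketch (not part of the patch): a userland
 * consumer can ask whether a blkdev-backed disk is solid state with the
 * new DKIOCSOLIDSTATE ioctl added above; the driver answers 1 for SSD
 * media and 0 otherwise.  This assumes the DKIOCSOLIDSTATE definition
 * lands in <sys/dkio.h> alongside the other DKIOC ioctls; the device path
 * is only an example and error handling is minimal.
 */
#include <sys/types.h>
#include <sys/dkio.h>
#include <stropts.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd, is_ssd = 0;

	if ((fd = open("/dev/rdsk/c0t0d0s0", O_RDONLY)) == -1)
		return (1);
	if (ioctl(fd, DKIOCSOLIDSTATE, &is_ssd) == 0)
		(void) printf("solid-state: %d\n", is_ssd);
	(void) close(fd);
	return (0);
}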
*/ @@ -243,6 +244,7 @@ static i_ddi_prop_dyn_t cmlb_prop_dyn[] = { {"Size", DDI_PROP_TYPE_INT64, S_IFCHR}, {"device-nblocks", DDI_PROP_TYPE_INT64}, {"device-blksize", DDI_PROP_TYPE_INT}, + {"device-solid-state", DDI_PROP_TYPE_INT}, {NULL} }; @@ -5657,11 +5659,12 @@ cmlb_prop_op(cmlb_handle_t cmlbhandle, struct cmlb_lun *cl; diskaddr_t capacity; uint32_t lbasize; - enum dp { DP_NBLOCKS, DP_BLKSIZE } dp; + enum dp { DP_NBLOCKS, DP_BLKSIZE, DP_SSD } dp; int callers_length; caddr_t buffer; uint64_t nblocks64; uint_t dblk; + tg_attribute_t tgattr; /* Always fallback to ddi_prop_op... */ cl = (struct cmlb_lun *)cmlbhandle; @@ -5685,6 +5688,8 @@ fallback: return (ddi_prop_op(dev, dip, prop_op, mod_flags, dp = DP_NBLOCKS; else if (strcmp(name, "device-blksize") == 0) dp = DP_BLKSIZE; + else if (strcmp(name, "device-solid-state") == 0) + dp = DP_SSD; else goto fallback; @@ -5692,7 +5697,7 @@ fallback: return (ddi_prop_op(dev, dip, prop_op, mod_flags, callers_length = *lengthp; if (dp == DP_NBLOCKS) *lengthp = sizeof (uint64_t); - else if (dp == DP_BLKSIZE) + else if ((dp == DP_BLKSIZE) || (dp == DP_SSD)) *lengthp = sizeof (uint32_t); /* service request for the length of the property */ @@ -5720,11 +5725,19 @@ fallback: return (ddi_prop_op(dev, dip, prop_op, mod_flags, } /* transfer the value into the buffer */ - if (dp == DP_NBLOCKS) + switch (dp) { + case DP_NBLOCKS: *((uint64_t *)buffer) = capacity; - else if (dp == DP_BLKSIZE) + break; + case DP_BLKSIZE: *((uint32_t *)buffer) = lbasize; - + break; + case DP_SSD: + if (DK_TG_GETATTRIBUTE(cl, &tgattr, tg_cookie) != 0) + tgattr.media_is_solid_state = B_FALSE; + *((uint32_t *)buffer) = + tgattr.media_is_solid_state ? 1 : 0; + } return (DDI_PROP_SUCCESS); } diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c index 40cbe86170..2152ce0baa 100644 --- a/usr/src/uts/common/io/dld/dld_drv.c +++ b/usr/src/uts/common/io/dld/dld_drv.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent Inc. All rights reserved. */ /* @@ -701,7 +702,8 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set, err = EACCES; goto done; } - err = dls_devnet_setzid(dlh, dzp->diz_zid); + err = dls_devnet_setzid(dlh, dzp->diz_zid, + dzp->diz_transient); } else { kprop->pr_perm_flags = MAC_PROP_PERM_RW; (*(zoneid_t *)kprop->pr_val) = dls_devnet_getzid(dlh); @@ -865,7 +867,7 @@ drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) return (err); if ((err = dls_devnet_rename(dir->dir_linkid1, dir->dir_linkid2, - dir->dir_link)) != 0) + dir->dir_link, dir->dir_zoneinit)) != 0) return (err); if (dir->dir_linkid2 == DATALINK_INVALID_LINKID) diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c index f90adbf27a..d35c1e4bbf 100644 --- a/usr/src/uts/common/io/dls/dls.c +++ b/usr/src/uts/common/io/dls/dls.c @@ -25,6 +25,10 @@ */ /* + * Copyright 2011 Joyent, Inc. All rights reserved. + */ + +/* * Data-Link Services Module */ @@ -610,6 +614,22 @@ boolean_t dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx, void **ds_rx_arg, boolean_t loopback) { + if (dsp->ds_promisc == 0) { + /* + * If there are active walkers of the mi_promisc_list when + * promiscuousness is disabled, ds_promisc will be cleared, + * but the DLS will remain on the mi_promisc_list until the + * walk is completed. 
If we do not recognize this case here, + * we won't properly execute the ds_promisc case in the common + * accept routine -- and we will potentially accept a packet + * that has originated with this DLS (which in turn can + * induce recursion and death by stack overflow). If + * ds_promisc is zero, we know that we are in this window -- + * and we refuse to accept the packet. + */ + return (B_FALSE); + } + return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE, loopback)); } diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c index 049c4bd757..5fa37e0a8a 100644 --- a/usr/src/uts/common/io/dls/dls_mgmt.c +++ b/usr/src/uts/common/io/dls/dls_mgmt.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ /* @@ -105,12 +106,13 @@ typedef struct dls_devnet_s { zoneid_t dd_zid; /* current zone */ boolean_t dd_prop_loaded; taskqid_t dd_prop_taskid; + boolean_t dd_transient; /* link goes away when zone does */ } dls_devnet_t; static int i_dls_devnet_create_iptun(const char *, const char *, datalink_id_t *); static int i_dls_devnet_destroy_iptun(datalink_id_t); -static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t); +static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t, boolean_t); static int dls_devnet_unset(const char *, datalink_id_t *, boolean_t); /*ARGSUSED*/ @@ -145,7 +147,12 @@ dls_zone_remove(datalink_id_t linkid, void *arg) dls_devnet_t *ddp; if (dls_devnet_hold_tmp(linkid, &ddp) == 0) { - (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID); + /* + * Don't bother moving transient links back to the global zone + * since we will simply delete them in dls_devnet_unset. + */ + if (!ddp->dd_transient) + (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); dls_devnet_rele_tmp(ddp); } return (0); @@ -526,6 +533,7 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid) getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID; (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN); + getlinkid.ld_zoneid = getzoneid(); if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval, sizeof (retval))) == 0) { @@ -740,12 +748,23 @@ dls_devnet_stat_update(kstat_t *ksp, int rw) * Create the "link" kstats. */ static void -dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid) +dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid, zoneid_t newzoneid) { kstat_t *ksp; + char *nm; + char kname[MAXLINKNAMELEN]; + + if (zoneid != newzoneid) { + ASSERT(zoneid == GLOBAL_ZONEID); + (void) snprintf(kname, sizeof (kname), "z%d_%s", newzoneid, + ddp->dd_linkname); + nm = kname; + } else { + nm = ddp->dd_linkname; + } - if (dls_stat_create("link", 0, ddp->dd_linkname, zoneid, - dls_devnet_stat_update, ddp, &ksp) == 0) { + if (dls_stat_create("link", 0, nm, zoneid, + dls_devnet_stat_update, ddp, &ksp, newzoneid) == 0) { ASSERT(ksp != NULL); if (zoneid == ddp->dd_owner_zid) { ASSERT(ddp->dd_ksp == NULL); @@ -765,12 +784,12 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) { if (zoneid == ddp->dd_owner_zid) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } } else { if (ddp->dd_zone_ksp != NULL) { - kstat_delete(ddp->dd_zone_ksp); + dls_stat_delete(ddp->dd_zone_ksp); ddp->dd_zone_ksp = NULL; } } @@ -781,15 +800,25 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) * and create the new set using the new name. 
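/*
 * A minimal illustrative sketch (not part of the patch): when a link's
 * kstats are created in the global zone on behalf of a non-global zone,
 * dls_devnet_stat_create() now prefixes the kstat name with "z<zoneid>_"
 * so both views can coexist -- e.g. a link "net0" on loan to zone 7 shows
 * up in the global zone as "z7_net0".  The helper name and parameters are
 * hypothetical.
 */
static void
dls_kstat_name_sketch(char *buf, size_t len, const char *linkname,
    zoneid_t viewer_zid, zoneid_t owner_zid)
{
	if (viewer_zid != owner_zid)	/* global-zone view of a loaned link */
		(void) snprintf(buf, len, "z%d_%s", (int)owner_zid, linkname);
	else
		(void) strlcpy(buf, linkname, len);
}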
*/ static void -dls_devnet_stat_rename(dls_devnet_t *ddp) +dls_devnet_stat_rename(dls_devnet_t *ddp, boolean_t zoneinit) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } - /* We can't rename a link while it's assigned to a non-global zone. */ + if (zoneinit && ddp->dd_zone_ksp != NULL) { + dls_stat_delete(ddp->dd_zone_ksp); + ddp->dd_zone_ksp = NULL; + } + /* + * We can't rename a link while it's assigned to a non-global zone + * unless we're first initializing the zone while readying it. + */ ASSERT(ddp->dd_zone_ksp == NULL); - dls_devnet_stat_create(ddp, ddp->dd_owner_zid); + dls_devnet_stat_create(ddp, ddp->dd_owner_zid, + (zoneinit ? ddp->dd_zid : ddp->dd_owner_zid)); + if (zoneinit) + dls_devnet_stat_create(ddp, ddp->dd_zid, ddp->dd_zid); } /* @@ -878,7 +907,8 @@ done: rw_exit(&i_dls_devnet_lock); if (err == 0) { if (zoneid != GLOBAL_ZONEID && - (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE)) != 0) + (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE, + B_FALSE)) != 0) (void) dls_devnet_unset(macname, &linkid, B_TRUE); /* * The kstat subsystem holds its own locks (rather perimeter) @@ -887,7 +917,7 @@ done: * lock hierarchy is kstat locks -> i_dls_devnet_lock. */ if (stat_create) - dls_devnet_stat_create(ddp, zoneid); + dls_devnet_stat_create(ddp, zoneid, zoneid); if (ddpp != NULL) *ddpp = ddp; } @@ -924,17 +954,64 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) ASSERT(ddp->dd_ref != 0); if ((ddp->dd_ref != 1) || (!wait && (ddp->dd_tref != 0 || ddp->dd_prop_taskid != NULL))) { - mutex_exit(&ddp->dd_mutex); - rw_exit(&i_dls_devnet_lock); - return (EBUSY); + int zstatus = 0; + + /* + * There are a couple of alternatives that might be going on + * here; a) the zone is shutting down and it has a transient + * link assigned, in which case we want to clean it up instead + * of moving it back to the global zone, or b) it's possible + * that we're trying to clean up an orphaned vnic that was + * delegated to a zone and which wasn't cleaned up properly + * when the zone went away. Check for either of these cases + * before we simply return EBUSY. + * + * zstatus indicates which situation we are dealing with: + * 0 - means return EBUSY + * 1 - means case (a), cleanup transient link + * -1 - means case (b), orphaned VNIC + */ + if (ddp->dd_ref > 1 && ddp->dd_zid != GLOBAL_ZONEID) { + zone_t *zp; + + if ((zp = zone_find_by_id(ddp->dd_zid)) == NULL) { + zstatus = -1; + } else { + if (ddp->dd_transient) { + zone_status_t s = zone_status_get(zp); + + if (s >= ZONE_IS_SHUTTING_DOWN) + zstatus = 1; + } + zone_rele(zp); + } + } + + if (zstatus == 0) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + return (EBUSY); + } + + /* + * We want to delete the link, so reset ref to 1. + */ + if (zstatus == -1) + /* Log a warning, but continue in this case */ + cmn_err(CE_WARN, "clear orphaned datalink: %s\n", + ddp->dd_linkname); + ddp->dd_ref = 1; } ddp->dd_flags |= DD_CONDEMNED; ddp->dd_ref--; *id = ddp->dd_linkid; - if (ddp->dd_zid != GLOBAL_ZONEID) - (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); + if (ddp->dd_zid != GLOBAL_ZONEID) { + dls_devnet_stat_destroy(ddp, ddp->dd_zid); + (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE, + B_FALSE); + } /* * Remove this dls_devnet_t from the hash table.
@@ -1261,9 +1338,15 @@ dls_devnet_phydev(datalink_id_t vlanid, dev_t *devp) * * This case does not change the <link name, linkid> mapping, so the link's * kstats need to be updated with using name associated the given id2. + * + * The zonename parameter is used to allow us to create a VNIC in the global + * zone which is assigned to a non-global zone. Since there is a race condition + * in the create process if two VNICs have the same name, we need to rename it + * after it has been assigned to the zone. */ int -dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) +dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link, + boolean_t zoneinit) { dls_dev_handle_t ddh = NULL; int err = 0; @@ -1313,13 +1396,16 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) * is currently accessing the link kstats, or if the link is on-loan * to a non-global zone. Then set the DD_KSTAT_CHANGING flag to * prevent any access to the kstats while we delete and recreate - * kstats below. + * kstats below. However, we skip this check if we're renaming the + * vnic as part of bringing it up for a zone. */ mutex_enter(&ddp->dd_mutex); - if (ddp->dd_ref > 1) { - mutex_exit(&ddp->dd_mutex); - err = EBUSY; - goto done; + if (!zoneinit) { + if (ddp->dd_ref > 1) { + mutex_exit(&ddp->dd_mutex); + err = EBUSY; + goto done; + } } ddp->dd_flags |= DD_KSTAT_CHANGING; @@ -1333,7 +1419,15 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) /* rename mac client name and its flow if exists */ if ((err = mac_open(ddp->dd_mac, &mh)) != 0) goto done; - (void) mac_rename_primary(mh, link); + if (zoneinit) { + char tname[MAXLINKNAMELEN]; + + (void) snprintf(tname, sizeof (tname), "z%d_%s", + ddp->dd_zid, link); + (void) mac_rename_primary(mh, tname); + } else { + (void) mac_rename_primary(mh, link); + } mac_close(mh); goto done; } @@ -1406,7 +1500,7 @@ done: */ rw_exit(&i_dls_devnet_lock); if (err == 0) - dls_devnet_stat_rename(ddp); + dls_devnet_stat_rename(ddp, zoneinit); if (clear_dd_flag) { mutex_enter(&ddp->dd_mutex); @@ -1421,7 +1515,8 @@ done: } static int -i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) +i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop, + boolean_t transient) { int err; mac_perim_handle_t mph; @@ -1454,6 +1549,7 @@ i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) } if ((err = dls_link_setzid(ddp->dd_mac, new_zoneid)) == 0) { ddp->dd_zid = new_zoneid; + ddp->dd_transient = transient; devnet_need_rebuild = B_TRUE; } @@ -1468,7 +1564,7 @@ done: } int -dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) +dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid, boolean_t transient) { dls_devnet_t *ddp; int err; @@ -1490,7 +1586,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) refheld = B_TRUE; } - if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE)) != 0) { + if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE, transient)) != 0) { if (refheld) dls_devnet_rele(ddp); return (err); @@ -1507,7 +1603,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) if (old_zid != GLOBAL_ZONEID) dls_devnet_stat_destroy(ddh, old_zid); if (new_zid != GLOBAL_ZONEID) - dls_devnet_stat_create(ddh, new_zid); + dls_devnet_stat_create(ddh, new_zid, new_zid); return (0); } diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c index 51e4be7260..82dceff278 100644 --- a/usr/src/uts/common/io/dls/dls_stat.c 
+++ b/usr/src/uts/common/io/dls/dls_stat.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ /* @@ -30,30 +31,33 @@ #include <sys/dld_impl.h> #include <sys/mac_ether.h> -static mac_stat_info_t i_dls_si[] = { - { MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_MULTIRCV, "multircv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTRCV, "brdcstrcv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_MULTIXMT, "multixmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTXMT, "brdcstxmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NORCVBUF, "norcvbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IERRORS, "ierrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NOXMTBUF, "noxmtbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OERRORS, "oerrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_COLLISIONS, "collisions", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IPACKETS, "ipackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OBYTES, "obytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OPACKETS, "opackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_IPACKETS, "ipackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OBYTES, "obytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OPACKETS, "opackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_LINK_STATE, "link_state", KSTAT_DATA_UINT32, - (uint64_t)LINK_STATE_UNKNOWN} -}; - -#define STAT_INFO_COUNT (sizeof (i_dls_si) / sizeof (i_dls_si[0])) +/* + * structure for link kstats + */ +typedef struct { + kstat_named_t dk_ifspeed; + kstat_named_t dk_multircv; + kstat_named_t dk_brdcstrcv; + kstat_named_t dk_multixmt; + kstat_named_t dk_brdcstxmt; + kstat_named_t dk_norcvbuf; + kstat_named_t dk_ierrors; + kstat_named_t dk_noxmtbuf; + kstat_named_t dk_oerrors; + kstat_named_t dk_collisions; + kstat_named_t dk_rbytes; + kstat_named_t dk_ipackets; + kstat_named_t dk_obytes; + kstat_named_t dk_opackets; + kstat_named_t dk_rbytes64; + kstat_named_t dk_ipackets64; + kstat_named_t dk_obytes64; + kstat_named_t dk_opackets64; + kstat_named_t dk_link_state; + kstat_named_t dk_link_duplex; + kstat_named_t dk_unknowns; + kstat_named_t dk_zonename; +} dls_kstat_t; /* * Exported functions. 
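/*
 * A minimal illustrative sketch (not part of the patch): the table-driven
 * named kstats are replaced with a fixed dls_kstat_t structure backing a
 * KSTAT_FLAG_VIRTUAL kstat, which lets the module own ks_data and attach a
 * string statistic ("zonename").  The names below (example_kstat_t and the
 * helper) are hypothetical; the kstat calls are the standard kernel API.
 */
typedef struct {
	kstat_named_t	ex_ipackets;
	kstat_named_t	ex_zonename;
} example_kstat_t;

static kstat_t *
example_kstat_create_sketch(zoneid_t zoneid, const char *zonename)
{
	kstat_t *ksp;
	example_kstat_t *ekp;

	ksp = kstat_create_zone("example", 0, "link0", "net",
	    KSTAT_TYPE_NAMED,
	    sizeof (example_kstat_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL, zoneid);
	if (ksp == NULL)
		return (NULL);

	/* the module owns the data area for a virtual kstat */
	ekp = kmem_zalloc(sizeof (example_kstat_t), KM_SLEEP);
	ksp->ks_data = ekp;
	/* account for the string payload so snapshots size correctly */
	ksp->ks_data_size += strlen(zonename) + 1;

	kstat_named_init(&ekp->ex_ipackets, "ipackets", KSTAT_DATA_UINT64);
	kstat_named_init(&ekp->ex_zonename, "zonename", KSTAT_DATA_STRING);
	kstat_named_setstr(&ekp->ex_zonename, zonename);

	kstat_install(ksp);
	return (ksp);
}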
@@ -61,42 +65,54 @@ static mac_stat_info_t i_dls_si[] = { int dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) { - kstat_named_t *knp; - uint_t i; - uint64_t val; + dls_kstat_t *dkp = ksp->ks_data; if (rw != KSTAT_READ) return (EACCES); - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat); - - switch (i_dls_si[i].msi_type) { - case KSTAT_DATA_UINT64: - knp->value.ui64 = val; - break; - case KSTAT_DATA_UINT32: - knp->value.ui32 = (uint32_t)val; - break; - default: - ASSERT(B_FALSE); - } - - knp++; - } + dkp->dk_ifspeed.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_IFSPEED); + dkp->dk_multircv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIRCV); + dkp->dk_brdcstrcv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTRCV); + dkp->dk_multixmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIXMT); + dkp->dk_brdcstxmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTXMT); + dkp->dk_norcvbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NORCVBUF); + dkp->dk_ierrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_IERRORS); + dkp->dk_noxmtbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NOXMTBUF); + dkp->dk_oerrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OERRORS); + dkp->dk_collisions.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_COLLISIONS); + dkp->dk_rbytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_rbytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_link_state.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_LINK_STATE); /* * Ethernet specific kstat "link_duplex" */ if (dlp->dl_mip->mi_nativemedia != DL_ETHER) { - knp->value.ui32 = LINK_DUPLEX_UNKNOWN; + dkp->dk_link_duplex.value.ui32 = LINK_DUPLEX_UNKNOWN; } else { - val = mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); - knp->value.ui32 = (uint32_t)val; + dkp->dk_link_duplex.value.ui32 = + (uint32_t)mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); } - knp++; - knp->value.ui32 = dlp->dl_unknowns; + + dkp->dk_unknowns.value.ui32 = dlp->dl_unknowns; return (0); } @@ -104,30 +120,66 @@ dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) int dls_stat_create(const char *module, int instance, const char *name, zoneid_t zoneid, int (*update)(struct kstat *, int), void *private, - kstat_t **kspp) + kstat_t **kspp, zoneid_t newzoneid) { kstat_t *ksp; - kstat_named_t *knp; - uint_t i; + zone_t *zone; + dls_kstat_t *dkp; if ((ksp = kstat_create_zone(module, instance, name, "net", - KSTAT_TYPE_NAMED, STAT_INFO_COUNT + 2, 0, zoneid)) == NULL) { + KSTAT_TYPE_NAMED, sizeof (dls_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zoneid)) == NULL) { return (EINVAL); } ksp->ks_update = update; ksp->ks_private = private; + dkp = ksp->ks_data = kmem_zalloc(sizeof (dls_kstat_t), KM_SLEEP); + if ((zone = zone_find_by_id(newzoneid)) != NULL) { + ksp->ks_data_size += strlen(zone->zone_name) + 1; + } - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - kstat_named_init(knp, i_dls_si[i].msi_name, - 
i_dls_si[i].msi_type); - knp++; + kstat_named_init(&dkp->dk_ifspeed, "ifspeed", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_multircv, "multircv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstrcv, "brdcstrcv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_multixmt, "multixmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstxmt, "brdcstxmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_norcvbuf, "norcvbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ierrors, "ierrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_noxmtbuf, "noxmtbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_oerrors, "oerrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_collisions, "collisions", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes, "rbytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ipackets, "ipackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_obytes, "obytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_opackets, "opackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes64, "rbytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_ipackets64, "ipackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_obytes64, "obytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_opackets64, "opackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_link_state, "link_state", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_link_duplex, "link_duplex", + KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_unknowns, "unknowns", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_zonename, "zonename", KSTAT_DATA_STRING); + + if (zone != NULL) { + kstat_named_setstr(&dkp->dk_zonename, zone->zone_name); + zone_rele(zone); } - kstat_named_init(knp++, "link_duplex", KSTAT_DATA_UINT32); - kstat_named_init(knp, "unknowns", KSTAT_DATA_UINT32); kstat_install(ksp); *kspp = ksp; return (0); } + +void +dls_stat_delete(kstat_t *ksp) +{ + void *data; + if (ksp != NULL) { + data = ksp->ks_data; + kstat_delete(ksp); + kmem_free(data, sizeof (dls_kstat_t)); + } +} diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE new file mode 100644 index 0000000000..00aefb6f51 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE @@ -0,0 +1,32 @@ +/* + * MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..ac6d2d1b15 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +DR_SAS DRIVER diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.c b/usr/src/uts/common/io/dr_sas/dr_sas.c new file mode 100644 index 0000000000..5b1dc82938 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.c @@ -0,0 +1,5506 @@ +/* + * dr_sas.c: source for dr_sas driver + * + * MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Version: + * Author: + * Arun Chandrashekhar + * Manju R + * Rajesh Prabhakaran + * Seokmann Ju + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/open.h> +#include <sys/cred.h> +#include <sys/modctl.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/stat.h> +#include <sys/mkdev.h> +#include <sys/pci.h> +#include <sys/scsi/scsi.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/atomic.h> +#include <sys/signal.h> +#include <sys/fs/dv_node.h> /* devfs_clean */ + +#include "dr_sas.h" + +/* + * FMA header files + */ +#include <sys/ddifm.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/fm/io/ddi.h> + +/* + * Local static data + */ +static void *drsas_state = NULL; +static int debug_level_g = CL_NONE; + +#pragma weak scsi_hba_open +#pragma weak scsi_hba_close +#pragma weak scsi_hba_ioctl + +static ddi_dma_attr_t drsas_generic_dma_attr = { + DMA_ATTR_V0, /* dma_attr_version */ + 0, /* low DMA address range */ + 0xFFFFFFFFU, /* high DMA address range */ + 0xFFFFFFFFU, /* DMA counter register */ + 8, /* DMA address alignment */ + 0x07, /* DMA burstsizes */ + 1, /* min DMA size */ + 0xFFFFFFFFU, /* max DMA size */ + 0xFFFFFFFFU, /* segment boundary */ + DRSAS_MAX_SGE_CNT, /* dma_attr_sglen */ + 512, /* granularity of device */ + 0 /* bus specific DMA flags */ +}; + +int32_t drsas_max_cap_maxxfer = 0x1000000; + +/* + * cb_ops contains base level routines + */ +static struct cb_ops drsas_cb_ops = { + drsas_open, /* open */ + drsas_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + drsas_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + nodev, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_HOTPLUG, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev /* cb_awrite */ +}; + +/* + * dev_ops contains configuration routines + */ +static struct dev_ops drsas_ops = { + DEVO_REV, /* rev, */ + 0, /* refcnt */ + drsas_getinfo, /* getinfo */ + nulldev, /* identify */ + nulldev, /* probe */ + drsas_attach, /* attach */ + drsas_detach, /* detach */ + drsas_reset, /* reset */ + &drsas_cb_ops, /* char/block ops */ + NULL, /* bus ops */ + NULL, /* power */ + ddi_quiesce_not_supported, /* quiesce */ +}; + +char _depends_on[] = "misc/scsi"; + +static struct modldrv modldrv = { + &mod_driverops, /* module type - driver */ + DRSAS_VERSION, + &drsas_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, /* ml_rev - must be MODREV_1 */ + &modldrv, /* ml_linkage */ + NULL /* end of driver linkage */ +}; + +static struct ddi_device_acc_attr endian_attr = { + DDI_DEVICE_ATTR_V0, + DDI_STRUCTURE_LE_ACC, + DDI_STRICTORDER_ACC +}; + + +/* + * ************************************************************************** * + * * + * common entry points - for loadable kernel modules * + * * + * ************************************************************************** * + */ + +int +_init(void) +{ + int ret; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + ret = ddi_soft_state_init(&drsas_state, + sizeof (struct drsas_instance), 0); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: could not init state")); + return (ret); + } + + if ((ret = scsi_hba_init(&modlinkage)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: could not init scsi hba")); + ddi_soft_state_fini(&drsas_state); + return (ret); + } + + ret = mod_install(&modlinkage); + + if 
(ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: mod_install failed")); + scsi_hba_fini(&modlinkage); + ddi_soft_state_fini(&drsas_state); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if ((ret = mod_remove(&modlinkage)) != DDI_SUCCESS) + return (ret); + + scsi_hba_fini(&modlinkage); + + ddi_soft_state_fini(&drsas_state); + + return (ret); +} + + +/* + * ************************************************************************** * + * * + * common entry points - for autoconfiguration * + * * + * ************************************************************************** * + */ + +static int +drsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int instance_no; + int nregs; + uint8_t added_isr_f = 0; + uint8_t added_soft_isr_f = 0; + uint8_t create_devctl_node_f = 0; + uint8_t create_scsi_node_f = 0; + uint8_t create_ioc_node_f = 0; + uint8_t tran_alloc_f = 0; + uint8_t irq; + uint16_t vendor_id; + uint16_t device_id; + uint16_t subsysvid; + uint16_t subsysid; + uint16_t command; + off_t reglength = 0; + int intr_types = 0; + char *data; + int msi_enable = 0; + + scsi_hba_tran_t *tran; + ddi_dma_attr_t tran_dma_attr; + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* CONSTCOND */ + ASSERT(NO_COMPETING_THREADS); + + instance_no = ddi_get_instance(dip); + + /* + * check to see whether this device is in a DMA-capable slot. + */ + if (ddi_slaveonly(dip) == DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Device in slave-only slot, unused", + instance_no)); + return (DDI_FAILURE); + } + + switch (cmd) { + case DDI_ATTACH: + con_log(CL_DLEVEL1, (CE_NOTE, "dr_sas: DDI_ATTACH")); + /* allocate the soft state for the instance */ + if (ddi_soft_state_zalloc(drsas_state, instance_no) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Failed to allocate soft state", + instance_no)); + + return (DDI_FAILURE); + } + + instance = (struct drsas_instance *)ddi_get_soft_state + (drsas_state, instance_no); + + if (instance == NULL) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Bad soft state", instance_no)); + + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + bzero((caddr_t)instance, + sizeof (struct drsas_instance)); + + instance->func_ptr = kmem_zalloc( + sizeof (struct drsas_func_ptr), KM_SLEEP); + ASSERT(instance->func_ptr); + + /* Setup the PCI configuration space handles */ + if (pci_config_setup(dip, &instance->pci_handle) != + DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: pci config setup failed ", + instance_no)); + + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to get registers.")); + + pci_config_teardown(&instance->pci_handle); + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + vendor_id = pci_config_get16(instance->pci_handle, + PCI_CONF_VENID); + device_id = pci_config_get16(instance->pci_handle, + PCI_CONF_DEVID); + + subsysvid = pci_config_get16(instance->pci_handle, + PCI_CONF_SUBVENID); + subsysid = 
pci_config_get16(instance->pci_handle, + PCI_CONF_SUBSYSID); + + pci_config_put16(instance->pci_handle, PCI_CONF_COMM, + (pci_config_get16(instance->pci_handle, + PCI_CONF_COMM) | PCI_COMM_ME)); + irq = pci_config_get8(instance->pci_handle, + PCI_CONF_ILINE); + + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "0x%x:0x%x 0x%x:0x%x, irq:%d drv-ver:%s", + instance_no, vendor_id, device_id, subsysvid, + subsysid, irq, DRSAS_VERSION)); + + /* enable bus-mastering */ + command = pci_config_get16(instance->pci_handle, + PCI_CONF_COMM); + + if (!(command & PCI_COMM_ME)) { + command |= PCI_COMM_ME; + + pci_config_put16(instance->pci_handle, + PCI_CONF_COMM, command); + + con_log(CL_ANN, (CE_CONT, "dr_sas%d: " + "enable bus-mastering", instance_no)); + } else { + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "bus-mastering already set", instance_no)); + } + + /* initialize function pointers */ + if ((device_id == PCI_DEVICE_ID_LSI_2108VDE) || + (device_id == PCI_DEVICE_ID_LSI_2108V)) { + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "2108V/DE detected", instance_no)); + instance->func_ptr->read_fw_status_reg = + read_fw_status_reg_ppc; + instance->func_ptr->issue_cmd = issue_cmd_ppc; + instance->func_ptr->issue_cmd_in_sync_mode = + issue_cmd_in_sync_mode_ppc; + instance->func_ptr->issue_cmd_in_poll_mode = + issue_cmd_in_poll_mode_ppc; + instance->func_ptr->enable_intr = + enable_intr_ppc; + instance->func_ptr->disable_intr = + disable_intr_ppc; + instance->func_ptr->intr_ack = intr_ack_ppc; + } else { + con_log(CL_ANN, (CE_WARN, + "dr_sas: Invalid device detected")); + + pci_config_teardown(&instance->pci_handle); + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + instance->baseaddress = pci_config_get32( + instance->pci_handle, PCI_CONF_BASE0); + instance->baseaddress &= 0x0fffc; + + instance->dip = dip; + instance->vendor_id = vendor_id; + instance->device_id = device_id; + instance->subsysvid = subsysvid; + instance->subsysid = subsysid; + instance->instance = instance_no; + + /* Initialize FMA */ + instance->fm_capabilities = ddi_prop_get_int( + DDI_DEV_T_ANY, instance->dip, DDI_PROP_DONTPASS, + "fm-capable", DDI_FM_EREPORT_CAPABLE | + DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE + | DDI_FM_ERRCB_CAPABLE); + + drsas_fm_init(instance); + + /* Initialize Interrupts */ + if ((ddi_dev_regsize(instance->dip, + REGISTER_SET_IO_2108, ®length) != DDI_SUCCESS) || + reglength < MINIMUM_MFI_MEM_SZ) { + return (DDI_FAILURE); + } + if (reglength > DEFAULT_MFI_MEM_SZ) { + reglength = DEFAULT_MFI_MEM_SZ; + con_log(CL_DLEVEL1, (CE_NOTE, + "dr_sas: register length to map is " + "0x%lx bytes", reglength)); + } + if (ddi_regs_map_setup(instance->dip, + REGISTER_SET_IO_2108, &instance->regmap, 0, + reglength, &endian_attr, &instance->regmap_handle) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_NOTE, + "dr_sas: couldn't map control registers")); + goto fail_attach; + } + + /* + * Disable Interrupt Now. 
+ * Setup Software interrupt + */ + instance->func_ptr->disable_intr(instance); + + msi_enable = 0; + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, + "drsas-enable-msi", &data) == DDI_SUCCESS) { + if (strncmp(data, "yes", 3) == 0) { + msi_enable = 1; + con_log(CL_ANN, (CE_WARN, + "msi_enable = %d ENABLED", + msi_enable)); + } + ddi_prop_free(data); + } + + con_log(CL_DLEVEL1, (CE_WARN, "msi_enable = %d", + msi_enable)); + + /* Check for all supported interrupt types */ + if (ddi_intr_get_supported_types( + dip, &intr_types) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "ddi_intr_get_supported_types() failed")); + goto fail_attach; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "ddi_intr_get_supported_types() ret: 0x%x", + intr_types)); + + /* Initialize and Setup Interrupt handler */ + if (msi_enable && (intr_types & DDI_INTR_TYPE_MSIX)) { + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_MSIX) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "MSIX interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_MSIX; + } else if (msi_enable && (intr_types & + DDI_INTR_TYPE_MSI)) { + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_MSI) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "MSI interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_MSI; + } else if (intr_types & DDI_INTR_TYPE_FIXED) { + msi_enable = 0; + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_FIXED) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "FIXED interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_FIXED; + } else { + con_log(CL_ANN, (CE_WARN, "Device cannot " + "suppport either FIXED or MSI/X " + "interrupts")); + goto fail_attach; + } + + added_isr_f = 1; + + /* setup the mfi based low level driver */ + if (init_mfi(instance) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: " + "could not initialize the low level driver")); + + goto fail_attach; + } + + /* Initialize all Mutex */ + INIT_LIST_HEAD(&instance->completed_pool_list); + mutex_init(&instance->completed_pool_mtx, + "completed_pool_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); + + mutex_init(&instance->int_cmd_mtx, "int_cmd_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + cv_init(&instance->int_cmd_cv, NULL, CV_DRIVER, NULL); + + mutex_init(&instance->cmd_pool_mtx, "cmd_pool_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + + /* Register our soft-isr for highlevel interrupts. 
*/ + instance->isr_level = instance->intr_pri; + if (instance->isr_level == HIGH_LEVEL_INTR) { + if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, + &instance->soft_intr_id, NULL, NULL, + drsas_softintr, (caddr_t)instance) != + DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + " Software ISR did not register")); + + goto fail_attach; + } + + added_soft_isr_f = 1; + } + + /* Allocate a transport structure */ + tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP); + + if (tran == NULL) { + con_log(CL_ANN, (CE_WARN, + "scsi_hba_tran_alloc failed")); + goto fail_attach; + } + + tran_alloc_f = 1; + + instance->tran = tran; + + tran->tran_hba_private = instance; + tran->tran_tgt_init = drsas_tran_tgt_init; + tran->tran_tgt_probe = scsi_hba_probe; + tran->tran_tgt_free = drsas_tran_tgt_free; + tran->tran_init_pkt = drsas_tran_init_pkt; + tran->tran_start = drsas_tran_start; + tran->tran_abort = drsas_tran_abort; + tran->tran_reset = drsas_tran_reset; + tran->tran_getcap = drsas_tran_getcap; + tran->tran_setcap = drsas_tran_setcap; + tran->tran_destroy_pkt = drsas_tran_destroy_pkt; + tran->tran_dmafree = drsas_tran_dmafree; + tran->tran_sync_pkt = drsas_tran_sync_pkt; + tran->tran_bus_config = drsas_tran_bus_config; + + tran_dma_attr = drsas_generic_dma_attr; + tran_dma_attr.dma_attr_sgllen = instance->max_num_sge; + + /* Attach this instance of the hba */ + if (scsi_hba_attach_setup(dip, &tran_dma_attr, tran, 0) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "scsi_hba_attach failed")); + + goto fail_attach; + } + + /* create devctl node for cfgadm command */ + if (ddi_create_minor_node(dip, "devctl", + S_IFCHR, INST2DEVCTL(instance_no), + DDI_NT_SCSI_NEXUS, 0) == DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create devctl node.")); + + goto fail_attach; + } + + create_devctl_node_f = 1; + + /* create scsi node for cfgadm command */ + if (ddi_create_minor_node(dip, "scsi", S_IFCHR, + INST2SCSI(instance_no), + DDI_NT_SCSI_ATTACHMENT_POINT, 0) == + DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create scsi node.")); + + goto fail_attach; + } + + create_scsi_node_f = 1; + + (void) sprintf(instance->iocnode, "%d:lsirdctl", + instance_no); + + /* + * Create a node for applications + * for issuing ioctl to the driver. + */ + if (ddi_create_minor_node(dip, instance->iocnode, + S_IFCHR, INST2LSIRDCTL(instance_no), + DDI_PSEUDO, 0) == DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create ioctl node.")); + + goto fail_attach; + } + + create_ioc_node_f = 1; + + /* Create a taskq to handle dr events */ + if ((instance->taskq = ddi_taskq_create(dip, + "drsas_dr_taskq", 1, + TASKQ_DEFAULTPRI, 0)) == NULL) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create taskq ")); + instance->taskq = NULL; + goto fail_attach; + } + + /* enable interrupt */ + instance->func_ptr->enable_intr(instance); + + /* initiate AEN */ + if (start_mfi_aen(instance)) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to initiate AEN.")); + goto fail_initiate_aen; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "AEN started for instance %d.", instance_no)); + + /* Finally! We are on the air. 
*/ + ddi_report_dev(dip); + + if (drsas_check_acc_handle(instance->regmap_handle) != + DDI_SUCCESS) { + goto fail_attach; + } + if (drsas_check_acc_handle(instance->pci_handle) != + DDI_SUCCESS) { + goto fail_attach; + } + instance->dr_ld_list = + kmem_zalloc(MRDRV_MAX_LD * sizeof (struct drsas_ld), + KM_SLEEP); + break; + case DDI_PM_RESUME: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: DDI_PM_RESUME")); + break; + case DDI_RESUME: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: DDI_RESUME")); + break; + default: + con_log(CL_ANN, (CE_WARN, + "dr_sas: invalid attach cmd=%x", cmd)); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); + +fail_initiate_aen: +fail_attach: + if (create_devctl_node_f) { + ddi_remove_minor_node(dip, "devctl"); + } + + if (create_scsi_node_f) { + ddi_remove_minor_node(dip, "scsi"); + } + + if (create_ioc_node_f) { + ddi_remove_minor_node(dip, instance->iocnode); + } + + if (tran_alloc_f) { + scsi_hba_tran_free(tran); + } + + + if (added_soft_isr_f) { + ddi_remove_softintr(instance->soft_intr_id); + } + + if (added_isr_f) { + drsas_rem_intrs(instance); + } + + if (instance && instance->taskq) { + ddi_taskq_destroy(instance->taskq); + } + + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + drsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + + ddi_soft_state_free(drsas_state, instance_no); + + con_log(CL_ANN, (CE_NOTE, + "dr_sas: return failure from drsas_attach")); + + return (DDI_FAILURE); +} + +/*ARGSUSED*/ +static int +drsas_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) +{ + int rval; + int drsas_minor = getminor((dev_t)arg); + + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + instance = (struct drsas_instance *) + ddi_get_soft_state(drsas_state, + MINOR2INST(drsas_minor)); + + if (instance == NULL) { + *resultp = NULL; + rval = DDI_FAILURE; + } else { + *resultp = instance->dip; + rval = DDI_SUCCESS; + } + break; + case DDI_INFO_DEVT2INSTANCE: + *resultp = (void *)instance; + rval = DDI_SUCCESS; + break; + default: + *resultp = NULL; + rval = DDI_FAILURE; + } + + return (rval); +} + +static int +drsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int instance_no; + + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* CONSTCOND */ + ASSERT(NO_COMPETING_THREADS); + + instance_no = ddi_get_instance(dip); + + instance = (struct drsas_instance *)ddi_get_soft_state(drsas_state, + instance_no); + + if (!instance) { + con_log(CL_ANN, (CE_WARN, + "dr_sas:%d could not get instance in detach", + instance_no)); + + return (DDI_FAILURE); + } + + con_log(CL_ANN, (CE_NOTE, + "dr_sas%d: detaching device 0x%4x:0x%4x:0x%4x:0x%4x", + instance_no, instance->vendor_id, instance->device_id, + instance->subsysvid, instance->subsysid)); + + switch (cmd) { + case DDI_DETACH: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_DETACH")); + + if (scsi_hba_detach(dip) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas:%d failed to detach", + instance_no)); + + return (DDI_FAILURE); + } + + scsi_hba_tran_free(instance->tran); + + flush_cache(instance); + + if (abort_aen_cmd(instance, instance->aen_cmd)) { + con_log(CL_ANN, (CE_WARN, "drsas_detach: " + "failed to abort prevous AEN command")); + + return (DDI_FAILURE); + } + + instance->func_ptr->disable_intr(instance); + + if (instance->isr_level == HIGH_LEVEL_INTR) { + 
ddi_remove_softintr(instance->soft_intr_id); + } + + drsas_rem_intrs(instance); + + if (instance->taskq) { + ddi_taskq_destroy(instance->taskq); + } + kmem_free(instance->dr_ld_list, MRDRV_MAX_LD + * sizeof (struct drsas_ld)); + free_space_for_mfi(instance); + + drsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + + ddi_soft_state_free(drsas_state, instance_no); + break; + case DDI_PM_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_PM_SUSPEND")); + + break; + case DDI_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_SUSPEND")); + + break; + default: + con_log(CL_ANN, (CE_WARN, + "invalid detach command:0x%x", cmd)); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * ************************************************************************** * + * * + * common entry points - for character driver types * + * * + * ************************************************************************** * + */ +static int +drsas_open(dev_t *dev, int openflags, int otyp, cred_t *credp) +{ + int rval = 0; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* Check root permissions */ + if (drv_priv(credp) != 0) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: Non-root ioctl access denied!")); + return (EPERM); + } + + /* Verify we are being opened as a character device */ + if (otyp != OTYP_CHR) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: ioctl node must be a char node")); + return (EINVAL); + } + + if (ddi_get_soft_state(drsas_state, MINOR2INST(getminor(*dev))) + == NULL) { + return (ENXIO); + } + + if (scsi_hba_open) { + rval = scsi_hba_open(dev, openflags, otyp, credp); + } + + return (rval); +} + +static int +drsas_close(dev_t dev, int openflags, int otyp, cred_t *credp) +{ + int rval = 0; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* no need for locks! 
*/ + + if (scsi_hba_close) { + rval = scsi_hba_close(dev, openflags, otyp, credp); + } + + return (rval); +} + +static int +drsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int rval = 0; + + struct drsas_instance *instance; + struct drsas_ioctl *ioctl; + struct drsas_aen aen; + int i; + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + instance = ddi_get_soft_state(drsas_state, MINOR2INST(getminor(dev))); + + if (instance == NULL) { + /* invalid minor number */ + con_log(CL_ANN, (CE_WARN, "dr_sas: adapter not found.")); + return (ENXIO); + } + + ioctl = (struct drsas_ioctl *)kmem_zalloc(sizeof (struct drsas_ioctl), + KM_SLEEP); + ASSERT(ioctl); + + switch ((uint_t)cmd) { + case DRSAS_IOCTL_FIRMWARE: + for (i = 0; i < sizeof (struct drsas_ioctl); i++) { + if (ddi_copyin((uint8_t *)arg+i, + (uint8_t *)ioctl+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "drsas_ioctl " + "ERROR IOCTL copyin")); + kmem_free(ioctl, + sizeof (struct drsas_ioctl)); + return (EFAULT); + } + } + if (ioctl->control_code == DRSAS_DRIVER_IOCTL_COMMON) { + rval = handle_drv_ioctl(instance, ioctl, mode); + } else { + rval = handle_mfi_ioctl(instance, ioctl, mode); + } + for (i = 0; i < sizeof (struct drsas_ioctl) - 1; i++) { + if (ddi_copyout((uint8_t *)ioctl+i, + (uint8_t *)arg+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: ddi_copyout " + "failed")); + rval = 1; + break; + } + } + + break; + case DRSAS_IOCTL_AEN: + for (i = 0; i < sizeof (struct drsas_aen); i++) { + if (ddi_copyin((uint8_t *)arg+i, + (uint8_t *)&aen+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: " + "ERROR AEN copyin")); + kmem_free(ioctl, + sizeof (struct drsas_ioctl)); + return (EFAULT); + } + } + + rval = handle_mfi_aen(instance, &aen); + for (i = 0; i < sizeof (struct drsas_aen); i++) { + if (ddi_copyout((uint8_t *)&aen + i, + (uint8_t *)arg + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: " + "ddi_copyout failed")); + rval = 1; + break; + } + } + + break; + default: + rval = scsi_hba_ioctl(dev, cmd, arg, + mode, credp, rvalp); + + con_log(CL_DLEVEL1, (CE_NOTE, "drsas_ioctl: " + "scsi_hba_ioctl called, ret = %x.", rval)); + } + + kmem_free(ioctl, sizeof (struct drsas_ioctl)); + return (rval); +} + +/* + * ************************************************************************** * + * * + * common entry points - for block driver types * + * * + * ************************************************************************** * + */ +/*ARGSUSED*/ +static int +drsas_reset(dev_info_t *dip, ddi_reset_cmd_t cmd) +{ + int instance_no; + + struct drsas_instance *instance; + + instance_no = ddi_get_instance(dip); + instance = (struct drsas_instance *)ddi_get_soft_state + (drsas_state, instance_no); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (!instance) { + con_log(CL_ANN, (CE_WARN, "dr_sas:%d could not get adapter " + "in reset", instance_no)); + return (DDI_FAILURE); + } + + instance->func_ptr->disable_intr(instance); + + con_log(CL_ANN1, (CE_NOTE, "flushing cache for instance %d", + instance_no)); + + flush_cache(instance); + + return (DDI_SUCCESS); +} + + +/* + * ************************************************************************** * + * * + * entry points (SCSI HBA) * + * * + * ************************************************************************** * + */ +/*ARGSUSED*/ +static int +drsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *tran, struct scsi_device *sd) +{ + struct drsas_instance 
*instance; + uint16_t tgt = sd->sd_address.a_target; + uint8_t lun = sd->sd_address.a_lun; + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init target %d lun %d", + tgt, lun)); + + instance = ADDR2MR(&sd->sd_address); + + if (ndi_dev_is_persistent_node(tgt_dip) == 0) { + (void) ndi_merge_node(tgt_dip, drsas_name_node); + ddi_set_name_addr(tgt_dip, NULL); + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init in " + "ndi_dev_is_persistent_node DDI_FAILURE t = %d l = %d", + tgt, lun)); + return (DDI_FAILURE); + } + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init dev_dip %p tgt_dip %p", + (void *)instance->dr_ld_list[tgt].dip, (void *)tgt_dip)); + + if (tgt < MRDRV_MAX_LD && lun == 0) { + if (instance->dr_ld_list[tgt].dip == NULL && + strcmp(ddi_driver_name(sd->sd_dev), "sd") == 0) { + instance->dr_ld_list[tgt].dip = tgt_dip; + instance->dr_ld_list[tgt].lun_type = DRSAS_LD_LUN; + } + } + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static void +drsas_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + struct drsas_instance *instance; + int tgt = sd->sd_address.a_target; + int lun = sd->sd_address.a_lun; + + instance = ADDR2MR(&sd->sd_address); + + con_log(CL_ANN1, (CE_NOTE, "tgt_free t = %d l = %d", tgt, lun)); + + if (tgt < MRDRV_MAX_LD && lun == 0) { + if (instance->dr_ld_list[tgt].dip == tgt_dip) { + instance->dr_ld_list[tgt].dip = NULL; + } + } +} + +static dev_info_t * +drsas_find_child(struct drsas_instance *instance, uint16_t tgt, uint8_t lun) +{ + dev_info_t *child = NULL; + char addr[SCSI_MAXNAMELEN]; + char tmp[MAXNAMELEN]; + + (void) sprintf(addr, "%x,%x", tgt, lun); + for (child = ddi_get_child(instance->dip); child; + child = ddi_get_next_sibling(child)) { + + if (drsas_name_node(child, tmp, MAXNAMELEN) != + DDI_SUCCESS) { + continue; + } + + if (strcmp(addr, tmp) == 0) { + break; + } + } + con_log(CL_ANN1, (CE_NOTE, "drsas_find_child: return child = %p", + (void *)child)); + return (child); +} + +static int +drsas_name_node(dev_info_t *dip, char *name, int len) +{ + int tgt, lun; + + tgt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "target", -1); + con_log(CL_ANN1, (CE_NOTE, + "drsas_name_node: dip %p tgt %d", (void *)dip, tgt)); + if (tgt == -1) { + return (DDI_FAILURE); + } + lun = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "lun", -1); + con_log(CL_ANN1, + (CE_NOTE, "drsas_name_node: tgt %d lun %d", tgt, lun)); + if (lun == -1) { + return (DDI_FAILURE); + } + (void) snprintf(name, len, "%x,%x", tgt, lun); + return (DDI_SUCCESS); +} + +static struct scsi_pkt * +drsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt, + struct buf *bp, int cmdlen, int statuslen, int tgtlen, + int flags, int (*callback)(), caddr_t arg) +{ + struct scsa_cmd *acmd; + struct drsas_instance *instance; + struct scsi_pkt *new_pkt; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + instance = ADDR2MR(ap); + + /* step #1 : pkt allocation */ + if (pkt == NULL) { + pkt = scsi_hba_pkt_alloc(instance->dip, ap, cmdlen, statuslen, + tgtlen, sizeof (struct scsa_cmd), callback, arg); + if (pkt == NULL) { + return (NULL); + } + + acmd = PKT2CMD(pkt); + + /* + * Initialize the new pkt - we redundantly initialize + * all the fields for illustrative purposes. 
+ */ + acmd->cmd_pkt = pkt; + acmd->cmd_flags = 0; + acmd->cmd_scblen = statuslen; + acmd->cmd_cdblen = cmdlen; + acmd->cmd_dmahandle = NULL; + acmd->cmd_ncookies = 0; + acmd->cmd_cookie = 0; + acmd->cmd_cookiecnt = 0; + acmd->cmd_nwin = 0; + + pkt->pkt_address = *ap; + pkt->pkt_comp = (void (*)())NULL; + pkt->pkt_flags = 0; + pkt->pkt_time = 0; + pkt->pkt_resid = 0; + pkt->pkt_state = 0; + pkt->pkt_statistics = 0; + pkt->pkt_reason = 0; + new_pkt = pkt; + } else { + acmd = PKT2CMD(pkt); + new_pkt = NULL; + } + + /* step #2 : dma allocation/move */ + if (bp && bp->b_bcount != 0) { + if (acmd->cmd_dmahandle == NULL) { + if (drsas_dma_alloc(instance, pkt, bp, flags, + callback) == DDI_FAILURE) { + if (new_pkt) { + scsi_hba_pkt_free(ap, new_pkt); + } + return ((struct scsi_pkt *)NULL); + } + } else { + if (drsas_dma_move(instance, pkt, bp) == DDI_FAILURE) { + return ((struct scsi_pkt *)NULL); + } + } + } + + return (pkt); +} + +static int +drsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) +{ + uchar_t cmd_done = 0; + + struct drsas_instance *instance = ADDR2MR(ap); + struct drsas_cmd *cmd; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d:SCSI CDB[0]=0x%x", + __func__, __LINE__, pkt->pkt_cdbp[0])); + + pkt->pkt_reason = CMD_CMPLT; + *pkt->pkt_scbp = STATUS_GOOD; /* clear arq scsi_status */ + + cmd = build_cmd(instance, ap, pkt, &cmd_done); + + /* + * Check if the command is already completed by the drsas_build_cmd() + * routine. In which case the busy_flag would be clear and scb will be + * NULL and appropriate reason provided in pkt_reason field + */ + if (cmd_done) { + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_scbp[0] = STATUS_GOOD; + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + return (TRAN_ACCEPT); + } + + if (cmd == NULL) { + return (TRAN_BUSY); + } + + if ((pkt->pkt_flags & FLAG_NOINTR) == 0) { + if (instance->fw_outstanding > instance->max_fw_cmds) { + con_log(CL_ANN, (CE_CONT, "dr_sas:Firmware busy")); + return_mfi_pkt(instance, cmd); + return (TRAN_BUSY); + } + + /* Synchronize the Cmd frame for the controller */ + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0, + DDI_DMA_SYNC_FORDEV); + + instance->func_ptr->issue_cmd(cmd, instance); + + } else { + struct drsas_header *hdr = &cmd->frame->hdr; + + cmd->sync_cmd = DRSAS_TRUE; + + instance->func_ptr-> issue_cmd_in_poll_mode(instance, cmd); + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + + switch (ddi_get8(cmd->frame_dma_obj.acc_handle, + &hdr->cmd_status)) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + + case MFI_STAT_SCSI_DONE_WITH_ERROR: + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + + ((struct scsi_status *)pkt->pkt_scbp)->sts_chk = 1; + break; + + case MFI_STAT_DEVICE_NOT_FOUND: + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + + default: + ((struct scsi_status *)pkt->pkt_scbp)->sts_busy = 1; + } + + return_mfi_pkt(instance, cmd); + (void) drsas_common_check(instance, cmd); + + if (pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + } + + return (TRAN_ACCEPT); +} + +/*ARGSUSED*/ +static int +drsas_tran_abort(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* abort command not supported by H/W */ + + return (DDI_FAILURE); +} + +/*ARGSUSED*/ +static int +drsas_tran_reset(struct 
scsi_address *ap, int level) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* reset command not supported by H/W */ + + return (DDI_FAILURE); + +} + +/*ARGSUSED*/ +static int +drsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) +{ + int rval = 0; + + struct drsas_instance *instance = ADDR2MR(ap); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* we do allow inquiring about capabilities for other targets */ + if (cap == NULL) { + return (-1); + } + + switch (scsi_hba_lookup_capstr(cap)) { + case SCSI_CAP_DMA_MAX: + /* Limit to 16MB max transfer */ + rval = drsas_max_cap_maxxfer; + break; + case SCSI_CAP_MSG_OUT: + rval = 1; + break; + case SCSI_CAP_DISCONNECT: + rval = 0; + break; + case SCSI_CAP_SYNCHRONOUS: + rval = 0; + break; + case SCSI_CAP_WIDE_XFER: + rval = 1; + break; + case SCSI_CAP_TAGGED_QING: + rval = 1; + break; + case SCSI_CAP_UNTAGGED_QING: + rval = 1; + break; + case SCSI_CAP_PARITY: + rval = 1; + break; + case SCSI_CAP_INITIATOR_ID: + rval = instance->init_id; + break; + case SCSI_CAP_ARQ: + rval = 1; + break; + case SCSI_CAP_LINKED_CMDS: + rval = 0; + break; + case SCSI_CAP_RESET_NOTIFICATION: + rval = 1; + break; + case SCSI_CAP_GEOMETRY: + rval = -1; + + break; + default: + con_log(CL_DLEVEL2, (CE_NOTE, "Default cap coming 0x%x", + scsi_hba_lookup_capstr(cap))); + rval = -1; + break; + } + + return (rval); +} + +/*ARGSUSED*/ +static int +drsas_tran_setcap(struct scsi_address *ap, char *cap, int value, int whom) +{ + int rval = 1; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* We don't allow setting capabilities for other targets */ + if (cap == NULL || whom == 0) { + return (-1); + } + + switch (scsi_hba_lookup_capstr(cap)) { + case SCSI_CAP_DMA_MAX: + case SCSI_CAP_MSG_OUT: + case SCSI_CAP_PARITY: + case SCSI_CAP_LINKED_CMDS: + case SCSI_CAP_RESET_NOTIFICATION: + case SCSI_CAP_DISCONNECT: + case SCSI_CAP_SYNCHRONOUS: + case SCSI_CAP_UNTAGGED_QING: + case SCSI_CAP_WIDE_XFER: + case SCSI_CAP_INITIATOR_ID: + case SCSI_CAP_ARQ: + /* + * None of these are settable via + * the capability interface. 
+ */ + break; + case SCSI_CAP_TAGGED_QING: + rval = 1; + break; + case SCSI_CAP_SECTOR_SIZE: + rval = 1; + break; + + case SCSI_CAP_TOTAL_SECTORS: + rval = 1; + break; + default: + rval = -1; + break; + } + + return (rval); +} + +static void +drsas_tran_destroy_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + acmd->cmd_flags &= ~CFLAG_DMAVALID; + + (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle); + + ddi_dma_free_handle(&acmd->cmd_dmahandle); + + acmd->cmd_dmahandle = NULL; + } + + /* free the pkt */ + scsi_hba_pkt_free(ap, pkt); +} + +/*ARGSUSED*/ +static void +drsas_tran_dmafree(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + register struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + acmd->cmd_flags &= ~CFLAG_DMAVALID; + + (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle); + + ddi_dma_free_handle(&acmd->cmd_dmahandle); + + acmd->cmd_dmahandle = NULL; + } +} + +/*ARGSUSED*/ +static void +drsas_tran_sync_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + register struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, acmd->cmd_dma_offset, + acmd->cmd_dma_len, (acmd->cmd_flags & CFLAG_DMASEND) ? + DDI_DMA_SYNC_FORDEV : DDI_DMA_SYNC_FORCPU); + } +} + +/* + * drsas_isr(caddr_t) + * + * The Interrupt Service Routine + * + * Collect status for all completed commands and do callback + * + */ +static uint_t +drsas_isr(struct drsas_instance *instance) +{ + int need_softintr; + uint32_t producer; + uint32_t consumer; + uint32_t context; + + struct drsas_cmd *cmd; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + ASSERT(instance); + if ((instance->intr_type == DDI_INTR_TYPE_FIXED) && + !instance->func_ptr->intr_ack(instance)) { + return (DDI_INTR_UNCLAIMED); + } + + (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + producer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + instance->producer); + consumer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + instance->consumer); + + con_log(CL_ANN1, (CE_CONT, " producer %x consumer %x ", + producer, consumer)); + if (producer == consumer) { + con_log(CL_ANN1, (CE_WARN, "producer = consumer case")); + return (DDI_INTR_UNCLAIMED); + } + mutex_enter(&instance->completed_pool_mtx); + + while (consumer != producer) { + context = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + &instance->reply_queue[consumer]); + cmd = instance->cmd_list[context]; + mlist_add_tail(&cmd->list, &instance->completed_pool_list); + + consumer++; + if (consumer == (instance->max_fw_cmds + 1)) { + consumer = 0; + } + } + + mutex_exit(&instance->completed_pool_mtx); + + ddi_put32(instance->mfi_internal_dma_obj.acc_handle, + instance->consumer, consumer); + (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORDEV); + + if (instance->softint_running) { + need_softintr = 0; + } else { + need_softintr = 1; + } + + if 
(instance->isr_level == HIGH_LEVEL_INTR) { + if (need_softintr) { + ddi_trigger_softintr(instance->soft_intr_id); + } + } else { + /* + * Not a high-level interrupt, therefore call the soft level + * interrupt explicitly + */ + (void) drsas_softintr(instance); + } + + return (DDI_INTR_CLAIMED); +} + + +/* + * ************************************************************************** * + * * + * libraries * + * * + * ************************************************************************** * + */ +/* + * get_mfi_pkt : Get a command from the free pool + * After successful allocation, the caller of this routine + * must clear the frame buffer (memset to zero) before + * using the packet further. + * + * ***** Note ***** + * After clearing the frame buffer the context id of the + * frame buffer SHOULD be restored back. + */ +static struct drsas_cmd * +get_mfi_pkt(struct drsas_instance *instance) +{ + mlist_t *head = &instance->cmd_pool_list; + struct drsas_cmd *cmd = NULL; + + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + if (!mlist_empty(head)) { + cmd = mlist_entry(head->next, struct drsas_cmd, list); + mlist_del_init(head->next); + } + if (cmd != NULL) + cmd->pkt = NULL; + mutex_exit(&instance->cmd_pool_mtx); + + return (cmd); +} + +/* + * return_mfi_pkt : Return a cmd to free command pool + */ +static void +return_mfi_pkt(struct drsas_instance *instance, struct drsas_cmd *cmd) +{ + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + mlist_add(&cmd->list, &instance->cmd_pool_list); + + mutex_exit(&instance->cmd_pool_mtx); +} + +/* + * destroy_mfi_frame_pool + */ +static void +destroy_mfi_frame_pool(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd = instance->max_fw_cmds; + + struct drsas_cmd *cmd; + + /* return all frames to pool */ + for (i = 0; i < max_cmd+1; i++) { + + cmd = instance->cmd_list[i]; + + if (cmd->frame_dma_obj_status == DMA_OBJ_ALLOCATED) + (void) drsas_free_dma_obj(instance, cmd->frame_dma_obj); + + cmd->frame_dma_obj_status = DMA_OBJ_FREED; + } + +} + +/* + * create_mfi_frame_pool + */ +static int +create_mfi_frame_pool(struct drsas_instance *instance) +{ + int i = 0; + int cookie_cnt; + uint16_t max_cmd; + uint16_t sge_sz; + uint32_t sgl_sz; + uint32_t tot_frame_size; + + struct drsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + sge_sz = sizeof (struct drsas_sge64); + + /* calculated the number of 64byte frames required for SGL */ + sgl_sz = sge_sz * instance->max_num_sge; + tot_frame_size = sgl_sz + MRMFI_FRAME_SIZE + SENSE_LENGTH; + + con_log(CL_DLEVEL3, (CE_NOTE, "create_mfi_frame_pool: " + "sgl_sz %x tot_frame_size %x", sgl_sz, tot_frame_size)); + + while (i < max_cmd+1) { + cmd = instance->cmd_list[i]; + + cmd->frame_dma_obj.size = tot_frame_size; + cmd->frame_dma_obj.dma_attr = drsas_generic_dma_attr; + cmd->frame_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + cmd->frame_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + cmd->frame_dma_obj.dma_attr.dma_attr_sgllen = 1; + cmd->frame_dma_obj.dma_attr.dma_attr_align = 64; + + + cookie_cnt = drsas_alloc_dma_obj(instance, &cmd->frame_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC); + + if (cookie_cnt == -1 || cookie_cnt > 1) { + con_log(CL_ANN, (CE_WARN, + "create_mfi_frame_pool: could not alloc.")); + return (DDI_FAILURE); + } + + bzero(cmd->frame_dma_obj.buffer, tot_frame_size); + + cmd->frame_dma_obj_status = DMA_OBJ_ALLOCATED; + cmd->frame = (union drsas_frame *)cmd->frame_dma_obj.buffer; + cmd->frame_phys_addr = 
+ cmd->frame_dma_obj.dma_cookie[0].dmac_address; + + cmd->sense = (uint8_t *)(((unsigned long) + cmd->frame_dma_obj.buffer) + + tot_frame_size - SENSE_LENGTH); + cmd->sense_phys_addr = + cmd->frame_dma_obj.dma_cookie[0].dmac_address + + tot_frame_size - SENSE_LENGTH; + + if (!cmd->frame || !cmd->sense) { + con_log(CL_ANN, (CE_NOTE, + "dr_sas: pci_pool_alloc failed")); + + return (ENOMEM); + } + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.context, cmd->index); + i++; + + con_log(CL_DLEVEL3, (CE_NOTE, "[%x]-%x", + cmd->index, cmd->frame_phys_addr)); + } + + return (DDI_SUCCESS); +} + +/* + * free_additional_dma_buffer + */ +static void +free_additional_dma_buffer(struct drsas_instance *instance) +{ + if (instance->mfi_internal_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) drsas_free_dma_obj(instance, + instance->mfi_internal_dma_obj); + instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED; + } + + if (instance->mfi_evt_detail_obj.status == DMA_OBJ_ALLOCATED) { + (void) drsas_free_dma_obj(instance, + instance->mfi_evt_detail_obj); + instance->mfi_evt_detail_obj.status = DMA_OBJ_FREED; + } +} + +/* + * alloc_additional_dma_buffer + */ +static int +alloc_additional_dma_buffer(struct drsas_instance *instance) +{ + uint32_t reply_q_sz; + uint32_t internal_buf_size = PAGESIZE*2; + + /* max cmds plus 1 + producer & consumer */ + reply_q_sz = sizeof (uint32_t) * (instance->max_fw_cmds + 1 + 2); + + instance->mfi_internal_dma_obj.size = internal_buf_size; + instance->mfi_internal_dma_obj.dma_attr = drsas_generic_dma_attr; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_sgllen = 1; + + if (drsas_alloc_dma_obj(instance, &instance->mfi_internal_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: could not alloc reply queue")); + return (DDI_FAILURE); + } + + bzero(instance->mfi_internal_dma_obj.buffer, internal_buf_size); + + instance->mfi_internal_dma_obj.status |= DMA_OBJ_ALLOCATED; + + instance->producer = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer); + instance->consumer = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer + 4); + instance->reply_queue = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer + 8); + instance->internal_buf = (caddr_t)(((unsigned long) + instance->mfi_internal_dma_obj.buffer) + reply_q_sz + 8); + instance->internal_buf_dmac_add = + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + + (reply_q_sz + 8); + instance->internal_buf_size = internal_buf_size - + (reply_q_sz + 8); + + /* allocate evt_detail */ + instance->mfi_evt_detail_obj.size = sizeof (struct drsas_evt_detail); + instance->mfi_evt_detail_obj.dma_attr = drsas_generic_dma_attr; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_sgllen = 1; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_align = 1; + + if (drsas_alloc_dma_obj(instance, &instance->mfi_evt_detail_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "alloc_additional_dma_buffer: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + bzero(instance->mfi_evt_detail_obj.buffer, + sizeof (struct drsas_evt_detail)); + + instance->mfi_evt_detail_obj.status |= DMA_OBJ_ALLOCATED; + + return 
(DDI_SUCCESS); +} + +/* + * free_space_for_mfi + */ +static void +free_space_for_mfi(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd = instance->max_fw_cmds; + + /* already freed */ + if (instance->cmd_list == NULL) { + return; + } + + free_additional_dma_buffer(instance); + + /* first free the MFI frame pool */ + destroy_mfi_frame_pool(instance); + + /* free all the commands in the cmd_list */ + for (i = 0; i < instance->max_fw_cmds+1; i++) { + kmem_free(instance->cmd_list[i], + sizeof (struct drsas_cmd)); + + instance->cmd_list[i] = NULL; + } + + /* free the cmd_list buffer itself */ + kmem_free(instance->cmd_list, + sizeof (struct drsas_cmd *) * (max_cmd+1)); + + instance->cmd_list = NULL; + + INIT_LIST_HEAD(&instance->cmd_pool_list); +} + +/* + * alloc_space_for_mfi + */ +static int +alloc_space_for_mfi(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd; + size_t sz; + + struct drsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + /* reserve 1 more slot for flush_cache */ + sz = sizeof (struct drsas_cmd *) * (max_cmd+1); + + /* + * instance->cmd_list is an array of struct drsas_cmd pointers. + * Allocate the dynamic array first and then allocate individual + * commands. + */ + instance->cmd_list = kmem_zalloc(sz, KM_SLEEP); + ASSERT(instance->cmd_list); + + for (i = 0; i < max_cmd+1; i++) { + instance->cmd_list[i] = kmem_zalloc(sizeof (struct drsas_cmd), + KM_SLEEP); + ASSERT(instance->cmd_list[i]); + } + + INIT_LIST_HEAD(&instance->cmd_pool_list); + + /* add all the commands to command pool (instance->cmd_pool) */ + for (i = 0; i < max_cmd; i++) { + cmd = instance->cmd_list[i]; + cmd->index = i; + + mlist_add_tail(&cmd->list, &instance->cmd_pool_list); + } + + /* single slot for flush_cache won't be added in command pool */ + cmd = instance->cmd_list[max_cmd]; + cmd->index = i; + + /* create a frame pool and assign one frame to each cmd */ + if (create_mfi_frame_pool(instance)) { + con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); + return (DDI_FAILURE); + } + + /* create a frame pool and assign one frame to each cmd */ + if (alloc_additional_dma_buffer(instance)) { + con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * get_ctrl_info + */ +static int +get_ctrl_info(struct drsas_instance *instance, + struct drsas_ctrl_info *ctrl_info) +{ + int ret = 0; + + struct drsas_cmd *cmd; + struct drsas_dcmd_frame *dcmd; + struct drsas_ctrl_info *ci; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, + "Failed to get a cmd for ctrl info")); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + ci = (struct drsas_ctrl_info *)instance->internal_buf; + + if (!ci) { + con_log(CL_ANN, (CE_WARN, + "Failed to alloc mem for ctrl info")); + return_mfi_pkt(instance, cmd); + return (DDI_FAILURE); + } + + (void) memset(ci, 0, sizeof (struct drsas_ctrl_info)); + + /* for( i = 0; i < DCMD_MBOX_SZ; i++ ) dcmd->mbox.b[i] = 0; */ + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, 
&dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_ctrl_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + instance->internal_buf_dmac_add); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_ctrl_info)); + + cmd->frame_count = 1; + + if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + ret = 0; + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, + (uint8_t *)ctrl_info, (uint8_t *)ci, + sizeof (struct drsas_ctrl_info), DDI_DEV_AUTOINCR); + } else { + con_log(CL_ANN, (CE_WARN, "get_ctrl_info: Ctrl info failed")); + ret = -1; + } + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + ret = -1; + } + + return (ret); +} + +/* + * abort_aen_cmd + */ +static int +abort_aen_cmd(struct drsas_instance *instance, + struct drsas_cmd *cmd_to_abort) +{ + int ret = 0; + + struct drsas_cmd *cmd; + struct drsas_abort_frame *abort_fr; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, + "Failed to get a cmd for ctrl info")); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + abort_fr = &cmd->frame->abort; + + /* prepare and issue the abort frame */ + ddi_put8(cmd->frame_dma_obj.acc_handle, + &abort_fr->cmd, MFI_CMD_OP_ABORT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &abort_fr->cmd_status, + MFI_CMD_STATUS_SYNC_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &abort_fr->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &abort_fr->abort_context, + cmd_to_abort->index); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_lo, cmd_to_abort->frame_phys_addr); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_hi, 0); + + instance->aen_cmd->abort_aen = 1; + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "abort_aen_cmd: issue_cmd_in_sync_mode failed")); + ret = -1; + } else { + ret = 0; + } + + instance->aen_cmd->abort_aen = 1; + instance->aen_cmd = 0; + + return_mfi_pkt(instance, cmd); + (void) drsas_common_check(instance, cmd); + + return (ret); +} + +/* + * init_mfi + */ +static int +init_mfi(struct drsas_instance *instance) +{ + struct drsas_cmd *cmd; + struct drsas_ctrl_info ctrl_info; + struct drsas_init_frame *init_frame; + struct drsas_init_queue_info *initq_info; + + /* we expect the FW state to be READY */ + if (mfi_state_transition_to_ready(instance)) { + con_log(CL_ANN, (CE_WARN, "dr_sas: F/W is not ready")); + goto fail_ready_state; + } + + /* get various operational parameters from status register */ + instance->max_num_sge = + (instance->func_ptr->read_fw_status_reg(instance) & + 0xFF0000) >> 0x10; + /* + * Reduce the max supported cmds by 1. 
This is to ensure that the + * reply_q_sz (1 more than the max cmd that driver may send) + * does not exceed max cmds that the FW can support + */ + instance->max_fw_cmds = + instance->func_ptr->read_fw_status_reg(instance) & 0xFFFF; + instance->max_fw_cmds = instance->max_fw_cmds - 1; + + instance->max_num_sge = + (instance->max_num_sge > DRSAS_MAX_SGE_CNT) ? + DRSAS_MAX_SGE_CNT : instance->max_num_sge; + + /* create a pool of commands */ + if (alloc_space_for_mfi(instance) != DDI_SUCCESS) + goto fail_alloc_fw_space; + + /* + * Prepare a init frame. Note the init frame points to queue info + * structure. Each frame has SGL allocated after first 64 bytes. For + * this frame - since we don't need any SGL - we use SGL's space as + * queue info structure + */ + cmd = get_mfi_pkt(instance); + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + init_frame = (struct drsas_init_frame *)cmd->frame; + initq_info = (struct drsas_init_queue_info *) + ((unsigned long)init_frame + 64); + + (void) memset(init_frame, 0, MRMFI_FRAME_SIZE); + (void) memset(initq_info, 0, sizeof (struct drsas_init_queue_info)); + + ddi_put32(cmd->frame_dma_obj.acc_handle, &initq_info->init_flags, 0); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_entries, instance->max_fw_cmds + 1); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->producer_index_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->producer_index_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->consumer_index_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->consumer_index_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 4); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_start_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_start_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 8); + + ddi_put8(cmd->frame_dma_obj.acc_handle, + &init_frame->cmd, MFI_CMD_OP_INIT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &init_frame->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &init_frame->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &init_frame->queue_info_new_phys_addr_lo, + cmd->frame_phys_addr + 64); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &init_frame->queue_info_new_phys_addr_hi, 0); + + ddi_put32(cmd->frame_dma_obj.acc_handle, &init_frame->data_xfer_len, + sizeof (struct drsas_init_queue_info)); + + cmd->frame_count = 1; + + /* issue the init frame in polled mode */ + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "failed to init firmware")); + goto fail_fw_init; + } + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + goto fail_fw_init; + } + + /* gather misc FW related information */ + if (!get_ctrl_info(instance, &ctrl_info)) { + instance->max_sectors_per_req = ctrl_info.max_request_size; + con_log(CL_ANN1, (CE_NOTE, "product name %s ld present %d", + ctrl_info.product_name, ctrl_info.ld_present_count)); + } else { + instance->max_sectors_per_req = instance->max_num_sge * + PAGESIZE / 512; + } + + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + goto 
fail_fw_init; + } + + return (DDI_SUCCESS); + +fail_fw_init: +fail_alloc_fw_space: + + free_space_for_mfi(instance); + +fail_ready_state: + ddi_regs_map_free(&instance->regmap_handle); + +fail_mfi_reg_setup: + return (DDI_FAILURE); +} + +/* + * mfi_state_transition_to_ready : Move the FW to READY state + * + * @reg_set : MFI register set + */ +static int +mfi_state_transition_to_ready(struct drsas_instance *instance) +{ + int i; + uint8_t max_wait; + uint32_t fw_ctrl; + uint32_t fw_state; + uint32_t cur_state; + + fw_state = + instance->func_ptr->read_fw_status_reg(instance) & MFI_STATE_MASK; + con_log(CL_ANN1, (CE_NOTE, + "mfi_state_transition_to_ready:FW state = 0x%x", fw_state)); + + while (fw_state != MFI_STATE_READY) { + con_log(CL_ANN, (CE_NOTE, + "mfi_state_transition_to_ready:FW state%x", fw_state)); + + switch (fw_state) { + case MFI_STATE_FAULT: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW in FAULT state!!")); + + return (ENODEV); + case MFI_STATE_WAIT_HANDSHAKE: + /* set the CLR bit in IMR0 */ + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW waiting for HANDSHAKE")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) + * to be set + */ + /* WR_IB_MSG_0(MFI_INIT_CLEAR_HANDSHAKE, instance); */ + WR_IB_DOORBELL(MFI_INIT_CLEAR_HANDSHAKE | + MFI_INIT_HOTPLUG, instance); + + max_wait = 2; + cur_state = MFI_STATE_WAIT_HANDSHAKE; + break; + case MFI_STATE_BOOT_MESSAGE_PENDING: + /* set the CLR bit in IMR0 */ + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW state boot message pending")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) + * to be set + */ + WR_IB_DOORBELL(MFI_INIT_HOTPLUG, instance); + + max_wait = 10; + cur_state = MFI_STATE_BOOT_MESSAGE_PENDING; + break; + case MFI_STATE_OPERATIONAL: + /* bring it to READY state; assuming max wait 2 secs */ + instance->func_ptr->disable_intr(instance); + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: FW in OPERATIONAL state")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_READY | MFI_INIT_MFIMODE | MFI_INIT_ABORT) + * to be set + */ + /* WR_IB_DOORBELL(MFI_INIT_READY, instance); */ + WR_IB_DOORBELL(MFI_RESET_FLAGS, instance); + + max_wait = 10; + cur_state = MFI_STATE_OPERATIONAL; + break; + case MFI_STATE_UNDEFINED: + /* this state should not last for more than 2 seconds */ + con_log(CL_ANN, (CE_NOTE, "FW state undefined")); + + max_wait = 2; + cur_state = MFI_STATE_UNDEFINED; + break; + case MFI_STATE_BB_INIT: + max_wait = 2; + cur_state = MFI_STATE_BB_INIT; + break; + case MFI_STATE_FW_INIT: + max_wait = 2; + cur_state = MFI_STATE_FW_INIT; + break; + case MFI_STATE_DEVICE_SCAN: + max_wait = 10; + cur_state = MFI_STATE_DEVICE_SCAN; + break; + default: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: Unknown state 0x%x", fw_state)); + return (ENODEV); + } + + /* the cur_state should not last for more than max_wait secs */ + for (i = 0; i < (max_wait * MILLISEC); i++) { + /* fw_state = RD_OB_MSG_0(instance) & MFI_STATE_MASK; */ + fw_state = + instance->func_ptr->read_fw_status_reg(instance) & + MFI_STATE_MASK; + + if (fw_state == cur_state) { + delay(1 * drv_usectohz(MILLISEC)); + } else { + break; + } + } + + /* return error if fw_state hasn't changed after max_wait */ + if (fw_state == cur_state) { + con_log(CL_ANN, (CE_NOTE, + "FW state hasn't changed in %d secs", max_wait)); + return (ENODEV); + } + }; + + fw_ctrl = RD_IB_DOORBELL(instance); + + con_log(CL_ANN1, (CE_NOTE, + "mfi_state_transition_to_ready:FW ctrl = 0x%x", fw_ctrl)); + + /* + * Write 0xF to the doorbell register to do 
the following. + * - Abort all outstanding commands (bit 0). + * - Transition from OPERATIONAL to READY state (bit 1). + * - Discard (possible) low MFA posted in 64-bit mode (bit-2). + * - Set to release FW to continue running (i.e. BIOS handshake + * (bit 3). + */ + WR_IB_DOORBELL(0xF, instance); + + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + return (ENODEV); + } + return (DDI_SUCCESS); +} + +/* + * get_seq_num + */ +static int +get_seq_num(struct drsas_instance *instance, + struct drsas_evt_log_info *eli) +{ + int ret = DDI_SUCCESS; + + dma_obj_t dcmd_dma_obj; + struct drsas_cmd *cmd; + struct drsas_dcmd_frame *dcmd; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + cmn_err(CE_WARN, "dr_sas: failed to get a cmd"); + return (ENOMEM); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + /* allocate the data transfer buffer */ + dcmd_dma_obj.size = sizeof (struct drsas_evt_log_info); + dcmd_dma_obj.dma_attr = drsas_generic_dma_attr; + dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "get_seq_num: could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + (void) memset(dcmd_dma_obj.buffer, 0, + sizeof (struct drsas_evt_log_info)); + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_evt_log_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_EVENT_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_evt_log_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + dcmd_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + cmn_err(CE_WARN, "get_seq_num: " + "failed to issue DRSAS_DCMD_CTRL_EVENT_GET_INFO"); + ret = DDI_FAILURE; + } else { + /* copy the data back into callers buffer */ + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, (uint8_t *)eli, + (uint8_t *)dcmd_dma_obj.buffer, + sizeof (struct drsas_evt_log_info), DDI_DEV_AUTOINCR); + ret = DDI_SUCCESS; + } + + if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) + ret = DDI_FAILURE; + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + ret = DDI_FAILURE; + } + return (ret); +} + +/* + * start_mfi_aen + */ +static int +start_mfi_aen(struct drsas_instance *instance) +{ + int ret = 0; + + struct drsas_evt_log_info eli; + union drsas_evt_class_locale class_locale; + + /* get the latest sequence number from FW */ + (void) memset(&eli, 0, sizeof (struct drsas_evt_log_info)); + + if (get_seq_num(instance, &eli)) { + cmn_err(CE_WARN, 
"start_mfi_aen: failed to get seq num"); + return (-1); + } + + /* register AEN with FW for latest sequence number plus 1 */ + class_locale.members.reserved = 0; + class_locale.members.locale = DR_EVT_LOCALE_ALL; + class_locale.members.class = DR_EVT_CLASS_INFO; + ret = register_mfi_aen(instance, eli.newest_seq_num + 1, + class_locale.word); + + if (ret) { + cmn_err(CE_WARN, "start_mfi_aen: aen registration failed"); + return (-1); + } + + return (ret); +} + +/* + * flush_cache + */ +static void +flush_cache(struct drsas_instance *instance) +{ + struct drsas_cmd *cmd = NULL; + struct drsas_dcmd_frame *dcmd; + uint32_t max_cmd = instance->max_fw_cmds; + + cmd = instance->cmd_list[max_cmd]; + + if (cmd == NULL) + return; + + dcmd = &cmd->frame->dcmd; + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 0); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_NONE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_CACHE_FLUSH); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.b[0], + DR_FLUSH_CTRL_CACHE | DR_FLUSH_DISK_CACHE); + + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN1, (CE_WARN, + "flush_cache: failed to issue MFI_DCMD_CTRL_CACHE_FLUSH")); + } + con_log(CL_DLEVEL1, (CE_NOTE, "done")); +} + +/* + * service_mfi_aen- Completes an AEN command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + */ +static void +service_mfi_aen(struct drsas_instance *instance, struct drsas_cmd *cmd) +{ + uint32_t seq_num; + struct drsas_evt_detail *evt_detail = + (struct drsas_evt_detail *)instance->mfi_evt_detail_obj.buffer; + int rval = 0; + int tgt = 0; + ddi_acc_handle_t acc_handle; + + acc_handle = cmd->frame_dma_obj.acc_handle; + + cmd->cmd_status = ddi_get8(acc_handle, &cmd->frame->io.cmd_status); + + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + + /* + * log the MFI AEN event to the sysevent queue so that + * application will get noticed + */ + if (ddi_log_sysevent(instance->dip, DDI_VENDOR_LSI, "LSIMEGA", "SAS", + NULL, NULL, DDI_NOSLEEP) != DDI_SUCCESS) { + int instance_no = ddi_get_instance(instance->dip); + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Failed to log AEN event", instance_no)); + } + /* + * Check for any ld devices that has changed state. i.e. online + * or offline. 
+ */ + con_log(CL_ANN1, (CE_NOTE, + "AEN: code = %x class = %x locale = %x args = %x", + ddi_get32(acc_handle, &evt_detail->code), + evt_detail->cl.members.class, + ddi_get16(acc_handle, &evt_detail->cl.members.locale), + ddi_get8(acc_handle, &evt_detail->arg_type))); + + switch (ddi_get32(acc_handle, &evt_detail->code)) { + case DR_EVT_CFG_CLEARED: { + for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) { + if (instance->dr_ld_list[tgt].dip != NULL) { + rval = drsas_service_evt(instance, tgt, 0, + DRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, + "dr_sas: CFG CLEARED AEN rval = %d " + "tgt id = %d", rval, tgt)); + } + } + break; + } + + case DR_EVT_LD_DELETED: { + rval = drsas_service_evt(instance, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0, + DRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "dr_sas: LD DELETED AEN rval = %d " + "tgt id = %d index = %d", rval, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), + ddi_get8(acc_handle, &evt_detail->args.ld.ld_index))); + break; + } /* End of DR_EVT_LD_DELETED */ + + case DR_EVT_LD_CREATED: { + rval = drsas_service_evt(instance, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0, + DRSAS_EVT_CONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "dr_sas: LD CREATED AEN rval = %d " + "tgt id = %d index = %d", rval, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), + ddi_get8(acc_handle, &evt_detail->args.ld.ld_index))); + break; + } /* End of DR_EVT_LD_CREATED */ + } /* End of Main Switch */ + + /* get copy of seq_num and class/locale for re-registration */ + seq_num = ddi_get32(acc_handle, &evt_detail->seq_num); + seq_num++; + (void) memset(instance->mfi_evt_detail_obj.buffer, 0, + sizeof (struct drsas_evt_detail)); + + ddi_put8(acc_handle, &cmd->frame->dcmd.cmd_status, 0x0); + ddi_put32(acc_handle, &cmd->frame->dcmd.mbox.w[0], seq_num); + + instance->aen_seq_num = seq_num; + + cmd->frame_count = 1; + + /* Issue the aen registration frame */ + instance->func_ptr->issue_cmd(cmd, instance); +} + +/* + * complete_cmd_in_sync_mode - Completes an internal command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + * The issue_cmd_in_sync_mode() function waits for a command to complete + * after it issues a command. This function wakes up that waiting routine by + * calling wake_up() on the wait queue. 
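+ * In this port the waiting thread blocks on the int_cmd_cv condition
+ * variable, so the wake-up is done with cv_broadcast() rather than a
+ * Linux-style wait queue.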
+ */ +static void +complete_cmd_in_sync_mode(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + cmd->cmd_status = ddi_get8(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.cmd_status); + + cmd->sync_cmd = DRSAS_FALSE; + + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + + cv_broadcast(&instance->int_cmd_cv); +} + +/* + * drsas_softintr - The Software ISR + * @param arg : HBA soft state + * + * called from high-level interrupt if hi-level interrupt are not there, + * otherwise triggered as a soft interrupt + */ +static uint_t +drsas_softintr(struct drsas_instance *instance) +{ + struct scsi_pkt *pkt; + struct scsa_cmd *acmd; + struct drsas_cmd *cmd; + struct mlist_head *pos, *next; + mlist_t process_list; + struct drsas_header *hdr; + struct scsi_arq_status *arqstat; + + con_log(CL_ANN1, (CE_CONT, "drsas_softintr called")); + + ASSERT(instance); + mutex_enter(&instance->completed_pool_mtx); + + if (mlist_empty(&instance->completed_pool_list)) { + mutex_exit(&instance->completed_pool_mtx); + return (DDI_INTR_UNCLAIMED); + } + + instance->softint_running = 1; + + INIT_LIST_HEAD(&process_list); + mlist_splice(&instance->completed_pool_list, &process_list); + INIT_LIST_HEAD(&instance->completed_pool_list); + + mutex_exit(&instance->completed_pool_mtx); + + /* perform all callbacks first, before releasing the SCBs */ + mlist_for_each_safe(pos, next, &process_list) { + cmd = mlist_entry(pos, struct drsas_cmd, list); + + /* syncronize the Cmd frame for the controller */ + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + DDI_SUCCESS) { + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + hdr = &cmd->frame->hdr; + + /* remove the internal command from the process list */ + mlist_del_init(&cmd->list); + + switch (ddi_get8(cmd->frame_dma_obj.acc_handle, &hdr->cmd)) { + case MFI_CMD_OP_PD_SCSI: + case MFI_CMD_OP_LD_SCSI: + case MFI_CMD_OP_LD_READ: + case MFI_CMD_OP_LD_WRITE: + /* + * MFI_CMD_OP_PD_SCSI and MFI_CMD_OP_LD_SCSI + * could have been issued either through an + * IO path or an IOCTL path. If it was via IOCTL, + * we will send it to internal completion. + */ + if (cmd->sync_cmd == DRSAS_TRUE) { + complete_cmd_in_sync_mode(instance, cmd); + break; + } + + /* regular commands */ + acmd = cmd->cmd; + pkt = CMD2PKT(acmd); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, + acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state = STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA | STATE_GOT_STATUS; + + con_log(CL_ANN1, (CE_CONT, + "CDB[0] = %x completed for %s: size %lx context %x", + pkt->pkt_cdbp[0], ((acmd->islogical) ? 
"LD" : "PD"), + acmd->cmd_dmacount, hdr->context)); + + if (pkt->pkt_cdbp[0] == SCMD_INQUIRY) { + struct scsi_inquiry *inq; + + if (acmd->cmd_dmacount != 0) { + bp_mapin(acmd->cmd_buf); + inq = (struct scsi_inquiry *) + acmd->cmd_buf->b_un.b_addr; + + /* don't expose physical drives to OS */ + if (acmd->islogical && + (hdr->cmd_status == MFI_STAT_OK)) { + display_scsi_inquiry( + (caddr_t)inq); + } else if ((hdr->cmd_status == + MFI_STAT_OK) && inq->inq_dtype == + DTYPE_DIRECT) { + + display_scsi_inquiry( + (caddr_t)inq); + + /* for physical disk */ + hdr->cmd_status = + MFI_STAT_DEVICE_NOT_FOUND; + } + } + } + + switch (hdr->cmd_status) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_CC_IN_PROGRESS: + case MFI_STAT_LD_RECON_IN_PROGRESS: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_INIT_IN_PROGRESS: + con_log(CL_ANN, + (CE_WARN, "Initialization in Progress")); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + case MFI_STAT_SCSI_DONE_WITH_ERROR: + con_log(CL_ANN1, (CE_CONT, "scsi_done error")); + + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *) + pkt->pkt_scbp)->sts_chk = 1; + + if (pkt->pkt_cdbp[0] == SCMD_TEST_UNIT_READY) { + + con_log(CL_ANN, + (CE_WARN, "TEST_UNIT_READY fail")); + + } else { + pkt->pkt_state |= STATE_ARQ_DONE; + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= + STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + ddi_rep_get8( + cmd->frame_dma_obj.acc_handle, + (uint8_t *) + &(arqstat->sts_sensedata), + cmd->sense, + acmd->cmd_scblen - + offsetof(struct scsi_arq_status, + sts_sensedata), DDI_DEV_AUTOINCR); + } + break; + case MFI_STAT_LD_OFFLINE: + case MFI_STAT_DEVICE_NOT_FOUND: + con_log(CL_ANN1, (CE_CONT, + "device not found error")); + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + case MFI_STAT_LD_LBA_OUT_OF_RANGE: + pkt->pkt_state |= STATE_ARQ_DONE; + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *) + pkt->pkt_scbp)->sts_chk = 1; + + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + + arqstat->sts_sensedata.es_valid = 1; + arqstat->sts_sensedata.es_key = + KEY_ILLEGAL_REQUEST; + arqstat->sts_sensedata.es_class = + CLASS_EXTENDED_SENSE; + + /* + * LOGICAL BLOCK ADDRESS OUT OF RANGE: + * ASC: 0x21h; ASCQ: 0x00h; + */ + arqstat->sts_sensedata.es_add_code = 0x21; + arqstat->sts_sensedata.es_qual_code = 0x00; + + break; + + default: + con_log(CL_ANN, (CE_CONT, "Unknown status!")); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + } + + atomic_add_16(&instance->fw_outstanding, (-1)); + + return_mfi_pkt(instance, cmd); + + (void) drsas_common_check(instance, cmd); + + if (acmd->cmd_dmahandle) { + if (drsas_check_dma_handle( + acmd->cmd_dmahandle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, + DDI_SERVICE_UNAFFECTED); + pkt->pkt_reason = CMD_TRAN_ERR; + pkt->pkt_statistics = 0; + } + } + + /* Call the callback routine */ + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && + pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + break; + case MFI_CMD_OP_SMP: + case MFI_CMD_OP_STP: + complete_cmd_in_sync_mode(instance, cmd); + break; + case MFI_CMD_OP_DCMD: + /* see if got an event 
notification */ + if (ddi_get32(cmd->frame_dma_obj.acc_handle, + &cmd->frame->dcmd.opcode) == + DR_DCMD_CTRL_EVENT_WAIT) { + if ((instance->aen_cmd == cmd) && + (instance->aen_cmd->abort_aen)) { + con_log(CL_ANN, (CE_WARN, + "drsas_softintr: " + "aborted_aen returned")); + } else { + atomic_add_16(&instance->fw_outstanding, + (-1)); + service_mfi_aen(instance, cmd); + } + } else { + complete_cmd_in_sync_mode(instance, cmd); + } + + break; + case MFI_CMD_OP_ABORT: + con_log(CL_ANN, (CE_WARN, "MFI_CMD_OP_ABORT complete")); + /* + * MFI_CMD_OP_ABORT successfully completed + * in the synchronous mode + */ + complete_cmd_in_sync_mode(instance, cmd); + break; + default: + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + if (cmd->pkt != NULL) { + pkt = cmd->pkt; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && + pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + } + con_log(CL_ANN, (CE_WARN, "Cmd type unknown !")); + break; + } + } + + instance->softint_running = 0; + + return (DDI_INTR_CLAIMED); +} + +/* + * drsas_alloc_dma_obj + * + * Allocate the memory and other resources for an dma object. + */ +static int +drsas_alloc_dma_obj(struct drsas_instance *instance, dma_obj_t *obj, + uchar_t endian_flags) +{ + int i; + size_t alen = 0; + uint_t cookie_cnt; + struct ddi_device_acc_attr tmp_endian_attr; + + tmp_endian_attr = endian_attr; + tmp_endian_attr.devacc_attr_endian_flags = endian_flags; + + i = ddi_dma_alloc_handle(instance->dip, &obj->dma_attr, + DDI_DMA_SLEEP, NULL, &obj->dma_handle); + if (i != DDI_SUCCESS) { + + switch (i) { + case DDI_DMA_BADATTR : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle- Bad attribute")); + break; + case DDI_DMA_NORESOURCES : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle- No Resources")); + break; + default : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle: " + "unknown status %d", i)); + break; + } + + return (-1); + } + + if ((ddi_dma_mem_alloc(obj->dma_handle, obj->size, &tmp_endian_attr, + DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL, + &obj->buffer, &alen, &obj->acc_handle) != DDI_SUCCESS) || + alen < obj->size) { + + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_mem_alloc")); + + return (-1); + } + + if (ddi_dma_addr_bind_handle(obj->dma_handle, NULL, obj->buffer, + obj->size, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, + NULL, &obj->dma_cookie[0], &cookie_cnt) != DDI_SUCCESS) { + + ddi_dma_mem_free(&obj->acc_handle); + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_addr_bind_handle")); + + return (-1); + } + + if (drsas_check_dma_handle(obj->dma_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (-1); + } + + if (drsas_check_acc_handle(obj->acc_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (-1); + } + + return (cookie_cnt); +} + +/* + * drsas_free_dma_obj(struct drsas_instance *, dma_obj_t) + * + * De-allocate the memory and other resources for an dma object, which must + * have been alloated by a previous call to drsas_alloc_dma_obj() + */ +static int +drsas_free_dma_obj(struct drsas_instance *instance, dma_obj_t obj) +{ + + if (drsas_check_dma_handle(obj.dma_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + + if (drsas_check_acc_handle(obj.acc_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, 
DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + + (void) ddi_dma_unbind_handle(obj.dma_handle); + ddi_dma_mem_free(&obj.acc_handle); + ddi_dma_free_handle(&obj.dma_handle); + + return (DDI_SUCCESS); +} + +/* + * drsas_dma_alloc(instance_t *, struct scsi_pkt *, struct buf *, + * int, int (*)()) + * + * Allocate dma resources for a new scsi command + */ +static int +drsas_dma_alloc(struct drsas_instance *instance, struct scsi_pkt *pkt, + struct buf *bp, int flags, int (*callback)()) +{ + int dma_flags; + int (*cb)(caddr_t); + int i; + + ddi_dma_attr_t tmp_dma_attr = drsas_generic_dma_attr; + struct scsa_cmd *acmd = PKT2CMD(pkt); + + acmd->cmd_buf = bp; + + if (bp->b_flags & B_READ) { + acmd->cmd_flags &= ~CFLAG_DMASEND; + dma_flags = DDI_DMA_READ; + } else { + acmd->cmd_flags |= CFLAG_DMASEND; + dma_flags = DDI_DMA_WRITE; + } + + if (flags & PKT_CONSISTENT) { + acmd->cmd_flags |= CFLAG_CONSISTENT; + dma_flags |= DDI_DMA_CONSISTENT; + } + + if (flags & PKT_DMA_PARTIAL) { + dma_flags |= DDI_DMA_PARTIAL; + } + + dma_flags |= DDI_DMA_REDZONE; + + cb = (callback == NULL_FUNC) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP; + + tmp_dma_attr.dma_attr_sgllen = instance->max_num_sge; + tmp_dma_attr.dma_attr_addr_hi = 0xffffffffffffffffull; + + if ((i = ddi_dma_alloc_handle(instance->dip, &tmp_dma_attr, + cb, 0, &acmd->cmd_dmahandle)) != DDI_SUCCESS) { + switch (i) { + case DDI_DMA_BADATTR: + bioerror(bp, EFAULT); + return (DDI_FAILURE); + + case DDI_DMA_NORESOURCES: + bioerror(bp, 0); + return (DDI_FAILURE); + + default: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_alloc_handle: " + "impossible result (0x%x)", i)); + bioerror(bp, EFAULT); + return (DDI_FAILURE); + } + } + + i = ddi_dma_buf_bind_handle(acmd->cmd_dmahandle, bp, dma_flags, + cb, 0, &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies); + + switch (i) { + case DDI_DMA_PARTIAL_MAP: + if ((dma_flags & DDI_DMA_PARTIAL) == 0) { + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: " + "DDI_DMA_PARTIAL_MAP impossible")); + goto no_dma_cookies; + } + + if (ddi_dma_numwin(acmd->cmd_dmahandle, &acmd->cmd_nwin) == + DDI_FAILURE) { + con_log(CL_ANN, (CE_PANIC, "ddi_dma_numwin failed")); + goto no_dma_cookies; + } + + if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin, + &acmd->cmd_dma_offset, &acmd->cmd_dma_len, + &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) == + DDI_FAILURE) { + + con_log(CL_ANN, (CE_PANIC, "ddi_dma_getwin failed")); + goto no_dma_cookies; + } + + goto get_dma_cookies; + case DDI_DMA_MAPPED: + acmd->cmd_nwin = 1; + acmd->cmd_dma_len = 0; + acmd->cmd_dma_offset = 0; + +get_dma_cookies: + i = 0; + acmd->cmd_dmacount = 0; + for (;;) { + acmd->cmd_dmacount += + acmd->cmd_dmacookies[i++].dmac_size; + + if (i == instance->max_num_sge || + i == acmd->cmd_ncookies) + break; + + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[i]); + } + + acmd->cmd_cookie = i; + acmd->cmd_cookiecnt = i; + + acmd->cmd_flags |= CFLAG_DMAVALID; + + if (bp->b_bcount >= acmd->cmd_dmacount) { + pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount; + } else { + pkt->pkt_resid = 0; + } + + return (DDI_SUCCESS); + case DDI_DMA_NORESOURCES: + bioerror(bp, 0); + break; + case DDI_DMA_NOMAPPING: + bioerror(bp, EFAULT); + break; + case DDI_DMA_TOOBIG: + bioerror(bp, EINVAL); + break; + case DDI_DMA_INUSE: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle:" + " DDI_DMA_INUSE impossible")); + break; + default: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: " + "impossible result (0x%x)", i)); + break; + } + +no_dma_cookies: + 
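+	/*
+	 * Error unwind: release the DMA handle allocated above, clear
+	 * CFLAG_DMAVALID and fail the allocation.
+	 */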
ddi_dma_free_handle(&acmd->cmd_dmahandle); + acmd->cmd_dmahandle = NULL; + acmd->cmd_flags &= ~CFLAG_DMAVALID; + return (DDI_FAILURE); +} + +/* + * drsas_dma_move(struct drsas_instance *, struct scsi_pkt *, struct buf *) + * + * move dma resources to next dma window + * + */ +static int +drsas_dma_move(struct drsas_instance *instance, struct scsi_pkt *pkt, + struct buf *bp) +{ + int i = 0; + + struct scsa_cmd *acmd = PKT2CMD(pkt); + + /* + * If there are no more cookies remaining in this window, + * must move to the next window first. + */ + if (acmd->cmd_cookie == acmd->cmd_ncookies) { + if (acmd->cmd_curwin == acmd->cmd_nwin && acmd->cmd_nwin == 1) { + return (DDI_SUCCESS); + } + + /* at last window, cannot move */ + if (++acmd->cmd_curwin >= acmd->cmd_nwin) { + return (DDI_FAILURE); + } + + if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin, + &acmd->cmd_dma_offset, &acmd->cmd_dma_len, + &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) == + DDI_FAILURE) { + return (DDI_FAILURE); + } + + acmd->cmd_cookie = 0; + } else { + /* still more cookies in this window - get the next one */ + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[0]); + } + + /* get remaining cookies in this window, up to our maximum */ + for (;;) { + acmd->cmd_dmacount += acmd->cmd_dmacookies[i++].dmac_size; + acmd->cmd_cookie++; + + if (i == instance->max_num_sge || + acmd->cmd_cookie == acmd->cmd_ncookies) { + break; + } + + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[i]); + } + + acmd->cmd_cookiecnt = i; + + if (bp->b_bcount >= acmd->cmd_dmacount) { + pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount; + } else { + pkt->pkt_resid = 0; + } + + return (DDI_SUCCESS); +} + +/* + * build_cmd + */ +static struct drsas_cmd * +build_cmd(struct drsas_instance *instance, struct scsi_address *ap, + struct scsi_pkt *pkt, uchar_t *cmd_done) +{ + uint16_t flags = 0; + uint32_t i; + uint32_t context; + uint32_t sge_bytes; + ddi_acc_handle_t acc_handle; + struct drsas_cmd *cmd; + struct drsas_sge64 *mfi_sgl; + struct scsa_cmd *acmd = PKT2CMD(pkt); + struct drsas_pthru_frame *pthru; + struct drsas_io_frame *ldio; + + /* find out if this is logical or physical drive command. 
*/ + acmd->islogical = MRDRV_IS_LOGICAL(ap); + acmd->device_id = MAP_DEVICE_ID(instance, ap); + *cmd_done = 0; + + /* get the command packet */ + if (!(cmd = get_mfi_pkt(instance))) { + return (NULL); + } + + acc_handle = cmd->frame_dma_obj.acc_handle; + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(acc_handle, &cmd->frame->hdr.context, cmd->index); + + cmd->pkt = pkt; + cmd->cmd = acmd; + + /* lets get the command directions */ + if (acmd->cmd_flags & CFLAG_DMASEND) { + flags = MFI_FRAME_DIR_WRITE; + + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORDEV); + } + } else if (acmd->cmd_flags & ~CFLAG_DMASEND) { + flags = MFI_FRAME_DIR_READ; + + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } else { + flags = MFI_FRAME_DIR_NONE; + } + + flags |= MFI_FRAME_SGL64; + + switch (pkt->pkt_cdbp[0]) { + + /* + * case SCMD_SYNCHRONIZE_CACHE: + * flush_cache(instance); + * return_mfi_pkt(instance, cmd); + * *cmd_done = 1; + * + * return (NULL); + */ + + case SCMD_READ: + case SCMD_WRITE: + case SCMD_READ_G1: + case SCMD_WRITE_G1: + if (acmd->islogical) { + ldio = (struct drsas_io_frame *)cmd->frame; + + /* + * preare the Logical IO frame: + * 2nd bit is zero for all read cmds + */ + ddi_put8(acc_handle, &ldio->cmd, + (pkt->pkt_cdbp[0] & 0x02) ? MFI_CMD_OP_LD_WRITE + : MFI_CMD_OP_LD_READ); + ddi_put8(acc_handle, &ldio->cmd_status, 0x0); + ddi_put8(acc_handle, &ldio->scsi_status, 0x0); + ddi_put8(acc_handle, &ldio->target_id, acmd->device_id); + ddi_put16(acc_handle, &ldio->timeout, 0); + ddi_put8(acc_handle, &ldio->reserved_0, 0); + ddi_put16(acc_handle, &ldio->pad_0, 0); + ddi_put16(acc_handle, &ldio->flags, flags); + + /* Initialize sense Information */ + bzero(cmd->sense, SENSE_LENGTH); + ddi_put8(acc_handle, &ldio->sense_len, SENSE_LENGTH); + ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_hi, 0); + ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_lo, + cmd->sense_phys_addr); + ddi_put32(acc_handle, &ldio->start_lba_hi, 0); + ddi_put8(acc_handle, &ldio->access_byte, + (acmd->cmd_cdblen != 6) ? 
pkt->pkt_cdbp[1] : 0); + ddi_put8(acc_handle, &ldio->sge_count, + acmd->cmd_cookiecnt); + mfi_sgl = (struct drsas_sge64 *)&ldio->sgl; + + context = ddi_get32(acc_handle, &ldio->context); + + if (acmd->cmd_cdblen == CDB_GROUP0) { + ddi_put32(acc_handle, &ldio->lba_count, ( + (uint16_t)(pkt->pkt_cdbp[4]))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[3])) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 8) | + ((uint32_t)((pkt->pkt_cdbp[1]) & 0x1F) + << 16))); + } else if (acmd->cmd_cdblen == CDB_GROUP1) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[8])) | + ((uint16_t)(pkt->pkt_cdbp[7]) << 8))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } else if (acmd->cmd_cdblen == CDB_GROUP2) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[9])) | + ((uint16_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint16_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint16_t)(pkt->pkt_cdbp[6]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } else if (acmd->cmd_cdblen == CDB_GROUP3) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[13])) | + ((uint16_t)(pkt->pkt_cdbp[12]) << 8) | + ((uint16_t)(pkt->pkt_cdbp[11]) << 16) | + ((uint16_t)(pkt->pkt_cdbp[10]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[9])) | + ((uint32_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[6]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } + + break; + } + /* fall through For all non-rd/wr cmds */ + default: + + switch (pkt->pkt_cdbp[0]) { + case SCMD_MODE_SENSE: + case SCMD_MODE_SENSE_G1: { + union scsi_cdb *cdbp; + uint16_t page_code; + + cdbp = (void *)pkt->pkt_cdbp; + page_code = (uint16_t)cdbp->cdb_un.sg.scsi[0]; + switch (page_code) { + case 0x3: + case 0x4: + (void) drsas_mode_sense_build(pkt); + return_mfi_pkt(instance, cmd); + *cmd_done = 1; + return (NULL); + } + break; + } + default: + break; + } + + pthru = (struct drsas_pthru_frame *)cmd->frame; + + /* prepare the DCDB frame */ + ddi_put8(acc_handle, &pthru->cmd, (acmd->islogical) ? 
+ MFI_CMD_OP_LD_SCSI : MFI_CMD_OP_PD_SCSI); + ddi_put8(acc_handle, &pthru->cmd_status, 0x0); + ddi_put8(acc_handle, &pthru->scsi_status, 0x0); + ddi_put8(acc_handle, &pthru->target_id, acmd->device_id); + ddi_put8(acc_handle, &pthru->lun, 0); + ddi_put8(acc_handle, &pthru->cdb_len, acmd->cmd_cdblen); + ddi_put16(acc_handle, &pthru->timeout, 0); + ddi_put16(acc_handle, &pthru->flags, flags); + ddi_put32(acc_handle, &pthru->data_xfer_len, + acmd->cmd_dmacount); + ddi_put8(acc_handle, &pthru->sge_count, acmd->cmd_cookiecnt); + mfi_sgl = (struct drsas_sge64 *)&pthru->sgl; + + bzero(cmd->sense, SENSE_LENGTH); + ddi_put8(acc_handle, &pthru->sense_len, SENSE_LENGTH); + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, + cmd->sense_phys_addr); + + context = ddi_get32(acc_handle, &pthru->context); + ddi_rep_put8(acc_handle, (uint8_t *)pkt->pkt_cdbp, + (uint8_t *)pthru->cdb, acmd->cmd_cdblen, DDI_DEV_AUTOINCR); + + break; + } +#ifdef lint + context = context; +#endif + /* prepare the scatter-gather list for the firmware */ + for (i = 0; i < acmd->cmd_cookiecnt; i++, mfi_sgl++) { + ddi_put64(acc_handle, &mfi_sgl->phys_addr, + acmd->cmd_dmacookies[i].dmac_laddress); + ddi_put32(acc_handle, &mfi_sgl->length, + acmd->cmd_dmacookies[i].dmac_size); + } + + sge_bytes = sizeof (struct drsas_sge64)*acmd->cmd_cookiecnt; + + cmd->frame_count = (sge_bytes / MRMFI_FRAME_SIZE) + + ((sge_bytes % MRMFI_FRAME_SIZE) ? 1 : 0) + 1; + + if (cmd->frame_count >= 8) { + cmd->frame_count = 8; + } + + return (cmd); +} + +/* + * issue_mfi_pthru + */ +static int +issue_mfi_pthru(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *ubuf; + uint32_t kphys_addr = 0; + uint32_t xferlen = 0; + uint_t model; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + dma_obj_t pthru_dma_obj; + struct drsas_pthru_frame *kpthru; + struct drsas_pthru_frame *pthru; + int i; + pthru = &cmd->frame->pthru; + kpthru = (struct drsas_pthru_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + + xferlen = kpthru->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + xferlen = kpthru->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP64")); + xferlen = kpthru->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kpthru->sgl.sge64[0].phys_addr; +#endif + } + + if (xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + pthru_dma_obj.size = xferlen; + pthru_dma_obj.dma_attr = drsas_generic_dma_attr; + pthru_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + pthru_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + pthru_dma_obj.dma_attr.dma_attr_sgllen = 1; + pthru_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &pthru_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_pthru: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + if (kpthru->flags & MFI_FRAME_DIR_WRITE) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyin((uint8_t *)ubuf+i, + (uint8_t 
*)pthru_dma_obj.buffer+i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + kphys_addr = pthru_dma_obj.dma_cookie[0].dmac_address; + } + + ddi_put8(acc_handle, &pthru->cmd, kpthru->cmd); + ddi_put8(acc_handle, &pthru->sense_len, kpthru->sense_len); + ddi_put8(acc_handle, &pthru->cmd_status, 0); + ddi_put8(acc_handle, &pthru->scsi_status, 0); + ddi_put8(acc_handle, &pthru->target_id, kpthru->target_id); + ddi_put8(acc_handle, &pthru->lun, kpthru->lun); + ddi_put8(acc_handle, &pthru->cdb_len, kpthru->cdb_len); + ddi_put8(acc_handle, &pthru->sge_count, kpthru->sge_count); + ddi_put16(acc_handle, &pthru->timeout, kpthru->timeout); + ddi_put32(acc_handle, &pthru->data_xfer_len, kpthru->data_xfer_len); + + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); + /* pthru->sense_buf_phys_addr_lo = cmd->sense_phys_addr; */ + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, 0); + + ddi_rep_put8(acc_handle, (uint8_t *)kpthru->cdb, (uint8_t *)pthru->cdb, + pthru->cdb_len, DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &pthru->flags, kpthru->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &pthru->sgl.sge32[0].length, xferlen); + ddi_put32(acc_handle, &pthru->sgl.sge32[0].phys_addr, kphys_addr); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru: fw_ioctl failed")); + } else { + if (xferlen && kpthru->flags & MFI_FRAME_DIR_READ) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyout( + (uint8_t *)pthru_dma_obj.buffer+i, + (uint8_t *)ubuf+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy to user space failed")); + return (DDI_FAILURE); + } + } + } + } + + kpthru->cmd_status = ddi_get8(acc_handle, &pthru->cmd_status); + kpthru->scsi_status = ddi_get8(acc_handle, &pthru->scsi_status); + + con_log(CL_ANN, (CE_NOTE, "issue_mfi_pthru: cmd_status %x, " + "scsi_status %x", kpthru->cmd_status, kpthru->scsi_status)); + + if (xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, pthru_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_dcmd + */ +static int +issue_mfi_dcmd(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *ubuf; + uint32_t kphys_addr = 0; + uint32_t xferlen = 0; + uint32_t model; + dma_obj_t dcmd_dma_obj; + struct drsas_dcmd_frame *kdcmd; + struct drsas_dcmd_frame *dcmd; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + int i; + dcmd = &cmd->frame->dcmd; + kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + + xferlen = kdcmd->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + xferlen = kdcmd->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_LP64")); + xferlen = kdcmd->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; +#endif + } + if (xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + dcmd_dma_obj.size = xferlen; + dcmd_dma_obj.dma_attr = drsas_generic_dma_attr; + 
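+		/*
+		 * Constrain the bounce buffer to a single segment below
+		 * 4GB so that its cookie fits in the 32-bit SGE that is
+		 * programmed into the frame further down.
+		 */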
dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + if (kdcmd->flags & MFI_FRAME_DIR_WRITE) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyin((uint8_t *)ubuf + i, + (uint8_t *)dcmd_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_dcmd : " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + kphys_addr = dcmd_dma_obj.dma_cookie[0].dmac_address; + } + + ddi_put8(acc_handle, &dcmd->cmd, kdcmd->cmd); + ddi_put8(acc_handle, &dcmd->cmd_status, 0); + ddi_put8(acc_handle, &dcmd->sge_count, kdcmd->sge_count); + ddi_put16(acc_handle, &dcmd->timeout, kdcmd->timeout); + ddi_put32(acc_handle, &dcmd->data_xfer_len, kdcmd->data_xfer_len); + ddi_put32(acc_handle, &dcmd->opcode, kdcmd->opcode); + + ddi_rep_put8(acc_handle, (uint8_t *)kdcmd->mbox.b, + (uint8_t *)dcmd->mbox.b, DCMD_MBOX_SZ, DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &dcmd->flags, kdcmd->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &dcmd->sgl.sge32[0].length, xferlen); + ddi_put32(acc_handle, &dcmd->sgl.sge32[0].phys_addr, kphys_addr); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: fw_ioctl failed")); + } else { + if (xferlen && (kdcmd->flags & MFI_FRAME_DIR_READ)) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyout( + (uint8_t *)dcmd_dma_obj.buffer + i, + (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_dcmd : " + "copy to user space failed")); + return (DDI_FAILURE); + } + } + } + } + + kdcmd->cmd_status = ddi_get8(acc_handle, &dcmd->cmd_status); + + if (xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_smp + */ +static int +issue_mfi_smp(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *request_ubuf; + void *response_ubuf; + uint32_t request_xferlen = 0; + uint32_t response_xferlen = 0; + uint_t model; + dma_obj_t request_dma_obj; + dma_obj_t response_dma_obj; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + struct drsas_smp_frame *ksmp; + struct drsas_smp_frame *smp; + struct drsas_sge32 *sge32; +#ifndef _ILP32 + struct drsas_sge64 *sge64; +#endif + int i; + uint64_t tmp_sas_addr; + + smp = &cmd->frame->smp; + ksmp = (struct drsas_smp_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + + sge32 = &ksmp->sgl[0].sge32[0]; + response_xferlen = sge32[0].length; + request_xferlen = sge32[1].length; + con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + "response_xferlen = %x, request_xferlen = %x", + response_xferlen, request_xferlen)); + + response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + "response_ubuf = %p, request_ubuf = %p", + response_ubuf, 
request_ubuf)); + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + + sge32 = &ksmp->sgl[0].sge32[0]; + response_xferlen = sge32[0].length; + request_xferlen = sge32[1].length; + con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + "response_xferlen = %x, request_xferlen = %x", + response_xferlen, request_xferlen)); + + response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + "response_ubuf = %p, request_ubuf = %p", + response_ubuf, request_ubuf)); +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_LP64")); + + sge64 = &ksmp->sgl[0].sge64[0]; + response_xferlen = sge64[0].length; + request_xferlen = sge64[1].length; + + response_ubuf = (void *)(ulong_t)sge64[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge64[1].phys_addr; +#endif + } + if (request_xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + request_dma_obj.size = request_xferlen; + request_dma_obj.dma_attr = drsas_generic_dma_attr; + request_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + request_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + request_dma_obj.dma_attr.dma_attr_sgllen = 1; + request_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &request_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < request_xferlen; i++) { + if (ddi_copyin((uint8_t *)request_ubuf + i, + (uint8_t *)request_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + if (response_xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + response_dma_obj.size = response_xferlen; + response_dma_obj.dma_attr = drsas_generic_dma_attr; + response_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + response_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + response_dma_obj.dma_attr.dma_attr_sgllen = 1; + response_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &response_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < response_xferlen; i++) { + if (ddi_copyin((uint8_t *)response_ubuf + i, + (uint8_t *)response_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + ddi_put8(acc_handle, &smp->cmd, ksmp->cmd); + ddi_put8(acc_handle, &smp->cmd_status, 0); + ddi_put8(acc_handle, &smp->connection_status, 0); + ddi_put8(acc_handle, &smp->sge_count, ksmp->sge_count); + /* smp->context = ksmp->context; */ + ddi_put16(acc_handle, &smp->timeout, ksmp->timeout); + ddi_put32(acc_handle, &smp->data_xfer_len, ksmp->data_xfer_len); + + bcopy((void *)&ksmp->sas_addr, (void *)&tmp_sas_addr, + sizeof (uint64_t)); + ddi_put64(acc_handle, &smp->sas_addr, tmp_sas_addr); + + ddi_put16(acc_handle, &smp->flags, ksmp->flags & ~MFI_FRAME_SGL64); + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { 
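+		/*
+		 * 32-bit model: SGE 0 describes the response buffer and
+		 * SGE 1 the request buffer, both as 32-bit SGEs.
+		 */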
+ con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + + sge32 = &smp->sgl[0].sge32[0]; + ddi_put32(acc_handle, &sge32[0].length, response_xferlen); + ddi_put32(acc_handle, &sge32[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge32[1].length, request_xferlen); + ddi_put32(acc_handle, &sge32[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + sge32 = &smp->sgl[0].sge32[0]; + ddi_put32(acc_handle, &sge32[0].length, response_xferlen); + ddi_put32(acc_handle, &sge32[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge32[1].length, request_xferlen); + ddi_put32(acc_handle, &sge32[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); +#else + con_log(CL_ANN1, (CE_NOTE, + "issue_mfi_smp: DDI_MODEL_LP64")); + sge64 = &smp->sgl[0].sge64[0]; + ddi_put32(acc_handle, &sge64[0].length, response_xferlen); + ddi_put64(acc_handle, &sge64[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge64[1].length, request_xferlen); + ddi_put64(acc_handle, &sge64[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); +#endif + } + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp : " + "smp->response_xferlen = %d, smp->request_xferlen = %d " + "smp->data_xfer_len = %d", ddi_get32(acc_handle, &sge32[0].length), + ddi_get32(acc_handle, &sge32[1].length), + ddi_get32(acc_handle, &smp->data_xfer_len))); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp: fw_ioctl failed")); + } else { + con_log(CL_ANN1, (CE_NOTE, + "issue_mfi_smp: copy to user space")); + + if (request_xferlen) { + for (i = 0; i < request_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)request_dma_obj.buffer + + i, (uint8_t *)request_ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp : copy to user space" + " failed")); + return (DDI_FAILURE); + } + } + } + + if (response_xferlen) { + for (i = 0; i < response_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)response_dma_obj.buffer + + i, (uint8_t *)response_ubuf + + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp : copy to " + "user space failed")); + return (DDI_FAILURE); + } + } + } + } + + ksmp->cmd_status = ddi_get8(acc_handle, &smp->cmd_status); + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: smp->cmd_status = %d", + ddi_get8(acc_handle, &smp->cmd_status))); + + + if (request_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, request_dma_obj) != + DDI_SUCCESS) + return (DDI_FAILURE); + } + + if (response_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, response_dma_obj) != + DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_stp + */ +static int +issue_mfi_stp(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *fis_ubuf; + void *data_ubuf; + uint32_t fis_xferlen = 0; + uint32_t data_xferlen = 0; + uint_t model; + dma_obj_t fis_dma_obj; + dma_obj_t data_dma_obj; + struct drsas_stp_frame *kstp; + struct drsas_stp_frame *stp; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + int i; + + stp = &cmd->frame->stp; + kstp = (struct drsas_stp_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, 
(CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + + fis_xferlen = kstp->sgl.sge32[0].length; + data_xferlen = kstp->sgl.sge32[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; + } + else + { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + + fis_xferlen = kstp->sgl.sge32[0].length; + data_xferlen = kstp->sgl.sge32[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_LP64")); + + fis_xferlen = kstp->sgl.sge64[0].length; + data_xferlen = kstp->sgl.sge64[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge64[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge64[1].phys_addr; +#endif + } + + + if (fis_xferlen) { + con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: " + "fis_ubuf = %p fis_xferlen = %x", fis_ubuf, fis_xferlen)); + + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + fis_dma_obj.size = fis_xferlen; + fis_dma_obj.dma_attr = drsas_generic_dma_attr; + fis_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + fis_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + fis_dma_obj.dma_attr.dma_attr_sgllen = 1; + fis_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &fis_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp : " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < fis_xferlen; i++) { + if (ddi_copyin((uint8_t *)fis_ubuf + i, + (uint8_t *)fis_dma_obj.buffer + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + if (data_xferlen) { + con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: data_ubuf = %p " + "data_xferlen = %x", data_ubuf, data_xferlen)); + + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + data_dma_obj.size = data_xferlen; + data_dma_obj.dma_attr = drsas_generic_dma_attr; + data_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + data_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + data_dma_obj.dma_attr.dma_attr_sgllen = 1; + data_dma_obj.dma_attr.dma_attr_align = 1; + +/* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &data_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < data_xferlen; i++) { + if (ddi_copyin((uint8_t *)data_ubuf + i, + (uint8_t *)data_dma_obj.buffer + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + ddi_put8(acc_handle, &stp->cmd, kstp->cmd); + ddi_put8(acc_handle, &stp->cmd_status, 0); + ddi_put8(acc_handle, &stp->connection_status, 0); + ddi_put8(acc_handle, &stp->target_id, kstp->target_id); + ddi_put8(acc_handle, &stp->sge_count, kstp->sge_count); + + ddi_put16(acc_handle, &stp->timeout, kstp->timeout); + ddi_put32(acc_handle, &stp->data_xfer_len, kstp->data_xfer_len); + + ddi_rep_put8(acc_handle, (uint8_t *)kstp->fis, (uint8_t *)stp->fis, 10, + DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &stp->flags, kstp->flags & ~MFI_FRAME_SGL64); + 
ddi_put32(acc_handle, &stp->stp_flags, kstp->stp_flags); + ddi_put32(acc_handle, &stp->sgl.sge32[0].length, fis_xferlen); + ddi_put32(acc_handle, &stp->sgl.sge32[0].phys_addr, + fis_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &stp->sgl.sge32[1].length, data_xferlen); + ddi_put32(acc_handle, &stp->sgl.sge32[1].phys_addr, + data_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: fw_ioctl failed")); + } else { + + if (fis_xferlen) { + for (i = 0; i < fis_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)fis_dma_obj.buffer + i, + (uint8_t *)fis_ubuf + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_stp : copy to " + "user space failed")); + return (DDI_FAILURE); + } + } + } + } + if (data_xferlen) { + for (i = 0; i < data_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)data_dma_obj.buffer + i, + (uint8_t *)data_ubuf + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_stp : copy to" + " user space failed")); + return (DDI_FAILURE); + } + } + } + + kstp->cmd_status = ddi_get8(acc_handle, &stp->cmd_status); + + if (fis_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, fis_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + if (data_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, data_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * fill_up_drv_ver + */ +static void +fill_up_drv_ver(struct drsas_drv_ver *dv) +{ + (void) memset(dv, 0, sizeof (struct drsas_drv_ver)); + + (void) memcpy(dv->signature, "$LSI LOGIC$", strlen("$LSI LOGIC$")); + (void) memcpy(dv->os_name, "Solaris", strlen("Solaris")); + (void) memcpy(dv->drv_name, "dr_sas", strlen("dr_sas")); + (void) memcpy(dv->drv_ver, DRSAS_VERSION, strlen(DRSAS_VERSION)); + (void) memcpy(dv->drv_rel_date, DRSAS_RELDATE, + strlen(DRSAS_RELDATE)); +} + +/* + * handle_drv_ioctl + */ +static int +handle_drv_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + int mode) +{ + int i; + int rval = DDI_SUCCESS; + int *props = NULL; + void *ubuf; + + uint8_t *pci_conf_buf; + uint32_t xferlen; + uint32_t num_props; + uint_t model; + struct drsas_dcmd_frame *kdcmd; + struct drsas_drv_ver dv; + struct drsas_pci_information pi; + + kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + + xferlen = kdcmd->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + xferlen = kdcmd->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_LP64")); + xferlen = kdcmd->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; +#endif + } + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "dataBuf=%p size=%d bytes", ubuf, xferlen)); + + switch (kdcmd->opcode) { + case DRSAS_DRIVER_IOCTL_DRIVER_VERSION: + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_DRIVER_VERSION")); + + fill_up_drv_ver(&dv); + for (i = 0; i < xferlen; i++) { + if (ddi_copyout((uint8_t *)&dv + i, (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + 
"DRSAS_DRIVER_IOCTL_DRIVER_VERSION" + " : copy to user space failed")); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + } + if (i == xferlen) + kdcmd->cmd_status = 0; + break; + case DRSAS_DRIVER_IOCTL_PCI_INFORMATION: + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMAITON")); + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, instance->dip, + 0, "reg", &props, &num_props)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMATION : " + "ddi_prop_look_int_array failed")); + rval = DDI_FAILURE; + } else { + + pi.busNumber = (props[0] >> 16) & 0xFF; + pi.deviceNumber = (props[0] >> 11) & 0x1f; + pi.functionNumber = (props[0] >> 8) & 0x7; + ddi_prop_free((void *)props); + } + + pci_conf_buf = (uint8_t *)&pi.pciHeaderInfo; + + for (i = 0; i < (sizeof (struct drsas_pci_information) - + offsetof(struct drsas_pci_information, pciHeaderInfo)); + i++) { + pci_conf_buf[i] = + pci_config_get8(instance->pci_handle, i); + } + for (i = 0; i < xferlen; i++) { + if (ddi_copyout((uint8_t *)&pi + i, (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMATION" + " : copy to user space failed")); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + } + + if (i == xferlen) + kdcmd->cmd_status = 0; + + break; + default: + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "invalid driver specific IOCTL opcode = 0x%x", + kdcmd->opcode)); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + + return (rval); +} + +/* + * handle_mfi_ioctl + */ +static int +handle_mfi_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + int mode) +{ + int rval = DDI_SUCCESS; + + struct drsas_header *hdr; + struct drsas_cmd *cmd; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, "dr_sas: " + "failed to get a cmd packet")); + return (DDI_FAILURE); + } + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + hdr = (struct drsas_header *)&ioctl->frame[0]; + + switch (hdr->cmd) { + case MFI_CMD_OP_DCMD: + rval = issue_mfi_dcmd(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_SMP: + rval = issue_mfi_smp(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_STP: + rval = issue_mfi_stp(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_LD_SCSI: + case MFI_CMD_OP_PD_SCSI: + rval = issue_mfi_pthru(instance, ioctl, cmd, mode); + break; + default: + con_log(CL_ANN, (CE_WARN, "handle_mfi_ioctl: " + "invalid mfi ioctl hdr->cmd = %d", hdr->cmd)); + rval = DDI_FAILURE; + break; + } + + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) + rval = DDI_FAILURE; + return (rval); +} + +/* + * AEN + */ +static int +handle_mfi_aen(struct drsas_instance *instance, struct drsas_aen *aen) +{ + int rval = 0; + + rval = register_mfi_aen(instance, instance->aen_seq_num, + aen->class_locale_word); + + aen->cmd_status = (uint8_t)rval; + + return (rval); +} + +static int +register_mfi_aen(struct drsas_instance *instance, uint32_t seq_num, + uint32_t class_locale_word) +{ + int ret_val; + + struct drsas_cmd *cmd, *aen_cmd; + struct drsas_dcmd_frame *dcmd; + union drsas_evt_class_locale curr_aen; + union drsas_evt_class_locale prev_aen; + + /* + * If there an AEN pending already (aen_cmd), check if the + * class_locale of that pending AEN is inclusive of the 
new
+ * AEN request we currently have. If it is, then there is nothing
+ * to do; whichever events the current AEN request subscribes to
+ * have already been subscribed to.
+ *
+ * If the old command is _not_ inclusive, we have to abort it,
+ * form a class_locale that is a superset of both the old and the
+ * current one, and re-issue the registration to the FW.
+ */
+
+    curr_aen.word = class_locale_word;
+    aen_cmd = instance->aen_cmd;
+    if (aen_cmd) {
+        prev_aen.word = ddi_get32(aen_cmd->frame_dma_obj.acc_handle,
+            &aen_cmd->frame->dcmd.mbox.w[1]);
+
+        /*
+         * A class whose enum value is smaller is inclusive of all
+         * higher values. If a PROGRESS (= -1) was previously
+         * registered, then new registration requests for higher
+         * classes need not be sent to the FW; they are automatically
+         * included.
+         *
+         * Locale numbers don't have such a hierarchy; they are
+         * bitmap values.
+         */
+        if ((prev_aen.members.class <= curr_aen.members.class) &&
+            !((prev_aen.members.locale & curr_aen.members.locale) ^
+            curr_aen.members.locale)) {
+            /*
+             * Previously issued event registration includes
+             * current request. Nothing to do.
+             */
+
+            return (0);
+        } else {
+            curr_aen.members.locale |= prev_aen.members.locale;
+
+            if (prev_aen.members.class < curr_aen.members.class)
+                curr_aen.members.class = prev_aen.members.class;
+
+            ret_val = abort_aen_cmd(instance, aen_cmd);
+
+            if (ret_val) {
+                con_log(CL_ANN, (CE_WARN, "register_mfi_aen: "
+                    "failed to abort previous AEN command"));
+
+                return (ret_val);
+            }
+        }
+    } else {
+        curr_aen.word = class_locale_word;
+    }
+
+    cmd = get_mfi_pkt(instance);
+
+    if (!cmd)
+        return (ENOMEM);
+    /* Clear the frame buffer and assign back the context id */
+    (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+    ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+        cmd->index);
+
+    dcmd = &cmd->frame->dcmd;
+
+    /* for(i = 0; i < DCMD_MBOX_SZ; i++) dcmd->mbox.b[i] = 0; */
+    (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ);
+
+    (void) memset(instance->mfi_evt_detail_obj.buffer, 0,
+        sizeof (struct drsas_evt_detail));
+
+    /* Prepare DCMD for aen registration */
+    ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD);
+    ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0);
+    ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1);
+    ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags,
+        MFI_FRAME_DIR_READ);
+    ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0);
+    ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len,
+        sizeof (struct drsas_evt_detail));
+    ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode,
+        DR_DCMD_CTRL_EVENT_WAIT);
+    ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[0], seq_num);
+    ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[1],
+        curr_aen.word);
+    ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr,
+        instance->mfi_evt_detail_obj.dma_cookie[0].dmac_address);
+    ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length,
+        sizeof (struct drsas_evt_detail));
+
+    instance->aen_seq_num = seq_num;
+
+
+    /*
+     * Store reference to the cmd used to register for AEN.
When an + * application wants us to register for AEN, we have to abort this + * cmd and re-register with a new EVENT LOCALE supplied by that app + */ + instance->aen_cmd = cmd; + + cmd->frame_count = 1; + + /* Issue the aen registration frame */ + /* atomic_add_16 (&instance->fw_outstanding, 1); */ + instance->func_ptr->issue_cmd(cmd, instance); + + return (0); +} + +static void +display_scsi_inquiry(caddr_t scsi_inq) +{ +#define MAX_SCSI_DEVICE_CODE 14 + int i; + char inquiry_buf[256] = {0}; + int len; + const char *const scsi_device_types[] = { + "Direct-Access ", + "Sequential-Access", + "Printer ", + "Processor ", + "WORM ", + "CD-ROM ", + "Scanner ", + "Optical Device ", + "Medium Changer ", + "Communications ", + "Unknown ", + "Unknown ", + "Unknown ", + "Enclosure ", + }; + + len = 0; + + len += snprintf(inquiry_buf + len, 265 - len, " Vendor: "); + for (i = 8; i < 16; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, " Model: "); + + for (i = 16; i < 32; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, " Rev: "); + + for (i = 32; i < 36; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, "\n"); + + + i = scsi_inq[0] & 0x1f; + + + len += snprintf(inquiry_buf + len, 265 - len, " Type: %s ", + i < MAX_SCSI_DEVICE_CODE ? scsi_device_types[i] : + "Unknown "); + + + len += snprintf(inquiry_buf + len, 265 - len, + " ANSI SCSI revision: %02x", scsi_inq[2] & 0x07); + + if ((scsi_inq[2] & 0x07) == 1 && (scsi_inq[3] & 0x0f) == 1) { + len += snprintf(inquiry_buf + len, 265 - len, " CCS\n"); + } else { + len += snprintf(inquiry_buf + len, 265 - len, "\n"); + } + + con_log(CL_ANN1, (CE_CONT, inquiry_buf)); +} + +static int +read_fw_status_reg_ppc(struct drsas_instance *instance) +{ + return ((int)RD_OB_SCRATCH_PAD_0(instance)); +} + +static void +issue_cmd_ppc(struct drsas_cmd *cmd, struct drsas_instance *instance) +{ + atomic_add_16(&instance->fw_outstanding, 1); + + /* Issue the command to the FW */ + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); +} + +/* + * issue_cmd_in_sync_mode + */ +static int +issue_cmd_in_sync_mode_ppc(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int i; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * (10 * MILLISEC); + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: called")); + + cmd->cmd_status = ENODATA; + + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); + + mutex_enter(&instance->int_cmd_mtx); + + for (i = 0; i < msecs && (cmd->cmd_status == ENODATA); i++) { + cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx); + } + + mutex_exit(&instance->int_cmd_mtx); + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: done")); + + if (i < (msecs -1)) { + return (DDI_SUCCESS); + } else { + return (DDI_FAILURE); + } +} + +/* + * issue_cmd_in_poll_mode + */ +static int +issue_cmd_in_poll_mode_ppc(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int i; + uint16_t flags; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * MILLISEC; + struct drsas_header *frame_hdr; + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_poll_mode_ppc: called")); + + frame_hdr = (struct drsas_header *)cmd->frame; + ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + flags = 
ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags); + flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; + + ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags); + + /* issue the frame using inbound queue port */ + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); + + /* wait for cmd_status to change from 0xFF */ + for (i = 0; i < msecs && ( + ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE); i++) { + drv_usecwait(MILLISEC); /* wait for 1000 usecs */ + } + + if (ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE) { + con_log(CL_ANN, (CE_NOTE, "issue_cmd_in_poll_mode: " + "cmd polling timed out")); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static void +enable_intr_ppc(struct drsas_instance *instance) +{ + uint32_t mask; + + con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: called")); + + /* WR_OB_DOORBELL_CLEAR(0xFFFFFFFF, instance); */ + WR_OB_DOORBELL_CLEAR(OB_DOORBELL_CLEAR_MASK, instance); + + /* WR_OB_INTR_MASK(~0x80000000, instance); */ + WR_OB_INTR_MASK(~(MFI_REPLY_2108_MESSAGE_INTR_MASK), instance); + + /* dummy read to force PCI flush */ + mask = RD_OB_INTR_MASK(instance); + + con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: " + "outbound_intr_mask = 0x%x", mask)); +} + +static void +disable_intr_ppc(struct drsas_instance *instance) +{ + uint32_t mask; + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: called")); + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: before : " + "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance))); + + /* WR_OB_INTR_MASK(0xFFFFFFFF, instance); */ + WR_OB_INTR_MASK(OB_INTR_MASK, instance); + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: after : " + "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance))); + + /* dummy read to force PCI flush */ + mask = RD_OB_INTR_MASK(instance); +#ifdef lint + mask = mask; +#endif +} + +static int +intr_ack_ppc(struct drsas_instance *instance) +{ + uint32_t status; + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: called")); + + /* check if it is our interrupt */ + status = RD_OB_INTR_STATUS(instance); + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: status = 0x%x", status)); + + if (!(status & MFI_REPLY_2108_MESSAGE_INTR)) { + return (DDI_INTR_UNCLAIMED); + } + + /* clear the interrupt by writing back the same value */ + WR_OB_DOORBELL_CLEAR(status, instance); + + /* dummy READ */ + status = RD_OB_INTR_STATUS(instance); + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: interrupt cleared")); + + return (DDI_INTR_CLAIMED); +} + +static int +drsas_common_check(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int ret = DDI_SUCCESS; + + if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_dma_handle(instance->mfi_evt_detail_obj.dma_handle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if 
(drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + + ddi_fm_acc_err_clear(instance->regmap_handle, DDI_FME_VER0); + + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + + return (ret); +} + +/*ARGSUSED*/ +static int +drsas_fm_error_cb(dev_info_t *dip, ddi_fm_error_t *err, const void *impl_data) +{ + /* + * as the driver can always deal with an error in any dma or + * access handle, we can just return the fme_status value. + */ + pci_ereport_post(dip, err, NULL); + return (err->fme_status); +} + +static void +drsas_fm_init(struct drsas_instance *instance) +{ + /* Need to change iblock to priority for new MSI intr */ + ddi_iblock_cookie_t fm_ibc; + + /* Only register with IO Fault Services if we have some capability */ + if (instance->fm_capabilities) { + /* Adjust access and dma attributes for FMA */ + endian_attr.devacc_attr_access = DDI_FLAGERR_ACC; + drsas_generic_dma_attr.dma_attr_flags = DDI_DMA_FLAGERR; + + /* + * Register capabilities with IO Fault Services. + * fm_capabilities will be updated to indicate + * capabilities actually supported (not requested.) + */ + + ddi_fm_init(instance->dip, &instance->fm_capabilities, &fm_ibc); + + /* + * Initialize pci ereport capabilities if ereport + * capable (should always be.) + */ + + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) || + DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + pci_ereport_setup(instance->dip); + } + + /* + * Register error callback if error callback capable. + */ + if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + ddi_fm_handler_register(instance->dip, + drsas_fm_error_cb, (void*) instance); + } + } else { + endian_attr.devacc_attr_access = DDI_DEFAULT_ACC; + drsas_generic_dma_attr.dma_attr_flags = 0; + } +} + +static void +drsas_fm_fini(struct drsas_instance *instance) +{ + /* Only unregister FMA capabilities if registered */ + if (instance->fm_capabilities) { + /* + * Un-register error callback if error callback capable. 
+ */ + if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + ddi_fm_handler_unregister(instance->dip); + } + + /* + * Release any resources allocated by pci_ereport_setup() + */ + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) || + DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + pci_ereport_teardown(instance->dip); + } + + /* Unregister from IO Fault Services */ + ddi_fm_fini(instance->dip); + + /* Adjust access and dma attributes for FMA */ + endian_attr.devacc_attr_access = DDI_DEFAULT_ACC; + drsas_generic_dma_attr.dma_attr_flags = 0; + } +} + +int +drsas_check_acc_handle(ddi_acc_handle_t handle) +{ + ddi_fm_error_t de; + + if (handle == NULL) { + return (DDI_FAILURE); + } + + ddi_fm_acc_err_get(handle, &de, DDI_FME_VERSION); + + return (de.fme_status); +} + +int +drsas_check_dma_handle(ddi_dma_handle_t handle) +{ + ddi_fm_error_t de; + + if (handle == NULL) { + return (DDI_FAILURE); + } + + ddi_fm_dma_err_get(handle, &de, DDI_FME_VERSION); + + return (de.fme_status); +} + +void +drsas_fm_ereport(struct drsas_instance *instance, char *detail) +{ + uint64_t ena; + char buf[FM_MAX_CLASS]; + + (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); + ena = fm_ena_generate(0, FM_ENA_FMT1); + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities)) { + ddi_fm_ereport_post(instance->dip, buf, ena, DDI_NOSLEEP, + FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERSION, NULL); + } +} + +static int +drsas_add_intrs(struct drsas_instance *instance, int intr_type) +{ + + dev_info_t *dip = instance->dip; + int avail, actual, count; + int i, flag, ret; + + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: intr_type = %x", + intr_type)); + + /* Get number of interrupts */ + ret = ddi_intr_get_nintrs(dip, intr_type, &count); + if ((ret != DDI_SUCCESS) || (count == 0)) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_nintrs() failed:" + "ret %d count %d", ret, count)); + + return (DDI_FAILURE); + } + + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: count = %d ", count)); + + /* Get number of available interrupts */ + ret = ddi_intr_get_navail(dip, intr_type, &avail); + if ((ret != DDI_SUCCESS) || (avail == 0)) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_navail() failed:" + "ret %d avail %d", ret, avail)); + + return (DDI_FAILURE); + } + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: avail = %d ", avail)); + + /* Only one interrupt routine. So limit the count to 1 */ + if (count > 1) { + count = 1; + } + + /* + * Allocate an array of interrupt handlers. Currently we support + * only one interrupt. The framework can be extended later. + */ + instance->intr_size = count * sizeof (ddi_intr_handle_t); + instance->intr_htable = kmem_zalloc(instance->intr_size, KM_SLEEP); + ASSERT(instance->intr_htable); + + flag = ((intr_type == DDI_INTR_TYPE_MSI) || (intr_type == + DDI_INTR_TYPE_MSIX)) ? DDI_INTR_ALLOC_STRICT:DDI_INTR_ALLOC_NORMAL; + + /* Allocate interrupt */ + ret = ddi_intr_alloc(dip, instance->intr_htable, intr_type, 0, + count, &actual, flag); + + if ((ret != DDI_SUCCESS) || (actual == 0)) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "avail = %d", avail)); + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + if (actual < count) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "Requested = %d Received = %d", count, actual)); + } + instance->intr_cnt = actual; + + /* + * Get the priority of the interrupt allocated. 
+ */ + if ((ret = ddi_intr_get_pri(instance->intr_htable[0], + &instance->intr_pri)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "get priority call failed")); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + /* + * Test for high level mutex. we don't support them. + */ + if (instance->intr_pri >= ddi_intr_get_hilevel_pri()) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "High level interrupts not supported.")); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + con_log(CL_DLEVEL1, (CE_NOTE, "drsas_add_intrs: intr_pri = 0x%x ", + instance->intr_pri)); + + /* Call ddi_intr_add_handler() */ + for (i = 0; i < actual; i++) { + ret = ddi_intr_add_handler(instance->intr_htable[i], + (ddi_intr_handler_t *)drsas_isr, (caddr_t)instance, + (caddr_t)(uintptr_t)i); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs:" + "failed %d", ret)); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + } + + con_log(CL_DLEVEL1, (CE_WARN, " ddi_intr_add_handler done")); + + if ((ret = ddi_intr_get_cap(instance->intr_htable[0], + &instance->intr_cap)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_cap() failed %d", + ret)); + + /* Free already allocated intr */ + for (i = 0; i < actual; i++) { + (void) ddi_intr_remove_handler( + instance->intr_htable[i]); + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { + con_log(CL_ANN, (CE_WARN, "Calling ddi_intr_block _enable")); + + (void) ddi_intr_block_enable(instance->intr_htable, + instance->intr_cnt); + } else { + con_log(CL_ANN, (CE_NOTE, " calling ddi_intr_enable")); + + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_enable(instance->intr_htable[i]); + con_log(CL_ANN, (CE_NOTE, "ddi intr enable returns " + "%d", i)); + } + } + + return (DDI_SUCCESS); + +} + + +static void +drsas_rem_intrs(struct drsas_instance *instance) +{ + int i; + + con_log(CL_ANN, (CE_NOTE, "drsas_rem_intrs called")); + + /* Disable all interrupts first */ + if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { + (void) ddi_intr_block_disable(instance->intr_htable, + instance->intr_cnt); + } else { + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_disable(instance->intr_htable[i]); + } + } + + /* Remove all the handlers */ + + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_remove_handler(instance->intr_htable[i]); + (void) ddi_intr_free(instance->intr_htable[i]); + } + + kmem_free(instance->intr_htable, instance->intr_size); +} + +static int +drsas_tran_bus_config(dev_info_t *parent, uint_t flags, + ddi_bus_config_op_t op, void *arg, dev_info_t **childp) +{ + struct drsas_instance *instance; + int config; + int rval; + + char *ptr = NULL; + int tgt, lun; + + con_log(CL_ANN1, (CE_NOTE, "Bus config called for op = %x", op)); + + if ((instance = ddi_get_soft_state(drsas_state, + ddi_get_instance(parent))) == NULL) { + return (NDI_FAILURE); + } + + /* Hold nexus during bus_config */ + ndi_devi_enter(parent, &config); + switch (op) { + case BUS_CONFIG_ONE: { + + /* parse wwid/target name out 
of name given */ + if ((ptr = strchr((char *)arg, '@')) == NULL) { + rval = NDI_FAILURE; + break; + } + ptr++; + + if (drsas_parse_devname(arg, &tgt, &lun) != 0) { + rval = NDI_FAILURE; + break; + } + + if (lun == 0) { + rval = drsas_config_ld(instance, tgt, lun, childp); + } else { + rval = NDI_FAILURE; + } + + break; + } + case BUS_CONFIG_DRIVER: + case BUS_CONFIG_ALL: { + + rval = drsas_config_all_devices(instance); + + rval = NDI_SUCCESS; + break; + } + } + + if (rval == NDI_SUCCESS) { + rval = ndi_busop_bus_config(parent, flags, op, arg, childp, 0); + + } + ndi_devi_exit(parent, config); + + con_log(CL_ANN1, (CE_NOTE, "drsas_tran_bus_config: rval = %x", + rval)); + return (rval); +} + +static int +drsas_config_all_devices(struct drsas_instance *instance) +{ + int rval, tgt; + + for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) { + (void) drsas_config_ld(instance, tgt, 0, NULL); + + } + + rval = NDI_SUCCESS; + return (rval); +} + +static int +drsas_parse_devname(char *devnm, int *tgt, int *lun) +{ + char devbuf[SCSI_MAXNAMELEN]; + char *addr; + char *p, *tp, *lp; + long num; + + /* Parse dev name and address */ + (void) strcpy(devbuf, devnm); + addr = ""; + for (p = devbuf; *p != '\0'; p++) { + if (*p == '@') { + addr = p + 1; + *p = '\0'; + } else if (*p == ':') { + *p = '\0'; + break; + } + } + + /* Parse target and lun */ + for (p = tp = addr, lp = NULL; *p != '\0'; p++) { + if (*p == ',') { + lp = p + 1; + *p = '\0'; + break; + } + } + if (tgt && tp) { + if (ddi_strtol(tp, NULL, 0x10, &num)) { + return (DDI_FAILURE); /* Can declare this as constant */ + } + *tgt = (int)num; + } + if (lun && lp) { + if (ddi_strtol(lp, NULL, 0x10, &num)) { + return (DDI_FAILURE); + } + *lun = (int)num; + } + return (DDI_SUCCESS); /* Success case */ +} + +static int +drsas_config_ld(struct drsas_instance *instance, uint16_t tgt, + uint8_t lun, dev_info_t **ldip) +{ + struct scsi_device *sd; + dev_info_t *child; + int rval; + + con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: t = %d l = %d", + tgt, lun)); + + if ((child = drsas_find_child(instance, tgt, lun)) != NULL) { + if (ldip) { + *ldip = child; + } + con_log(CL_ANN1, (CE_NOTE, + "drsas_config_ld: Child = %p found t = %d l = %d", + (void *)child, tgt, lun)); + return (NDI_SUCCESS); + } + + sd = kmem_zalloc(sizeof (struct scsi_device), KM_SLEEP); + sd->sd_address.a_hba_tran = instance->tran; + sd->sd_address.a_target = (uint16_t)tgt; + sd->sd_address.a_lun = (uint8_t)lun; + + if (scsi_hba_probe(sd, NULL) == SCSIPROBE_EXISTS) + rval = drsas_config_scsi_device(instance, sd, ldip); + else + rval = NDI_FAILURE; + + /* sd_unprobe is blank now. 
Free buffer manually */ + if (sd->sd_inq) { + kmem_free(sd->sd_inq, SUN_INQSIZE); + sd->sd_inq = (struct scsi_inquiry *)NULL; + } + + kmem_free(sd, sizeof (struct scsi_device)); + con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: return rval = %d", + rval)); + return (rval); +} + +static int +drsas_config_scsi_device(struct drsas_instance *instance, + struct scsi_device *sd, dev_info_t **dipp) +{ + char *nodename = NULL; + char **compatible = NULL; + int ncompatible = 0; + char *childname; + dev_info_t *ldip = NULL; + int tgt = sd->sd_address.a_target; + int lun = sd->sd_address.a_lun; + int dtype = sd->sd_inq->inq_dtype & DTYPE_MASK; + int rval; + + con_log(CL_ANN1, (CE_WARN, "dr_sas: scsi_device t%dL%d", tgt, lun)); + scsi_hba_nodename_compatible_get(sd->sd_inq, NULL, dtype, + NULL, &nodename, &compatible, &ncompatible); + + if (nodename == NULL) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: Found no compatible driver " + "for t%dL%d", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + childname = (dtype == DTYPE_DIRECT) ? "sd" : nodename; + con_log(CL_ANN1, (CE_WARN, + "dr_sas: Childname = %2s nodename = %s", childname, nodename)); + + /* Create a dev node */ + rval = ndi_devi_alloc(instance->dip, childname, DEVI_SID_NODEID, &ldip); + con_log(CL_ANN1, (CE_WARN, + "dr_sas_config_scsi_device: ndi_devi_alloc rval = %x", rval)); + if (rval == NDI_SUCCESS) { + if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "target", tgt) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d target", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "lun", lun) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d lun", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + if (ndi_prop_update_string_array(DDI_DEV_T_NONE, ldip, + "compatible", compatible, ncompatible) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d compatible", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + rval = ndi_devi_online(ldip, NDI_ONLINE_ATTACH); + if (rval != NDI_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to online " + "t%dl%d", tgt, lun)); + ndi_prop_remove_all(ldip); + (void) ndi_devi_free(ldip); + } else { + con_log(CL_ANN1, (CE_WARN, "dr_sas: online Done :" + "0 t%dl%d", tgt, lun)); + } + + } +finish: + if (dipp) { + *dipp = ldip; + } + + con_log(CL_DLEVEL1, (CE_WARN, + "dr_sas: config_scsi_device rval = %d t%dL%d", + rval, tgt, lun)); + scsi_hba_nodename_compatible_free(nodename, compatible); + return (rval); +} + +/*ARGSUSED*/ +static int +drsas_service_evt(struct drsas_instance *instance, int tgt, int lun, int event, + uint64_t wwn) +{ + struct drsas_eventinfo *mrevt = NULL; + + con_log(CL_ANN1, (CE_NOTE, + "drsas_service_evt called for t%dl%d event = %d", + tgt, lun, event)); + + if ((instance->taskq == NULL) || (mrevt = + kmem_zalloc(sizeof (struct drsas_eventinfo), KM_NOSLEEP)) == NULL) { + return (ENOMEM); + } + + mrevt->instance = instance; + mrevt->tgt = tgt; + mrevt->lun = lun; + mrevt->event = event; + + if ((ddi_taskq_dispatch(instance->taskq, + (void (*)(void *))drsas_issue_evt_taskq, mrevt, DDI_NOSLEEP)) != + DDI_SUCCESS) { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: Event task failed for t%dl%d event = %d", + tgt, lun, event)); + kmem_free(mrevt, sizeof (struct drsas_eventinfo)); + return (DDI_FAILURE); + } + return (DDI_SUCCESS); +} + +static void +drsas_issue_evt_taskq(struct drsas_eventinfo 
*mrevt) +{ + struct drsas_instance *instance = mrevt->instance; + dev_info_t *dip, *pdip; + int circ1 = 0; + char *devname; + + con_log(CL_ANN1, (CE_NOTE, "drsas_issue_evt_taskq: called for" + " tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + + if (mrevt->tgt < MRDRV_MAX_LD && mrevt->lun == 0) { + dip = instance->dr_ld_list[mrevt->tgt].dip; + } else { + return; + } + + ndi_devi_enter(instance->dip, &circ1); + switch (mrevt->event) { + case DRSAS_EVT_CONFIG_TGT: + if (dip == NULL) { + + if (mrevt->lun == 0) { + (void) drsas_config_ld(instance, mrevt->tgt, + 0, NULL); + } + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_CONFIG_TGT called:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + + } else { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_CONFIG_TGT dip != NULL:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } + break; + case DRSAS_EVT_UNCONFIG_TGT: + if (dip) { + if (i_ddi_devi_attached(dip)) { + + pdip = ddi_get_parent(dip); + + devname = kmem_zalloc(MAXNAMELEN + 1, KM_SLEEP); + (void) ddi_deviname(dip, devname); + + (void) devfs_clean(pdip, devname + 1, + DV_CLEAN_FORCE); + kmem_free(devname, MAXNAMELEN + 1); + } + (void) ndi_devi_offline(dip, NDI_DEVI_REMOVE); + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_UNCONFIG_TGT called:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } else { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_UNCONFIG_TGT dip == NULL:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } + break; + } + kmem_free(mrevt, sizeof (struct drsas_eventinfo)); + ndi_devi_exit(instance->dip, circ1); +} + +static int +drsas_mode_sense_build(struct scsi_pkt *pkt) +{ + union scsi_cdb *cdbp; + uint16_t page_code; + struct scsa_cmd *acmd; + struct buf *bp; + struct mode_header *modehdrp; + + cdbp = (void *)pkt->pkt_cdbp; + page_code = cdbp->cdb_un.sg.scsi[0]; + acmd = PKT2CMD(pkt); + bp = acmd->cmd_buf; + if ((!bp) && bp->b_un.b_addr && bp->b_bcount && acmd->cmd_dmacount) { + con_log(CL_ANN1, (CE_WARN, "Failing MODESENSE Command")); + /* ADD pkt statistics as Command failed. */ + return (NULL); + } + + bp_mapin(bp); + bzero(bp->b_un.b_addr, bp->b_bcount); + + switch (page_code) { + case 0x3: { + struct mode_format *page3p = NULL; + modehdrp = (struct mode_header *)(bp->b_un.b_addr); + modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH; + + page3p = (void *)((caddr_t)modehdrp + + MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH); + page3p->mode_page.code = 0x3; + page3p->mode_page.length = + (uchar_t)(sizeof (struct mode_format)); + page3p->data_bytes_sect = 512; + page3p->sect_track = 63; + break; + } + case 0x4: { + struct mode_geometry *page4p = NULL; + modehdrp = (struct mode_header *)(bp->b_un.b_addr); + modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH; + + page4p = (void *)((caddr_t)modehdrp + + MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH); + page4p->mode_page.code = 0x4; + page4p->mode_page.length = + (uchar_t)(sizeof (struct mode_geometry)); + page4p->heads = 255; + page4p->rpm = 10000; + break; + } + default: + break; + } + return (NULL); +} diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.conf b/usr/src/uts/common/io/dr_sas/dr_sas.conf new file mode 100644 index 0000000000..3792f43ca4 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.conf @@ -0,0 +1,15 @@ +# +# Copyright (c) 2008-2009, LSI Logic Corporation. +# All rights reserved. +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# + +# +# dr_sas.conf for sol 10 (and later) for all supported architectures +# +# global definitions + +# MSI specific flag. user can uncomment this line and set flag "yes" to enable MSI +#drsas-enable-msi="yes"; diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.h b/usr/src/uts/common/io/dr_sas/dr_sas.h new file mode 100644 index 0000000000..8f78658edf --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.h @@ -0,0 +1,1766 @@ +/* + * dr_sas.h: header for dr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_SAS_H_ +#define _DR_SAS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/scsi/scsi.h> +#include "dr_sas_list.h" + +/* + * MegaRAID SAS2.0 Driver meta data + */ +#define DRSAS_VERSION "LSIv2.0" +#define DRSAS_RELDATE "Jan 9, 2009" + +#define DRSAS_TRUE 1 +#define DRSAS_FALSE 0 + +/* + * MegaRAID SAS2.0 device id conversion definitions. + */ +#define INST2LSIRDCTL(x) ((x) << INST_MINOR_SHIFT) + +/* + * MegaRAID SAS2.0 supported controllers + */ +#define PCI_DEVICE_ID_LSI_2108VDE 0x0078 +#define PCI_DEVICE_ID_LSI_2108V 0x0079 + +/* + * Register Index for 2108 Controllers. + */ +#define REGISTER_SET_IO_2108 (2) + +#define DRSAS_MAX_SGE_CNT 0x50 + +#define DRSAS_IOCTL_DRIVER 0x12341234 +#define DRSAS_IOCTL_FIRMWARE 0x12345678 +#define DRSAS_IOCTL_AEN 0x87654321 + +#define DRSAS_1_SECOND 1000000 + +/* Dynamic Enumeration Flags */ +#define DRSAS_PD_LUN 1 +#define DRSAS_LD_LUN 0 +#define DRSAS_PD_TGT_MAX 255 +#define DRSAS_GET_PD_MAX(s) ((s)->dr_pd_max) +#define WWN_STRLEN 17 + +/* + * ===================================== + * MegaRAID SAS2.0 MFI firmware definitions + * ===================================== + */ +/* + * MFI stands for MegaRAID SAS2.0 FW Interface. This is just a moniker for + * protocol between the software and firmware. 
Commands are issued using + * "message frames" + */ + +/* + * FW posts its state in upper 4 bits of outbound_msg_0 register + */ +#define MFI_STATE_SHIFT 28 +#define MFI_STATE_MASK ((uint32_t)0xF<<MFI_STATE_SHIFT) +#define MFI_STATE_UNDEFINED ((uint32_t)0x0<<MFI_STATE_SHIFT) +#define MFI_STATE_BB_INIT ((uint32_t)0x1<<MFI_STATE_SHIFT) +#define MFI_STATE_FW_INIT ((uint32_t)0x4<<MFI_STATE_SHIFT) +#define MFI_STATE_WAIT_HANDSHAKE ((uint32_t)0x6<<MFI_STATE_SHIFT) +#define MFI_STATE_FW_INIT_2 ((uint32_t)0x7<<MFI_STATE_SHIFT) +#define MFI_STATE_DEVICE_SCAN ((uint32_t)0x8<<MFI_STATE_SHIFT) +#define MFI_STATE_BOOT_MESSAGE_PENDING ((uint32_t)0x9<<MFI_STATE_SHIFT) +#define MFI_STATE_FLUSH_CACHE ((uint32_t)0xA<<MFI_STATE_SHIFT) +#define MFI_STATE_READY ((uint32_t)0xB<<MFI_STATE_SHIFT) +#define MFI_STATE_OPERATIONAL ((uint32_t)0xC<<MFI_STATE_SHIFT) +#define MFI_STATE_FAULT ((uint32_t)0xF<<MFI_STATE_SHIFT) + +#define MRMFI_FRAME_SIZE 64 + +/* + * During FW init, clear pending cmds & reset state using inbound_msg_0 + * + * ABORT : Abort all pending cmds + * READY : Move from OPERATIONAL to READY state; discard queue info + * MFIMODE : Discard (possible) low MFA posted in 64-bit mode (??) + * CLR_HANDSHAKE: FW is waiting for HANDSHAKE from BIOS or Driver + */ +#define MFI_INIT_ABORT 0x00000001 +#define MFI_INIT_READY 0x00000002 +#define MFI_INIT_MFIMODE 0x00000004 +#define MFI_INIT_CLEAR_HANDSHAKE 0x00000008 +#define MFI_INIT_HOTPLUG 0x00000010 +#define MFI_STOP_ADP 0x00000020 +#define MFI_RESET_FLAGS MFI_INIT_READY|MFI_INIT_MFIMODE|MFI_INIT_ABORT + +/* + * MFI frame flags + */ +#define MFI_FRAME_POST_IN_REPLY_QUEUE 0x0000 +#define MFI_FRAME_DONT_POST_IN_REPLY_QUEUE 0x0001 +#define MFI_FRAME_SGL32 0x0000 +#define MFI_FRAME_SGL64 0x0002 +#define MFI_FRAME_SENSE32 0x0000 +#define MFI_FRAME_SENSE64 0x0004 +#define MFI_FRAME_DIR_NONE 0x0000 +#define MFI_FRAME_DIR_WRITE 0x0008 +#define MFI_FRAME_DIR_READ 0x0010 +#define MFI_FRAME_DIR_BOTH 0x0018 + +/* + * Definition for cmd_status + */ +#define MFI_CMD_STATUS_POLL_MODE 0xFF +#define MFI_CMD_STATUS_SYNC_MODE 0xFF + +/* + * MFI command opcodes + */ +#define MFI_CMD_OP_INIT 0x00 +#define MFI_CMD_OP_LD_READ 0x01 +#define MFI_CMD_OP_LD_WRITE 0x02 +#define MFI_CMD_OP_LD_SCSI 0x03 +#define MFI_CMD_OP_PD_SCSI 0x04 +#define MFI_CMD_OP_DCMD 0x05 +#define MFI_CMD_OP_ABORT 0x06 +#define MFI_CMD_OP_SMP 0x07 +#define MFI_CMD_OP_STP 0x08 + +#define DR_DCMD_CTRL_GET_INFO 0x01010000 + +#define DR_DCMD_CTRL_CACHE_FLUSH 0x01101000 +#define DR_FLUSH_CTRL_CACHE 0x01 +#define DR_FLUSH_DISK_CACHE 0x02 + +#define DR_DCMD_CTRL_SHUTDOWN 0x01050000 +#define DRSAS_ENABLE_DRIVE_SPINDOWN 0x01 + +#define DR_DCMD_CTRL_EVENT_GET_INFO 0x01040100 +#define DR_DCMD_CTRL_EVENT_GET 0x01040300 +#define DR_DCMD_CTRL_EVENT_WAIT 0x01040500 +#define DR_DCMD_LD_GET_PROPERTIES 0x03030000 +#define DR_DCMD_PD_GET_INFO 0x02020000 + +/* + * Solaris Specific MAX values + */ +#define MAX_SGL 24 +/* + * MFI command completion codes + */ +enum MFI_STAT { + MFI_STAT_OK = 0x00, + MFI_STAT_INVALID_CMD = 0x01, + MFI_STAT_INVALID_DCMD = 0x02, + MFI_STAT_INVALID_PARAMETER = 0x03, + MFI_STAT_INVALID_SEQUENCE_NUMBER = 0x04, + MFI_STAT_ABORT_NOT_POSSIBLE = 0x05, + MFI_STAT_APP_HOST_CODE_NOT_FOUND = 0x06, + MFI_STAT_APP_IN_USE = 0x07, + MFI_STAT_APP_NOT_INITIALIZED = 0x08, + MFI_STAT_ARRAY_INDEX_INVALID = 0x09, + MFI_STAT_ARRAY_ROW_NOT_EMPTY = 0x0a, + MFI_STAT_CONFIG_RESOURCE_CONFLICT = 0x0b, + MFI_STAT_DEVICE_NOT_FOUND = 0x0c, + MFI_STAT_DRIVE_TOO_SMALL = 0x0d, + MFI_STAT_FLASH_ALLOC_FAIL = 0x0e, + MFI_STAT_FLASH_BUSY = 
0x0f, + MFI_STAT_FLASH_ERROR = 0x10, + MFI_STAT_FLASH_IMAGE_BAD = 0x11, + MFI_STAT_FLASH_IMAGE_INCOMPLETE = 0x12, + MFI_STAT_FLASH_NOT_OPEN = 0x13, + MFI_STAT_FLASH_NOT_STARTED = 0x14, + MFI_STAT_FLUSH_FAILED = 0x15, + MFI_STAT_HOST_CODE_NOT_FOUNT = 0x16, + MFI_STAT_LD_CC_IN_PROGRESS = 0x17, + MFI_STAT_LD_INIT_IN_PROGRESS = 0x18, + MFI_STAT_LD_LBA_OUT_OF_RANGE = 0x19, + MFI_STAT_LD_MAX_CONFIGURED = 0x1a, + MFI_STAT_LD_NOT_OPTIMAL = 0x1b, + MFI_STAT_LD_RBLD_IN_PROGRESS = 0x1c, + MFI_STAT_LD_RECON_IN_PROGRESS = 0x1d, + MFI_STAT_LD_WRONG_RAID_LEVEL = 0x1e, + MFI_STAT_MAX_SPARES_EXCEEDED = 0x1f, + MFI_STAT_MEMORY_NOT_AVAILABLE = 0x20, + MFI_STAT_MFC_HW_ERROR = 0x21, + MFI_STAT_NO_HW_PRESENT = 0x22, + MFI_STAT_NOT_FOUND = 0x23, + MFI_STAT_NOT_IN_ENCL = 0x24, + MFI_STAT_PD_CLEAR_IN_PROGRESS = 0x25, + MFI_STAT_PD_TYPE_WRONG = 0x26, + MFI_STAT_PR_DISABLED = 0x27, + MFI_STAT_ROW_INDEX_INVALID = 0x28, + MFI_STAT_SAS_CONFIG_INVALID_ACTION = 0x29, + MFI_STAT_SAS_CONFIG_INVALID_DATA = 0x2a, + MFI_STAT_SAS_CONFIG_INVALID_PAGE = 0x2b, + MFI_STAT_SAS_CONFIG_INVALID_TYPE = 0x2c, + MFI_STAT_SCSI_DONE_WITH_ERROR = 0x2d, + MFI_STAT_SCSI_IO_FAILED = 0x2e, + MFI_STAT_SCSI_RESERVATION_CONFLICT = 0x2f, + MFI_STAT_SHUTDOWN_FAILED = 0x30, + MFI_STAT_TIME_NOT_SET = 0x31, + MFI_STAT_WRONG_STATE = 0x32, + MFI_STAT_LD_OFFLINE = 0x33, + /* UNUSED: 0x34 to 0xfe */ + MFI_STAT_INVALID_STATUS = 0xFF +}; + +enum DR_EVT_CLASS { + DR_EVT_CLASS_DEBUG = -2, + DR_EVT_CLASS_PROGRESS = -1, + DR_EVT_CLASS_INFO = 0, + DR_EVT_CLASS_WARNING = 1, + DR_EVT_CLASS_CRITICAL = 2, + DR_EVT_CLASS_FATAL = 3, + DR_EVT_CLASS_DEAD = 4 +}; + +enum DR_EVT_LOCALE { + DR_EVT_LOCALE_LD = 0x0001, + DR_EVT_LOCALE_PD = 0x0002, + DR_EVT_LOCALE_ENCL = 0x0004, + DR_EVT_LOCALE_BBU = 0x0008, + DR_EVT_LOCALE_SAS = 0x0010, + DR_EVT_LOCALE_CTRL = 0x0020, + DR_EVT_LOCALE_CONFIG = 0x0040, + DR_EVT_LOCALE_CLUSTER = 0x0080, + DR_EVT_LOCALE_ALL = 0xffff +}; + +#define DR_EVT_CFG_CLEARED 0x0004 +#define DR_EVT_LD_CREATED 0x008a +#define DR_EVT_LD_DELETED 0x008b +#define DR_EVT_PD_REMOVED_EXT 0x00f8 +#define DR_EVT_PD_INSERTED_EXT 0x00f7 + +enum LD_STATE { + LD_OFFLINE = 0, + LD_PARTIALLY_DEGRADED = 1, + LD_DEGRADED = 2, + LD_OPTIMAL = 3, + LD_INVALID = 0xFF +}; + +enum DRSAS_EVT { + DRSAS_EVT_CONFIG_TGT = 0, + DRSAS_EVT_UNCONFIG_TGT = 1, + DRSAS_EVT_UNCONFIG_SMP = 2 +}; + +#define DMA_OBJ_ALLOCATED 1 +#define DMA_OBJ_REALLOCATED 2 +#define DMA_OBJ_FREED 3 + +/* + * dma_obj_t - Our DMA object + * @param buffer : kernel virtual address + * @param size : size of the data to be allocated + * @param acc_handle : access handle + * @param dma_handle : dma handle + * @param dma_cookie : scatter-gather list + * @param dma_attr : dma attributes for this buffer + * Our DMA object. The caller must initialize the size and dma attributes + * (dma_attr) fields before allocating the resources. 
+ */ +typedef struct { + caddr_t buffer; + uint32_t size; + ddi_acc_handle_t acc_handle; + ddi_dma_handle_t dma_handle; + ddi_dma_cookie_t dma_cookie[DRSAS_MAX_SGE_CNT]; + ddi_dma_attr_t dma_attr; + uint8_t status; + uint8_t reserved[3]; +} dma_obj_t; + +struct drsas_eventinfo { + struct drsas_instance *instance; + int tgt; + int lun; + int event; +}; + +struct drsas_ld { + dev_info_t *dip; + uint8_t lun_type; + uint8_t reserved[3]; +}; + +struct drsas_pd { + dev_info_t *dip; + uint8_t lun_type; + uint8_t dev_id; + uint8_t flags; + uint8_t reserved; +}; + +struct drsas_pd_info { + uint16_t deviceId; + uint16_t seqNum; + uint8_t inquiryData[96]; + uint8_t vpdPage83[64]; + uint8_t notSupported; + uint8_t scsiDevType; + uint8_t a; + uint8_t device_speed; + uint32_t mediaerrcnt; + uint32_t other; + uint32_t pred; + uint32_t lastpred; + uint16_t fwState; + uint8_t disabled; + uint8_t linkspwwd; + uint32_t ddfType; + struct { + uint8_t count; + uint8_t isPathBroken; + uint8_t connectorIndex[2]; + uint8_t reserved[4]; + uint64_t sasAddr[2]; + uint8_t reserved2[16]; + } pathInfo; +}; + +typedef struct drsas_instance { + uint32_t *producer; + uint32_t *consumer; + + uint32_t *reply_queue; + dma_obj_t mfi_internal_dma_obj; + + uint8_t init_id; + uint8_t reserved[3]; + + uint16_t max_num_sge; + uint16_t max_fw_cmds; + uint32_t max_sectors_per_req; + + struct drsas_cmd **cmd_list; + + mlist_t cmd_pool_list; + kmutex_t cmd_pool_mtx; + + mlist_t cmd_pend_list; + kmutex_t cmd_pend_mtx; + + dma_obj_t mfi_evt_detail_obj; + struct drsas_cmd *aen_cmd; + + uint32_t aen_seq_num; + uint32_t aen_class_locale_word; + + scsi_hba_tran_t *tran; + + kcondvar_t int_cmd_cv; + kmutex_t int_cmd_mtx; + + kcondvar_t aen_cmd_cv; + kmutex_t aen_cmd_mtx; + + kcondvar_t abort_cmd_cv; + kmutex_t abort_cmd_mtx; + + dev_info_t *dip; + ddi_acc_handle_t pci_handle; + + timeout_id_t timeout_id; + uint32_t unique_id; + uint16_t fw_outstanding; + caddr_t regmap; + ddi_acc_handle_t regmap_handle; + uint8_t isr_level; + ddi_iblock_cookie_t iblock_cookie; + ddi_iblock_cookie_t soft_iblock_cookie; + ddi_softintr_t soft_intr_id; + uint8_t softint_running; + kmutex_t completed_pool_mtx; + mlist_t completed_pool_list; + + caddr_t internal_buf; + uint32_t internal_buf_dmac_add; + uint32_t internal_buf_size; + + uint16_t vendor_id; + uint16_t device_id; + uint16_t subsysvid; + uint16_t subsysid; + int instance; + int baseaddress; + char iocnode[16]; + + int fm_capabilities; + + struct drsas_func_ptr *func_ptr; + /* MSI interrupts specific */ + ddi_intr_handle_t *intr_htable; + int intr_type; + int intr_cnt; + size_t intr_size; + uint_t intr_pri; + int intr_cap; + + ddi_taskq_t *taskq; + struct drsas_ld *dr_ld_list; +} drsas_t; + +struct drsas_func_ptr { + int (*read_fw_status_reg)(struct drsas_instance *); + void (*issue_cmd)(struct drsas_cmd *, struct drsas_instance *); + int (*issue_cmd_in_sync_mode)(struct drsas_instance *, + struct drsas_cmd *); + int (*issue_cmd_in_poll_mode)(struct drsas_instance *, + struct drsas_cmd *); + void (*enable_intr)(struct drsas_instance *); + void (*disable_intr)(struct drsas_instance *); + int (*intr_ack)(struct drsas_instance *); +}; + +/* + * ### Helper routines ### + */ + +/* + * con_log() - console log routine + * @param level : indicates the severity of the message. + * @fparam mt : format string + * + * con_log displays the error messages on the console based on the current + * debug level. Also it attaches the appropriate kernel severity level with + * the message. 
+ *
+ *
+ * console messages debug levels
+ */
+#define CL_NONE 0 /* No debug information */
+#define CL_ANN 1 /* print unconditionally, announcements */
+#define CL_ANN1 2 /* No o/p */
+#define CL_DLEVEL1 3 /* debug level 1, informative */
+#define CL_DLEVEL2 4 /* debug level 2, verbose */
+#define CL_DLEVEL3 5 /* debug level 3, very verbose */
+
+#ifdef __SUNPRO_C
+#define __func__ ""
+#endif
+
+#define con_log(level, fmt) { if (debug_level_g >= level) cmn_err fmt; }
+
+/*
+ * ### SCSA definitions ###
+ */
+#define PKT2TGT(pkt) ((pkt)->pkt_address.a_target)
+#define PKT2LUN(pkt) ((pkt)->pkt_address.a_lun)
+#define PKT2TRAN(pkt) ((pkt)->pkt_address.a_hba_tran)
+#define ADDR2TRAN(ap) ((ap)->a_hba_tran)
+
+#define TRAN2MR(tran) ((struct drsas_instance *)(tran)->tran_hba_private)
+#define ADDR2MR(ap) (TRAN2MR(ADDR2TRAN(ap)))
+
+#define PKT2CMD(pkt) ((struct scsa_cmd *)(pkt)->pkt_ha_private)
+#define CMD2PKT(sp) ((sp)->cmd_pkt)
+#define PKT2REQ(pkt) (&(PKT2CMD(pkt)->request))
+
+#define CMD2ADDR(cmd) (&CMD2PKT(cmd)->pkt_address)
+#define CMD2TRAN(cmd) (CMD2PKT(cmd)->pkt_address.a_hba_tran)
+#define CMD2MR(cmd) (TRAN2MR(CMD2TRAN(cmd)))
+
+#define CFLAG_DMAVALID 0x0001 /* requires a dma operation */
+#define CFLAG_DMASEND 0x0002 /* Transfer from the device */
+#define CFLAG_CONSISTENT 0x0040 /* consistent data transfer */
+
+/*
+ * ### Data structures for ioctl interface and internal commands ###
+ */
+
+/*
+ * Data direction flags
+ */
+#define UIOC_RD 0x00001
+#define UIOC_WR 0x00002
+
+#define SCP2HOST(scp) (scp)->device->host /* to host */
+#define SCP2HOSTDATA(scp) SCP2HOST(scp)->hostdata /* to soft state */
+#define SCP2CHANNEL(scp) (scp)->device->channel /* to channel */
+#define SCP2TARGET(scp) (scp)->device->id /* to target */
+#define SCP2LUN(scp) (scp)->device->lun /* to LUN */
+
+#define SCSIHOST2ADAP(host) (((caddr_t *)(host->hostdata))[0])
+#define SCP2ADAPTER(scp) \
+ (struct drsas_instance *)SCSIHOST2ADAP(SCP2HOST(scp))
+
+#define MRDRV_IS_LOGICAL_SCSA(instance, acmd) \
+ (acmd->device_id < MRDRV_MAX_LD) ? 1 : 0
+#define MRDRV_IS_LOGICAL(ap) \
+ ((ap->a_target < MRDRV_MAX_LD) && (ap->a_lun == 0)) ?
1 : 0 +#define MAP_DEVICE_ID(instance, ap) \ + (ap->a_target) + +#define HIGH_LEVEL_INTR 1 +#define NORMAL_LEVEL_INTR 0 + +/* + * scsa_cmd - Per-command mr private data + * @param cmd_dmahandle : dma handle + * @param cmd_dmacookies : current dma cookies + * @param cmd_pkt : scsi_pkt reference + * @param cmd_dmacount : dma count + * @param cmd_cookie : next cookie + * @param cmd_ncookies : cookies per window + * @param cmd_cookiecnt : cookies per sub-win + * @param cmd_nwin : number of dma windows + * @param cmd_curwin : current dma window + * @param cmd_dma_offset : current window offset + * @param cmd_dma_len : current window length + * @param cmd_flags : private flags + * @param cmd_cdblen : length of cdb + * @param cmd_scblen : length of scb + * @param cmd_buf : command buffer + * @param channel : channel for scsi sub-system + * @param target : target for scsi sub-system + * @param lun : LUN for scsi sub-system + * + * - Allocated at same time as scsi_pkt by scsi_hba_pkt_alloc(9E) + * - Pointed to by pkt_ha_private field in scsi_pkt + */ +struct scsa_cmd { + ddi_dma_handle_t cmd_dmahandle; + ddi_dma_cookie_t cmd_dmacookies[DRSAS_MAX_SGE_CNT]; + struct scsi_pkt *cmd_pkt; + ulong_t cmd_dmacount; + uint_t cmd_cookie; + uint_t cmd_ncookies; + uint_t cmd_cookiecnt; + uint_t cmd_nwin; + uint_t cmd_curwin; + off_t cmd_dma_offset; + ulong_t cmd_dma_len; + ulong_t cmd_flags; + uint_t cmd_cdblen; + uint_t cmd_scblen; + struct buf *cmd_buf; + ushort_t device_id; + uchar_t islogical; + uchar_t lun; + struct drsas_device *drsas_dev; +}; + + +struct drsas_cmd { + union drsas_frame *frame; + uint32_t frame_phys_addr; + uint8_t *sense; + uint32_t sense_phys_addr; + dma_obj_t frame_dma_obj; + uint8_t frame_dma_obj_status; + + uint32_t index; + uint8_t sync_cmd; + uint8_t cmd_status; + uint16_t abort_aen; + mlist_t list; + uint32_t frame_count; + struct scsa_cmd *cmd; + struct scsi_pkt *pkt; +}; + +#define MAX_MGMT_ADAPTERS 1024 +#define IOC_SIGNATURE "MR-SAS" + +#define IOC_CMD_FIRMWARE 0x0 +#define DRSAS_DRIVER_IOCTL_COMMON 0xF0010000 +#define DRSAS_DRIVER_IOCTL_DRIVER_VERSION 0xF0010100 +#define DRSAS_DRIVER_IOCTL_PCI_INFORMATION 0xF0010200 +#define DRSAS_DRIVER_IOCTL_MRRAID_STATISTICS 0xF0010300 + + +#define DRSAS_MAX_SENSE_LENGTH 32 + +struct drsas_mgmt_info { + + uint16_t count; + struct drsas_instance *instance[MAX_MGMT_ADAPTERS]; + uint16_t map[MAX_MGMT_ADAPTERS]; + int max_index; +}; + +#pragma pack(1) + +/* + * SAS controller properties + */ +struct drsas_ctrl_prop { + uint16_t seq_num; + uint16_t pred_fail_poll_interval; + uint16_t intr_throttle_count; + uint16_t intr_throttle_timeouts; + + uint8_t rebuild_rate; + uint8_t patrol_read_rate; + uint8_t bgi_rate; + uint8_t cc_rate; + uint8_t recon_rate; + + uint8_t cache_flush_interval; + + uint8_t spinup_drv_count; + uint8_t spinup_delay; + + uint8_t cluster_enable; + uint8_t coercion_mode; + uint8_t disk_write_cache_disable; + uint8_t alarm_enable; + + uint8_t reserved[44]; +}; + +/* + * SAS controller information + */ +struct drsas_ctrl_info { + /* PCI device information */ + struct { + uint16_t vendor_id; + uint16_t device_id; + uint16_t sub_vendor_id; + uint16_t sub_device_id; + uint8_t reserved[24]; + } pci; + + /* Host interface information */ + struct { + uint8_t PCIX : 1; + uint8_t PCIE : 1; + uint8_t iSCSI : 1; + uint8_t SAS_3G : 1; + uint8_t reserved_0 : 4; + uint8_t reserved_1[6]; + uint8_t port_count; + uint64_t port_addr[8]; + } host_interface; + + /* Device (backend) interface information */ + struct { + uint8_t SPI : 1; + uint8_t 
SAS_3G : 1; + uint8_t SATA_1_5G : 1; + uint8_t SATA_3G : 1; + uint8_t reserved_0 : 4; + uint8_t reserved_1[6]; + uint8_t port_count; + uint64_t port_addr[8]; + } device_interface; + + /* List of components residing in flash. All str are null terminated */ + uint32_t image_check_word; + uint32_t image_component_count; + + struct { + char name[8]; + char version[32]; + char build_date[16]; + char built_time[16]; + } image_component[8]; + + /* + * List of flash components that have been flashed on the card, but + * are not in use, pending reset of the adapter. This list will be + * empty if a flash operation has not occurred. All stings are null + * terminated + */ + uint32_t pending_image_component_count; + + struct { + char name[8]; + char version[32]; + char build_date[16]; + char build_time[16]; + } pending_image_component[8]; + + uint8_t max_arms; + uint8_t max_spans; + uint8_t max_arrays; + uint8_t max_lds; + + char product_name[80]; + char serial_no[32]; + + /* + * Other physical/controller/operation information. Indicates the + * presence of the hardware + */ + struct { + uint32_t bbu : 1; + uint32_t alarm : 1; + uint32_t nvram : 1; + uint32_t uart : 1; + uint32_t reserved : 28; + } hw_present; + + uint32_t current_fw_time; + + /* Maximum data transfer sizes */ + uint16_t max_concurrent_cmds; + uint16_t max_sge_count; + uint32_t max_request_size; + + /* Logical and physical device counts */ + uint16_t ld_present_count; + uint16_t ld_degraded_count; + uint16_t ld_offline_count; + + uint16_t pd_present_count; + uint16_t pd_disk_present_count; + uint16_t pd_disk_pred_failure_count; + uint16_t pd_disk_failed_count; + + /* Memory size information */ + uint16_t nvram_size; + uint16_t memory_size; + uint16_t flash_size; + + /* Error counters */ + uint16_t mem_correctable_error_count; + uint16_t mem_uncorrectable_error_count; + + /* Cluster information */ + uint8_t cluster_permitted; + uint8_t cluster_active; + uint8_t reserved_1[2]; + + /* Controller capabilities structures */ + struct { + uint32_t raid_level_0 : 1; + uint32_t raid_level_1 : 1; + uint32_t raid_level_5 : 1; + uint32_t raid_level_1E : 1; + uint32_t reserved : 28; + } raid_levels; + + struct { + uint32_t rbld_rate : 1; + uint32_t cc_rate : 1; + uint32_t bgi_rate : 1; + uint32_t recon_rate : 1; + uint32_t patrol_rate : 1; + uint32_t alarm_control : 1; + uint32_t cluster_supported : 1; + uint32_t bbu : 1; + uint32_t spanning_allowed : 1; + uint32_t dedicated_hotspares : 1; + uint32_t revertible_hotspares : 1; + uint32_t foreign_config_import : 1; + uint32_t self_diagnostic : 1; + uint32_t reserved : 19; + } adapter_operations; + + struct { + uint32_t read_policy : 1; + uint32_t write_policy : 1; + uint32_t io_policy : 1; + uint32_t access_policy : 1; + uint32_t reserved : 28; + } ld_operations; + + struct { + uint8_t min; + uint8_t max; + uint8_t reserved[2]; + } stripe_size_operations; + + struct { + uint32_t force_online : 1; + uint32_t force_offline : 1; + uint32_t force_rebuild : 1; + uint32_t reserved : 29; + } pd_operations; + + struct { + uint32_t ctrl_supports_sas : 1; + uint32_t ctrl_supports_sata : 1; + uint32_t allow_mix_in_encl : 1; + uint32_t allow_mix_in_ld : 1; + uint32_t allow_sata_in_cluster : 1; + uint32_t reserved : 27; + } pd_mix_support; + + /* Include the controller properties (changeable items) */ + uint8_t reserved_2[12]; + struct drsas_ctrl_prop properties; + + uint8_t pad[0x800 - 0x640]; +}; + +/* + * ================================== + * MegaRAID SAS2.0 driver definitions + * 
================================== + */ +#define MRDRV_MAX_NUM_CMD 1024 + +#define MRDRV_MAX_PD_CHANNELS 2 +#define MRDRV_MAX_LD_CHANNELS 2 +#define MRDRV_MAX_CHANNELS (MRDRV_MAX_PD_CHANNELS + \ + MRDRV_MAX_LD_CHANNELS) +#define MRDRV_MAX_DEV_PER_CHANNEL 128 +#define MRDRV_DEFAULT_INIT_ID -1 +#define MRDRV_MAX_CMD_PER_LUN 1000 +#define MRDRV_MAX_LUN 1 +#define MRDRV_MAX_LD 64 + +#define MRDRV_RESET_WAIT_TIME 300 +#define MRDRV_RESET_NOTICE_INTERVAL 5 + +#define DRSAS_IOCTL_CMD 0 + +/* + * FW can accept both 32 and 64 bit SGLs. We want to allocate 32/64 bit + * SGLs based on the size of dma_addr_t + */ +#define IS_DMA64 (sizeof (dma_addr_t) == 8) + +#define IB_MSG_0_OFF 0x10 /* XScale */ +#define OB_MSG_0_OFF 0x18 /* XScale */ +#define IB_DOORBELL_OFF 0x20 /* XScale & ROC */ +#define OB_INTR_STATUS_OFF 0x30 /* XScale & ROC */ +#define OB_INTR_MASK_OFF 0x34 /* XScale & ROC */ +#define IB_QPORT_OFF 0x40 /* XScale & ROC */ +#define OB_DOORBELL_CLEAR_OFF 0xA0 /* ROC */ +#define OB_SCRATCH_PAD_0_OFF 0xB0 /* ROC */ +#define OB_INTR_MASK 0xFFFFFFFF +#define OB_DOORBELL_CLEAR_MASK 0xFFFFFFFF + +/* + * All MFI register set macros accept drsas_register_set* + */ +#define WR_IB_MSG_0(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_MSG_0_OFF), (v)) + +#define RD_OB_MSG_0(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_MSG_0_OFF)) + +#define WR_IB_DOORBELL(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF), (v)) + +#define RD_IB_DOORBELL(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF)) + +#define WR_OB_INTR_STATUS(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF), (v)) + +#define RD_OB_INTR_STATUS(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF)) + +#define WR_OB_INTR_MASK(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), (v)) + +#define RD_OB_INTR_MASK(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF)) + +#define WR_IB_QPORT(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_QPORT_OFF), (v)) + +#define WR_OB_DOORBELL_CLEAR(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_DOORBELL_CLEAR_OFF), \ + (v)) + +#define RD_OB_SCRATCH_PAD_0(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_SCRATCH_PAD_0_OFF)) + +/* + * When FW is in MFI_STATE_READY or MFI_STATE_OPERATIONAL, the state data + * of Outbound Msg Reg 0 indicates max concurrent cmds supported, max SGEs + * supported per cmd and if 64-bit MFAs (M64) is enabled or disabled. + */ +#define MFI_OB_INTR_STATUS_MASK 0x00000002 + +/* + * This MFI_REPLY_2108_MESSAGE_INTR flag is used also + * in enable_intr_ppc also. Hence bit 2, i.e. 0x4 has + * been set in this flag along with bit 1. 
+ */ +#define MFI_REPLY_2108_MESSAGE_INTR 0x00000001 +#define MFI_REPLY_2108_MESSAGE_INTR_MASK 0x00000005 + +#define MFI_POLL_TIMEOUT_SECS 60 + +#define MFI_ENABLE_INTR(instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), 1) +#define MFI_DISABLE_INTR(instance) \ +{ \ + uint32_t disable = 1; \ + uint32_t mask = ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF));\ + mask &= ~disable; \ + ddi_put32((instance)->regmap_handle, (uint32_t *) \ + (uintptr_t)((instance)->regmap + OB_INTR_MASK_OFF), mask); \ +} + +/* By default, the firmware programs for 8 Kbytes of memory */ +#define DEFAULT_MFI_MEM_SZ 8192 +#define MINIMUM_MFI_MEM_SZ 4096 + +/* DCMD Message Frame MAILBOX0-11 */ +#define DCMD_MBOX_SZ 12 + + +struct drsas_register_set { + uint32_t reserved_0[4]; + + uint32_t inbound_msg_0; + uint32_t inbound_msg_1; + uint32_t outbound_msg_0; + uint32_t outbound_msg_1; + + uint32_t inbound_doorbell; + uint32_t inbound_intr_status; + uint32_t inbound_intr_mask; + + uint32_t outbound_doorbell; + uint32_t outbound_intr_status; + uint32_t outbound_intr_mask; + + uint32_t reserved_1[2]; + + uint32_t inbound_queue_port; + uint32_t outbound_queue_port; + + uint32_t reserved_2[22]; + + uint32_t outbound_doorbell_clear; + + uint32_t reserved_3[3]; + + uint32_t outbound_scratch_pad; + + uint32_t reserved_4[3]; + + uint32_t inbound_low_queue_port; + + uint32_t inbound_high_queue_port; + + uint32_t reserved_5; + uint32_t index_registers[820]; +}; + +struct drsas_sge32 { + uint32_t phys_addr; + uint32_t length; +}; + +struct drsas_sge64 { + uint64_t phys_addr; + uint32_t length; +}; + +union drsas_sgl { + struct drsas_sge32 sge32[1]; + struct drsas_sge64 sge64[1]; +}; + +struct drsas_header { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t lun; + uint8_t cdb_len; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t data_xferlen; +}; + +union drsas_sgl_frame { + struct drsas_sge32 sge32[8]; + struct drsas_sge64 sge64[5]; +}; + +struct drsas_init_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + + uint8_t reserved_1; + uint32_t reserved_2; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t reserved_3; + uint32_t data_xfer_len; + + uint32_t queue_info_new_phys_addr_lo; + uint32_t queue_info_new_phys_addr_hi; + uint32_t queue_info_old_phys_addr_lo; + uint32_t queue_info_old_phys_addr_hi; + + uint32_t reserved_4[6]; +}; + +struct drsas_init_queue_info { + uint32_t init_flags; + uint32_t reply_queue_entries; + + uint32_t reply_queue_start_phys_addr_lo; + uint32_t reply_queue_start_phys_addr_hi; + uint32_t producer_index_phys_addr_lo; + uint32_t producer_index_phys_addr_hi; + uint32_t consumer_index_phys_addr_lo; + uint32_t consumer_index_phys_addr_hi; +}; + +struct drsas_io_frame { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t access_byte; + uint8_t reserved_0; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t lba_count; + + uint32_t sense_buf_phys_addr_lo; + uint32_t sense_buf_phys_addr_hi; + + uint32_t start_lba_lo; + uint32_t start_lba_hi; + + union drsas_sgl sgl; +}; + +struct drsas_pthru_frame { + 
uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t lun; + uint8_t cdb_len; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t data_xfer_len; + + uint32_t sense_buf_phys_addr_lo; + uint32_t sense_buf_phys_addr_hi; + + uint8_t cdb[16]; + union drsas_sgl sgl; +}; + +struct drsas_dcmd_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + uint8_t reserved_1[4]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + uint32_t opcode; + + union { + uint8_t b[DCMD_MBOX_SZ]; + uint16_t s[6]; + uint32_t w[3]; + } mbox; + + union drsas_sgl sgl; +}; + +struct drsas_abort_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + + uint8_t reserved_1; + uint32_t reserved_2; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t reserved_3; + uint32_t reserved_4; + + uint32_t abort_context; + uint32_t pad_1; + + uint32_t abort_mfi_phys_addr_lo; + uint32_t abort_mfi_phys_addr_hi; + + uint32_t reserved_5[6]; +}; + +struct drsas_smp_frame { + uint8_t cmd; + uint8_t reserved_1; + uint8_t cmd_status; + uint8_t connection_status; + + uint8_t reserved_2[3]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + + uint64_t sas_addr; + + union drsas_sgl sgl[2]; +}; + +struct drsas_stp_frame { + uint8_t cmd; + uint8_t reserved_1; + uint8_t cmd_status; + uint8_t connection_status; + + uint8_t target_id; + uint8_t reserved_2[2]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + + uint16_t fis[10]; + uint32_t stp_flags; + union drsas_sgl sgl; +}; + +union drsas_frame { + struct drsas_header hdr; + struct drsas_init_frame init; + struct drsas_io_frame io; + struct drsas_pthru_frame pthru; + struct drsas_dcmd_frame dcmd; + struct drsas_abort_frame abort; + struct drsas_smp_frame smp; + struct drsas_stp_frame stp; + + uint8_t raw_bytes[64]; +}; + +typedef struct drsas_pd_address { + uint16_t device_id; + uint16_t encl_id; + + union { + struct { + uint8_t encl_index; + uint8_t slot_number; + } pd_address; + struct { + uint8_t encl_position; + uint8_t encl_connector_index; + } encl_address; + }address; + + uint8_t scsi_dev_type; + + union { + uint8_t port_bitmap; + uint8_t port_numbers; + } connected; + + uint64_t sas_addr[2]; +} drsas_pd_address_t; + +union drsas_evt_class_locale { + struct { + uint16_t locale; + uint8_t reserved; + int8_t class; + } members; + + uint32_t word; +}; + +struct drsas_evt_log_info { + uint32_t newest_seq_num; + uint32_t oldest_seq_num; + uint32_t clear_seq_num; + uint32_t shutdown_seq_num; + uint32_t boot_seq_num; +}; + +struct drsas_progress { + uint16_t progress; + uint16_t elapsed_seconds; +}; + +struct drsas_evtarg_ld { + uint16_t target_id; + uint8_t ld_index; + uint8_t reserved; +}; + +struct drsas_evtarg_pd { + uint16_t device_id; + uint8_t encl_index; + uint8_t slot_number; +}; + +struct drsas_evt_detail { + uint32_t seq_num; + uint32_t time_stamp; + uint32_t code; + union drsas_evt_class_locale cl; + uint8_t arg_type; + uint8_t reserved1[15]; + + union { + struct { + struct drsas_evtarg_pd pd; + uint8_t 
cdb_length; + uint8_t sense_length; + uint8_t reserved[2]; + uint8_t cdb[16]; + uint8_t sense[64]; + } cdbSense; + + struct drsas_evtarg_ld ld; + + struct { + struct drsas_evtarg_ld ld; + uint64_t count; + } ld_count; + + struct { + uint64_t lba; + struct drsas_evtarg_ld ld; + } ld_lba; + + struct { + struct drsas_evtarg_ld ld; + uint32_t prevOwner; + uint32_t newOwner; + } ld_owner; + + struct { + uint64_t ld_lba; + uint64_t pd_lba; + struct drsas_evtarg_ld ld; + struct drsas_evtarg_pd pd; + } ld_lba_pd_lba; + + struct { + struct drsas_evtarg_ld ld; + struct drsas_progress prog; + } ld_prog; + + struct { + struct drsas_evtarg_ld ld; + uint32_t prev_state; + uint32_t new_state; + } ld_state; + + struct { + uint64_t strip; + struct drsas_evtarg_ld ld; + } ld_strip; + + struct drsas_evtarg_pd pd; + + struct { + struct drsas_evtarg_pd pd; + uint32_t err; + } pd_err; + + struct { + uint64_t lba; + struct drsas_evtarg_pd pd; + } pd_lba; + + struct { + uint64_t lba; + struct drsas_evtarg_pd pd; + struct drsas_evtarg_ld ld; + } pd_lba_ld; + + struct { + struct drsas_evtarg_pd pd; + struct drsas_progress prog; + } pd_prog; + + struct { + struct drsas_evtarg_pd pd; + uint32_t prevState; + uint32_t newState; + } pd_state; + + struct { + uint16_t vendorId; + uint16_t deviceId; + uint16_t subVendorId; + uint16_t subDeviceId; + } pci; + + uint32_t rate; + char str[96]; + + struct { + uint32_t rtc; + uint32_t elapsedSeconds; + } time; + + struct { + uint32_t ecar; + uint32_t elog; + char str[64]; + } ecc; + + drsas_pd_address_t pd_addr; + + uint8_t b[96]; + uint16_t s[48]; + uint32_t w[24]; + uint64_t d[12]; + } args; + + char description[128]; + +}; + +/* only 63 are usable by the application */ +#define MAX_LOGICAL_DRIVES 64 +/* only 255 physical devices may be used */ +#define MAX_PHYSICAL_DEVICES 256 +#define MAX_PD_PER_ENCLOSURE 64 +/* maximum disks per array */ +#define MAX_ROW_SIZE 32 +/* maximum spans per logical drive */ +#define MAX_SPAN_DEPTH 8 +/* maximum number of arrays a hot spare may be dedicated to */ +#define MAX_ARRAYS_DEDICATED 16 +/* maximum number of arrays which may exist */ +#define MAX_ARRAYS 128 +/* maximum number of foreign configs that may ha managed at once */ +#define MAX_FOREIGN_CONFIGS 8 +/* maximum spares (global and dedicated combined) */ +#define MAX_SPARES_FOR_THE_CONTROLLER MAX_PHYSICAL_DEVICES +/* maximum possible Target IDs (i.e. 
0 to 63) */ +#define MAX_TARGET_ID 63 +/* maximum number of supported enclosures */ +#define MAX_ENCLOSURES 32 +/* maximum number of PHYs per controller */ +#define MAX_PHYS_PER_CONTROLLER 16 +/* maximum number of LDs per array (due to DDF limitations) */ +#define MAX_LDS_PER_ARRAY 16 + +/* + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + * + * Logical Drive commands + * + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + */ +#define DR_DCMD_LD 0x03000000, /* Logical Device (LD) opcodes */ + +/* + * Input: dcmd.opcode - DR_DCMD_LD_GET_LIST + * dcmd.mbox - reserved + * dcmd.sge IN - ptr to returned DR_LD_LIST structure + * Desc: Return the logical drive list structure + * Status: No error + */ + +/* + * defines the logical drive reference structure + */ +typedef union _DR_LD_REF { /* LD reference structure */ + struct { + uint8_t targetId; /* LD target id (0 to MAX_TARGET_ID) */ + uint8_t reserved; /* reserved for in line with DR_PD_REF */ + uint16_t seqNum; /* Sequence Number */ + } ld_ref; + uint32_t ref; /* shorthand reference to full 32-bits */ +} DR_LD_REF; /* 4 bytes */ + +/* + * defines the logical drive list structure + */ +typedef struct _DR_LD_LIST { + uint32_t ldCount; /* number of LDs */ + uint32_t reserved; /* pad to 8-byte boundary */ + struct { + DR_LD_REF ref; /* LD reference */ + uint8_t state; /* current LD state (DR_LD_STATE) */ + uint8_t reserved[3]; /* pad to 8-byte boundary */ + uint64_t size; /* LD size */ + } ldList[MAX_LOGICAL_DRIVES]; +} DR_LD_LIST; + +struct drsas_drv_ver { + uint8_t signature[12]; + uint8_t os_name[16]; + uint8_t os_ver[12]; + uint8_t drv_name[20]; + uint8_t drv_ver[32]; + uint8_t drv_rel_date[20]; +}; + +#define PCI_TYPE0_ADDRESSES 6 +#define PCI_TYPE1_ADDRESSES 2 +#define PCI_TYPE2_ADDRESSES 5 + +struct drsas_pci_common_header { + uint16_t vendorID; /* (ro) */ + uint16_t deviceID; /* (ro) */ + uint16_t command; /* Device control */ + uint16_t status; + uint8_t revisionID; /* (ro) */ + uint8_t progIf; /* (ro) */ + uint8_t subClass; /* (ro) */ + uint8_t baseClass; /* (ro) */ + uint8_t cacheLineSize; /* (ro+) */ + uint8_t latencyTimer; /* (ro+) */ + uint8_t headerType; /* (ro) */ + uint8_t bist; /* Built in self test */ + + union { + struct { + uint32_t baseAddresses[PCI_TYPE0_ADDRESSES]; + uint32_t cis; + uint16_t subVendorID; + uint16_t subSystemID; + uint32_t romBaseAddress; + uint8_t capabilitiesPtr; + uint8_t reserved1[3]; + uint32_t reserved2; + uint8_t interruptLine; + uint8_t interruptPin; /* (ro) */ + uint8_t minimumGrant; /* (ro) */ + uint8_t maximumLatency; /* (ro) */ + } type_0; + + struct { + uint32_t baseAddresses[PCI_TYPE1_ADDRESSES]; + uint8_t primaryBus; + uint8_t secondaryBus; + uint8_t subordinateBus; + uint8_t secondaryLatency; + uint8_t ioBase; + uint8_t ioLimit; + uint16_t secondaryStatus; + uint16_t memoryBase; + uint16_t memoryLimit; + uint16_t prefetchBase; + uint16_t prefetchLimit; + uint32_t prefetchBaseUpper32; + uint32_t prefetchLimitUpper32; + uint16_t ioBaseUpper16; + uint16_t ioLimitUpper16; + uint8_t capabilitiesPtr; + uint8_t reserved1[3]; + uint32_t romBaseAddress; + uint8_t interruptLine; + uint8_t interruptPin; + uint16_t bridgeControl; + } type_1; + + struct { + uint32_t socketRegistersBaseAddress; + uint8_t capabilitiesPtr; + uint8_t reserved; + uint16_t secondaryStatus; + uint8_t 
primaryBus; + uint8_t secondaryBus; + uint8_t subordinateBus; + uint8_t secondaryLatency; + struct { + uint32_t base; + uint32_t limit; + } range[PCI_TYPE2_ADDRESSES-1]; + uint8_t interruptLine; + uint8_t interruptPin; + uint16_t bridgeControl; + } type_2; + } header; +}; + +struct drsas_pci_link_capability { + union { + struct { + uint32_t linkSpeed :4; + uint32_t linkWidth :6; + uint32_t aspmSupport :2; + uint32_t losExitLatency :3; + uint32_t l1ExitLatency :3; + uint32_t rsvdp :6; + uint32_t portNumber :8; + } bits; + + uint32_t asUlong; + } cap; + +}; + +struct drsas_pci_link_status_capability { + union { + struct { + uint16_t linkSpeed :4; + uint16_t negotiatedLinkWidth :6; + uint16_t linkTrainingError :1; + uint16_t linkTraning :1; + uint16_t slotClockConfig :1; + uint16_t rsvdZ :3; + } bits; + + uint16_t asUshort; + } stat_cap; + + uint16_t reserved; + +}; + +struct drsas_pci_capabilities { + struct drsas_pci_link_capability linkCapability; + struct drsas_pci_link_status_capability linkStatusCapability; +}; + +struct drsas_pci_information +{ + uint32_t busNumber; + uint8_t deviceNumber; + uint8_t functionNumber; + uint8_t interruptVector; + uint8_t reserved; + struct drsas_pci_common_header pciHeaderInfo; + struct drsas_pci_capabilities capability; + uint8_t reserved2[32]; +}; + +struct drsas_ioctl { + uint16_t version; + uint16_t controller_id; + uint8_t signature[8]; + uint32_t reserved_1; + uint32_t control_code; + uint32_t reserved_2[2]; + uint8_t frame[64]; + union drsas_sgl_frame sgl_frame; + uint8_t sense_buff[DRSAS_MAX_SENSE_LENGTH]; + uint8_t data[1]; +}; + +struct drsas_aen { + uint16_t host_no; + uint16_t cmd_status; + uint32_t seq_num; + uint32_t class_locale_word; +}; +#pragma pack() + +#ifndef DDI_VENDOR_LSI +#define DDI_VENDOR_LSI "LSI" +#endif /* DDI_VENDOR_LSI */ + +static int drsas_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int drsas_attach(dev_info_t *, ddi_attach_cmd_t); +static int drsas_reset(dev_info_t *, ddi_reset_cmd_t); +static int drsas_detach(dev_info_t *, ddi_detach_cmd_t); +static int drsas_open(dev_t *, int, int, cred_t *); +static int drsas_close(dev_t, int, int, cred_t *); +static int drsas_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); + +static int drsas_tran_tgt_init(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static struct scsi_pkt *drsas_tran_init_pkt(struct scsi_address *, register + struct scsi_pkt *, struct buf *, int, int, int, int, + int (*)(), caddr_t); +static int drsas_tran_start(struct scsi_address *, + register struct scsi_pkt *); +static int drsas_tran_abort(struct scsi_address *, struct scsi_pkt *); +static int drsas_tran_reset(struct scsi_address *, int); +static int drsas_tran_getcap(struct scsi_address *, char *, int); +static int drsas_tran_setcap(struct scsi_address *, char *, int, int); +static void drsas_tran_destroy_pkt(struct scsi_address *, + struct scsi_pkt *); +static void drsas_tran_dmafree(struct scsi_address *, struct scsi_pkt *); +static void drsas_tran_sync_pkt(struct scsi_address *, struct scsi_pkt *); +static uint_t drsas_isr(); +static uint_t drsas_softintr(); + +static int init_mfi(struct drsas_instance *); +static int drsas_free_dma_obj(struct drsas_instance *, dma_obj_t); +static int drsas_alloc_dma_obj(struct drsas_instance *, dma_obj_t *, + uchar_t); +static struct drsas_cmd *get_mfi_pkt(struct drsas_instance *); +static void return_mfi_pkt(struct drsas_instance *, + struct drsas_cmd *); + +static void free_space_for_mfi(struct drsas_instance *); +static 
void free_additional_dma_buffer(struct drsas_instance *); +static int alloc_additional_dma_buffer(struct drsas_instance *); +static int read_fw_status_reg_ppc(struct drsas_instance *); +static void issue_cmd_ppc(struct drsas_cmd *, struct drsas_instance *); +static int issue_cmd_in_poll_mode_ppc(struct drsas_instance *, + struct drsas_cmd *); +static int issue_cmd_in_sync_mode_ppc(struct drsas_instance *, + struct drsas_cmd *); +static void enable_intr_ppc(struct drsas_instance *); +static void disable_intr_ppc(struct drsas_instance *); +static int intr_ack_ppc(struct drsas_instance *); +static int mfi_state_transition_to_ready(struct drsas_instance *); +static void destroy_mfi_frame_pool(struct drsas_instance *); +static int create_mfi_frame_pool(struct drsas_instance *); +static int drsas_dma_alloc(struct drsas_instance *, struct scsi_pkt *, + struct buf *, int, int (*)()); +static int drsas_dma_move(struct drsas_instance *, + struct scsi_pkt *, struct buf *); +static void flush_cache(struct drsas_instance *instance); +static void display_scsi_inquiry(caddr_t); +static int start_mfi_aen(struct drsas_instance *instance); +static int handle_drv_ioctl(struct drsas_instance *instance, + struct drsas_ioctl *ioctl, int mode); +static int handle_mfi_ioctl(struct drsas_instance *instance, + struct drsas_ioctl *ioctl, int mode); +static int handle_mfi_aen(struct drsas_instance *instance, + struct drsas_aen *aen); +static void fill_up_drv_ver(struct drsas_drv_ver *dv); +static struct drsas_cmd *build_cmd(struct drsas_instance *instance, + struct scsi_address *ap, struct scsi_pkt *pkt, + uchar_t *cmd_done); +static int register_mfi_aen(struct drsas_instance *instance, + uint32_t seq_num, uint32_t class_locale_word); +static int issue_mfi_pthru(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_dcmd(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_smp(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_stp(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int abort_aen_cmd(struct drsas_instance *instance, + struct drsas_cmd *cmd_to_abort); + +static int drsas_common_check(struct drsas_instance *instance, + struct drsas_cmd *cmd); +static void drsas_fm_init(struct drsas_instance *instance); +static void drsas_fm_fini(struct drsas_instance *instance); +static int drsas_fm_error_cb(dev_info_t *, ddi_fm_error_t *, + const void *); +static void drsas_fm_ereport(struct drsas_instance *instance, + char *detail); +static int drsas_check_dma_handle(ddi_dma_handle_t handle); +static int drsas_check_acc_handle(ddi_acc_handle_t handle); + +static void drsas_rem_intrs(struct drsas_instance *instance); +static int drsas_add_intrs(struct drsas_instance *instance, int intr_type); + +static void drsas_tran_tgt_free(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static int drsas_tran_bus_config(dev_info_t *, uint_t, + ddi_bus_config_op_t, void *, dev_info_t **); +static int drsas_parse_devname(char *, int *, int *); +static int drsas_config_all_devices(struct drsas_instance *); +static int drsas_config_scsi_device(struct drsas_instance *, + struct scsi_device *, dev_info_t **); +static int drsas_config_ld(struct drsas_instance *, uint16_t, + uint8_t, dev_info_t **); +static dev_info_t *drsas_find_child(struct drsas_instance *, uint16_t, + 
uint8_t); +static int drsas_name_node(dev_info_t *, char *, int); +static void drsas_issue_evt_taskq(struct drsas_eventinfo *); +static int drsas_service_evt(struct drsas_instance *, int, int, int, + uint64_t); +static int drsas_mode_sense_build(struct scsi_pkt *); + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_SAS_H_ */ diff --git a/usr/src/uts/common/io/dr_sas/dr_sas_list.h b/usr/src/uts/common/io/dr_sas/dr_sas_list.h new file mode 100644 index 0000000000..4154a77796 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas_list.h @@ -0,0 +1,212 @@ +/* + * dr_sas_list.h: header for dr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_SAS_LIST_H_ +#define _DR_SAS_LIST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct mlist_head { + struct mlist_head *next, *prev; +}; + +typedef struct mlist_head mlist_t; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct mlist_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} + + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! 
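+ * For example, mlist_add() below calls __list_add(new, head, head->next) + * to link a new entry in immediately after the list head.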
+ */ +static void __list_add(struct mlist_head *new, + struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + + +/* + * mlist_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static void mlist_add(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head, head->next); +} + + +/* + * mlist_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static void mlist_add_tail(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head->prev, head); +} + + + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static void __list_del(struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = prev; + prev->next = next; +} + + +/* + * mlist_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static void mlist_del_init(struct mlist_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + + +/* + * mlist_empty - tests whether a list is empty + * @head: the list to test. + */ +static int mlist_empty(struct mlist_head *head) +{ + return (head->next == head); +} + + +/* + * mlist_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static void mlist_splice(struct mlist_head *list, struct mlist_head *head) +{ + struct mlist_head *first = list->next; + + if (first != list) { + struct mlist_head *last = list->prev; + struct mlist_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; + } +} + + +/* + * mlist_entry - get the struct for this entry + * @ptr: the &struct mlist_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define mlist_entry(ptr, type, member) \ + ((type *)((size_t)(ptr) - offsetof(type, member))) + + +/* + * mlist_for_each - iterate over a list + * @pos: the &struct mlist_head to use as a loop counter. + * @head: the head for your list. + */ +#define mlist_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + + +/* + * mlist_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct mlist_head to use as a loop counter. + * @n: another &struct mlist_head to use as temporary storage + * @head: the head for your list. 
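+ * + * Because @n caches pos->next before the loop body runs, the body may + * safely remove @pos, e.g.: + * mlist_for_each_safe(pos, n, &head) { + * mlist_del_init(pos); + * } + * Plain mlist_for_each() offers no such protection.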
+ */ +#define mlist_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_SAS_LIST_H_ */ diff --git a/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c b/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c index da00160b68..2efb178ff1 100644 --- a/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c +++ b/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c @@ -532,8 +532,7 @@ oce_drain_rq_cq(void *arg) if (dev->function_mode & FLEX10_MODE) { if (cqe->u0.s.vlan_tag_present && cqe->u0.s.qnq) { - oce_rx_insert_tag(mp, - cqe->u0.s.vlan_tag); + oce_rx_insert_tag(mp, cqe->u0.s.vlan_tag); } } else if (cqe->u0.s.vlan_tag_present) { oce_rx_insert_tag(mp, cqe->u0.s.vlan_tag); diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index 4e1979cf54..61a5353365 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* @@ -2989,6 +2990,9 @@ mac_prop_check_size(mac_prop_id_t id, uint_t valsize, boolean_t is_range) case MAC_PROP_WL_MLME: minsize = sizeof (wl_mlme_t); break; + case MAC_PROP_VN_PROMISC_FILTERED: + minsize = sizeof (boolean_t); + break; } return (valsize >= minsize); diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index dc1132941b..dc1e40b424 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* @@ -3170,7 +3171,8 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, } if ((mcip->mci_state_flags & MCIS_IS_VNIC) && - type == MAC_CLIENT_PROMISC_ALL) { + type == MAC_CLIENT_PROMISC_ALL && + (mcip->mci_protect_flags & MPT_FLAG_PROMISC_FILTERED)) { /* * The function is being invoked by the upper MAC client * of a VNIC. The VNIC should only see the traffic @@ -4032,16 +4034,15 @@ mac_info_get(const char *name, mac_info_t *minfop) /* * To get the capabilities that MAC layer cares about, such as rings, factory * mac address, vnic or not, it should directly invoke this function. If the - * link is part of a bridge, then the only "capability" it has is the inability - * to do zero copy. + * link is part of a bridge, then the link is unable to do zero copy. 
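+ * (For bridged links MAC_CAPAB_NO_ZCOPY is asserted below before the + * driver's getcapab entry point is consulted.)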
*/ boolean_t i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) { mac_impl_t *mip = (mac_impl_t *)mh; - if (mip->mi_bridge_link != NULL) - return (cap == MAC_CAPAB_NO_ZCOPY); + if (mip->mi_bridge_link != NULL && cap == MAC_CAPAB_NO_ZCOPY) + return (B_TRUE); else if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) return (mip->mi_getcapab(mip->mi_driver, cap, cap_data)); else @@ -5411,3 +5412,23 @@ mac_client_set_rings(mac_client_handle_t mch, int rxrings, int txrings) mrp->mrp_ntxrings = txrings; } } + +boolean_t +mac_get_promisc_filtered(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + + return (mcip->mci_protect_flags & MPT_FLAG_PROMISC_FILTERED); +} + +void +mac_set_promisc_filtered(mac_client_handle_t mch, boolean_t enable) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + if (enable) + mcip->mci_protect_flags |= MPT_FLAG_PROMISC_FILTERED; + else + mcip->mci_protect_flags &= ~MPT_FLAG_PROMISC_FILTERED; +} diff --git a/usr/src/uts/common/io/mac/mac_protect.c b/usr/src/uts/common/io/mac/mac_protect.c index 0dc825492e..4d5201a994 100644 --- a/usr/src/uts/common/io/mac/mac_protect.c +++ b/usr/src/uts/common/io/mac/mac_protect.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/strsun.h> @@ -2267,6 +2268,9 @@ mac_protect_init(mac_client_impl_t *mcip) sizeof (dhcpv6_cid_t), offsetof(dhcpv6_cid_t, dc_node)); avl_create(&mcip->mci_v6_dyn_ip, compare_dhcpv6_ip, sizeof (dhcpv6_addr_t), offsetof(dhcpv6_addr_t, da_node)); + + if (mcip->mci_state_flags & MCIS_IS_VNIC) + mcip->mci_protect_flags |= MPT_FLAG_PROMISC_FILTERED; } void diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c index 38967e5d15..06a5ac8cbf 100644 --- a/usr/src/uts/common/io/mac/mac_sched.c +++ b/usr/src/uts/common/io/mac/mac_sched.c @@ -530,12 +530,13 @@ enum pkt_type { /* * In general we do port based hashing to spread traffic over different - * softrings. The below tunable allows to override that behavior. Setting it - * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior - * is also the applicable to ipv6 packets carrying multiple optional headers + * softrings. The tunables below allow overriding that behavior. Setting one + * to B_TRUE forces a fanout based on the src ipv6/ipv4 address. This behavior + * is also applicable to ipv6 packets carrying multiple optional headers * and other uncommon packet types. */ boolean_t mac_src_ipv6_fanout = B_FALSE; +boolean_t mac_src_ipv4_fanout = B_FALSE; /* * Pair of local and remote ports in the transport header @@ -765,13 +766,14 @@ int fanout_unalligned = 0; /* * mac_rx_srs_long_fanout * - * The fanout routine for IPv6 + * The fanout routine for IPv6 (and IPv4 when VLANs are in use). 
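+ * + * For TCP the soft ring index is derived from a hash of the source address + * and the TCP port pair; all other traffic, and traffic whose headers cannot + * be examined, is fanned out on the source address alone.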
*/ static int mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx) { ip6_t *ip6h; + struct ip *ip4h; uint8_t *whereptr; uint_t hash; uint16_t remlen; @@ -839,7 +841,7 @@ mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, */ if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h, mp->b_wptr, &hdr_len, &nexthdr, NULL)) { - goto src_based_fanout; + goto ipv6_src_based_fanout; } whereptr = (uint8_t *)ip6h + hdr_len; @@ -856,7 +858,7 @@ mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, */ if (mp->b_cont != NULL && whereptr + PORTS_SIZE > mp->b_wptr) { - goto src_based_fanout; + goto ipv6_src_based_fanout; } break; default: @@ -890,7 +892,85 @@ mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, /* For all other protocol, do source based fanout */ default: - goto src_based_fanout; + goto ipv6_src_based_fanout; + } + } else if (sap == ETHERTYPE_IP) { + boolean_t modifiable = B_TRUE; + + ASSERT(MBLKL(mp) >= hdrsize); + + ip4h = (struct ip *)(mp->b_rptr + hdrsize); + + if ((unsigned char *)ip4h == mp->b_wptr) { + /* + * The first mblk_t only includes the mac header. + * Note that it is safe to change the mp pointer here, + * as the subsequent operation does not assume mp + * points to the start of the mac header. + */ + mp = mp->b_cont; + + /* + * Make sure ip4h holds the full base ip structure + * up through the destination address. It might not + * hold any of the options though. + */ + if (mp == NULL) + return (-1); + + if (MBLKL(mp) < IP_SIMPLE_HDR_LENGTH) { + modifiable = (DB_REF(mp) == 1); + + if (modifiable && + !pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) + return (-1); + } + + ip4h = (struct ip *)mp->b_rptr; + } + + if (!modifiable || !(OK_32PTR((char *)ip4h))) { + /* + * If ip4h is not aligned, fan out to the default + * ring. Note that this may cause packet reordering. + */ + *indx = 0; + *type = OTH; + fanout_unalligned++; + return (0); + } + + /* Do src based fanout if the mac_src_ipv4_fanout tunable is set. */ + if (mac_src_ipv4_fanout) + goto ipv4_src_based_fanout; + + /* If the transport is TCP, we try to do port based fanout */ + if (ip4h->ip_p == IPPROTO_TCP) { + int hdr_len; + + hdr_len = ip4h->ip_hl << 2; + /* set whereptr to point to tcphdr */ + whereptr = (uint8_t *)ip4h + hdr_len; + + /* + * If ip4h does not hold the complete ip header + * including options, or if both ports in the TCP + * header are not part of the mblk, do source-based + * fanout (the second case covers the first one, so + * we only need one test). 
+ */ + if (mp->b_cont != NULL && + whereptr + PORTS_SIZE > mp->b_wptr) + goto ipv4_src_based_fanout; + + hash = HASH_ADDR(ip4h->ip_src.s_addr, + *(uint32_t *)whereptr); + *indx = COMPUTE_INDEX(hash, + mac_srs->srs_tcp_ring_count); + *type = OTH; + } else { + /* For all other protocols, do source based fanout */ + goto ipv4_src_based_fanout; } } else { *indx = 0; @@ -898,11 +978,17 @@ mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, } return (0); -src_based_fanout: +ipv6_src_based_fanout: hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0); *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); *type = OTH; return (0); + +ipv4_src_based_fanout: + hash = HASH_ADDR(ip4h->ip_src.s_addr, (uint32_t)0); + *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); + *type = OTH; + return (0); } /* diff --git a/usr/src/uts/common/io/mr_sas/fusion.h b/usr/src/uts/common/io/mr_sas/fusion.h new file mode 100644 index 0000000000..36fb3cb11a --- /dev/null +++ b/usr/src/uts/common/io/mr_sas/fusion.h @@ -0,0 +1,561 @@ +/* + * fusion.h + * + * Solaris MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2012, LSI Logic Corporation. + * All rights reserved. + * + * Version: + * Author: + * Swaminathan K S + * Arun Chandrashekhar + * Manju R + * Rasheed + * Shakeel Bukhari + */ + + +#ifndef _FUSION_H_ +#define _FUSION_H_ + +#define U64 uint64_t +#define U32 uint32_t +#define U16 uint16_t +#define U8 uint8_t +#define S8 char +#define S16 short +#define S32 int + +/* MPI2 defines */ +#define MPI2_REPLY_POST_HOST_INDEX_OFFSET (0x6C) +#define MPI2_FUNCTION_IOC_INIT (0x02) /* IOC Init */ +#define MPI2_WHOINIT_HOST_DRIVER (0x04) +#define MPI2_VERSION_MAJOR (0x02) +#define MPI2_VERSION_MINOR (0x00) +#define MPI2_VERSION_MAJOR_MASK (0xFF00) +#define MPI2_VERSION_MAJOR_SHIFT (8) +#define MPI2_VERSION_MINOR_MASK (0x00FF) +#define MPI2_VERSION_MINOR_SHIFT (0) +#define MPI2_VERSION ((MPI2_VERSION_MAJOR << MPI2_VERSION_MAJOR_SHIFT) | \ + MPI2_VERSION_MINOR) +#define MPI2_HEADER_VERSION_UNIT (0x10) +#define MPI2_HEADER_VERSION_DEV (0x00) +#define MPI2_HEADER_VERSION_UNIT_MASK (0xFF00) +#define MPI2_HEADER_VERSION_UNIT_SHIFT (8) +#define MPI2_HEADER_VERSION_DEV_MASK (0x00FF) +#define MPI2_HEADER_VERSION_DEV_SHIFT (0) +#define MPI2_HEADER_VERSION ((MPI2_HEADER_VERSION_UNIT \ + << 8) | \ + MPI2_HEADER_VERSION_DEV) +#define MPI2_IEEE_SGE_FLAGS_IOCPLBNTA_ADDR (0x03) +#define MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG (0x8000) +#define MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG (0x0400) +#define MPI2_SCSIIO_EEDPFLAGS_CHECK_REMOVE_OP (0x0003) +#define MPI2_SCSIIO_EEDPFLAGS_CHECK_APPTAG (0x0200) +#define MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD (0x0100) +#define MPI2_SCSIIO_EEDPFLAGS_INSERT_OP (0x0004) +#define MPI2_FUNCTION_SCSI_IO_REQUEST (0x00) /* SCSI IO */ +#define MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY (0x06) +#define MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO (0x00) +#define MPI2_SGE_FLAGS_64_BIT_ADDRESSING (0x02) +#define MPI2_SCSIIO_CONTROL_WRITE (0x01000000) +#define MPI2_SCSIIO_CONTROL_READ (0x02000000) +#define MPI2_REQ_DESCRIPT_FLAGS_TYPE_MASK (0x0E) +#define MPI2_RPY_DESCRIPT_FLAGS_UNUSED (0x0F) +#define MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS (0x00) +#define MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK (0x0F) +#define MPI2_WRSEQ_FLUSH_KEY_VALUE (0x0) +#define MPI2_WRITE_SEQUENCE_OFFSET (0x00000004) +#define MPI2_WRSEQ_1ST_KEY_VALUE (0xF) +#define MPI2_WRSEQ_2ND_KEY_VALUE (0x4) +#define MPI2_WRSEQ_3RD_KEY_VALUE (0xB) +#define MPI2_WRSEQ_4TH_KEY_VALUE (0x2) +#define MPI2_WRSEQ_5TH_KEY_VALUE (0x7) +#define 
MPI2_WRSEQ_6TH_KEY_VALUE (0xD) + +/* Invader defines */ +#define MPI2_TYPE_CUDA 0x2 +#define MPI25_SAS_DEVICE0_FLAGS_ENABLED_FAST_PATH 0x4000 +#define MR_RL_FLAGS_GRANT_DESTINATION_CPU0 0x00 +#define MR_RL_FLAGS_GRANT_DESTINATION_CPU1 0x10 +#define MR_RL_FLAGS_GRANT_DESTINATION_CUDA 0x80 +#define MR_RL_FLAGS_SEQ_NUM_ENABLE 0x8 +#define MPI2_NSEG_FLAGS_SHIFT 4 + + +#define MR_PD_INVALID 0xFFFF +#define MAX_SPAN_DEPTH 8 +#define MAX_RAIDMAP_SPAN_DEPTH (MAX_SPAN_DEPTH) +#define MAX_ROW_SIZE 32 +#define MAX_RAIDMAP_ROW_SIZE (MAX_ROW_SIZE) +#define MAX_LOGICAL_DRIVES 64 +#define MAX_RAIDMAP_LOGICAL_DRIVES (MAX_LOGICAL_DRIVES) +#define MAX_RAIDMAP_VIEWS (MAX_LOGICAL_DRIVES) +#define MAX_ARRAYS 128 +#define MAX_RAIDMAP_ARRAYS (MAX_ARRAYS) +#define MAX_PHYSICAL_DEVICES 256 +#define MAX_RAIDMAP_PHYSICAL_DEVICES (MAX_PHYSICAL_DEVICES) + +/* get the mapping information of LD */ +#define MR_DCMD_LD_MAP_GET_INFO 0x0300e101 + +#ifndef MPI2_POINTER +#define MPI2_POINTER * +#endif + +#pragma pack(1) + +typedef struct _MPI25_IEEE_SGE_CHAIN64 +{ + U64 Address; + U32 Length; + U16 Reserved1; + U8 NextChainOffset; + U8 Flags; +} MPI25_IEEE_SGE_CHAIN64, MPI2_POINTER PTR_MPI25_IEEE_SGE_CHAIN64, + Mpi25IeeeSgeChain64_t, MPI2_POINTER pMpi25IeeeSgeChain64_t; + +typedef struct _MPI2_SGE_SIMPLE_UNION +{ + U32 FlagsLength; + union + { + U32 Address32; + U64 Address64; + } u1; +} MPI2_SGE_SIMPLE_UNION, MPI2_POINTER PTR_MPI2_SGE_SIMPLE_UNION, + Mpi2SGESimpleUnion_t, MPI2_POINTER pMpi2SGESimpleUnion_t; + +typedef struct +{ + U8 CDB[20]; /* 0x00 */ + U32 PrimaryReferenceTag; /* 0x14 */ + U16 PrimaryApplicationTag; /* 0x18 */ + U16 PrimaryApplicationTagMask; /* 0x1A */ + U32 TransferLength; /* 0x1C */ +} MPI2_SCSI_IO_CDB_EEDP32, MPI2_POINTER PTR_MPI2_SCSI_IO_CDB_EEDP32, + Mpi2ScsiIoCdbEedp32_t, MPI2_POINTER pMpi2ScsiIoCdbEedp32_t; + +typedef struct _MPI2_SGE_CHAIN_UNION +{ + U16 Length; + U8 NextChainOffset; + U8 Flags; + union + { + U32 Address32; + U64 Address64; + } u1; +} MPI2_SGE_CHAIN_UNION, MPI2_POINTER PTR_MPI2_SGE_CHAIN_UNION, + Mpi2SGEChainUnion_t, MPI2_POINTER pMpi2SGEChainUnion_t; + +typedef struct _MPI2_IEEE_SGE_SIMPLE32 +{ + U32 Address; + U32 FlagsLength; +} MPI2_IEEE_SGE_SIMPLE32, MPI2_POINTER PTR_MPI2_IEEE_SGE_SIMPLE32, + Mpi2IeeeSgeSimple32_t, MPI2_POINTER pMpi2IeeeSgeSimple32_t; + +typedef struct _MPI2_IEEE_SGE_SIMPLE64 +{ + U64 Address; + U32 Length; + U16 Reserved1; + U8 Reserved2; + U8 Flags; +} MPI2_IEEE_SGE_SIMPLE64, MPI2_POINTER PTR_MPI2_IEEE_SGE_SIMPLE64, + Mpi2IeeeSgeSimple64_t, MPI2_POINTER pMpi2IeeeSgeSimple64_t; + +typedef union _MPI2_IEEE_SGE_SIMPLE_UNION +{ + MPI2_IEEE_SGE_SIMPLE32 Simple32; + MPI2_IEEE_SGE_SIMPLE64 Simple64; +} MPI2_IEEE_SGE_SIMPLE_UNION, MPI2_POINTER PTR_MPI2_IEEE_SGE_SIMPLE_UNION, + Mpi2IeeeSgeSimpleUnion_t, MPI2_POINTER pMpi2IeeeSgeSimpleUnion_t; + +typedef MPI2_IEEE_SGE_SIMPLE32 MPI2_IEEE_SGE_CHAIN32; +typedef MPI2_IEEE_SGE_SIMPLE64 MPI2_IEEE_SGE_CHAIN64; + +typedef union _MPI2_IEEE_SGE_CHAIN_UNION +{ + MPI2_IEEE_SGE_CHAIN32 Chain32; + MPI2_IEEE_SGE_CHAIN64 Chain64; +} MPI2_IEEE_SGE_CHAIN_UNION, MPI2_POINTER PTR_MPI2_IEEE_SGE_CHAIN_UNION, + Mpi2IeeeSgeChainUnion_t, MPI2_POINTER pMpi2IeeeSgeChainUnion_t; + +typedef union _MPI2_SGE_IO_UNION +{ + MPI2_SGE_SIMPLE_UNION MpiSimple; + MPI2_SGE_CHAIN_UNION MpiChain; + MPI2_IEEE_SGE_SIMPLE_UNION IeeeSimple; + MPI2_IEEE_SGE_CHAIN_UNION IeeeChain; +} MPI2_SGE_IO_UNION, MPI2_POINTER PTR_MPI2_SGE_IO_UNION, + Mpi2SGEIOUnion_t, MPI2_POINTER pMpi2SGEIOUnion_t; + +typedef union +{ + U8 CDB32[32]; + MPI2_SCSI_IO_CDB_EEDP32 EEDP32; + 
MPI2_SGE_SIMPLE_UNION SGE; +} MPI2_SCSI_IO_CDB_UNION, MPI2_POINTER PTR_MPI2_SCSI_IO_CDB_UNION, + Mpi2ScsiIoCdb_t, MPI2_POINTER pMpi2ScsiIoCdb_t; + +/* Default Request Descriptor */ +typedef struct _MPI2_DEFAULT_REQUEST_DESCRIPTOR +{ + U8 RequestFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U16 LMID; /* 0x04 */ + U16 DescriptorTypeDependent; /* 0x06 */ +} MPI2_DEFAULT_REQUEST_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_DEFAULT_REQUEST_DESCRIPTOR, + Mpi2DefaultRequestDescriptor_t, + MPI2_POINTER pMpi2DefaultRequestDescriptor_t; + +/* High Priority Request Descriptor */ +typedef struct _MPI2_HIGH_PRIORITY_REQUEST_DESCRIPTOR +{ + U8 RequestFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U16 LMID; /* 0x04 */ + U16 Reserved1; /* 0x06 */ +} MPI2_HIGH_PRIORITY_REQUEST_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_HIGH_PRIORITY_REQUEST_DESCRIPTOR, + Mpi2HighPriorityRequestDescriptor_t, + MPI2_POINTER pMpi2HighPriorityRequestDescriptor_t; + +/* SCSI IO Request Descriptor */ +typedef struct _MPI2_SCSI_IO_REQUEST_DESCRIPTOR +{ + U8 RequestFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U16 LMID; /* 0x04 */ + U16 DevHandle; /* 0x06 */ +} MPI2_SCSI_IO_REQUEST_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_SCSI_IO_REQUEST_DESCRIPTOR, + Mpi2SCSIIORequestDescriptor_t, + MPI2_POINTER pMpi2SCSIIORequestDescriptor_t; + +/* SCSI Target Request Descriptor */ +typedef struct _MPI2_SCSI_TARGET_REQUEST_DESCRIPTOR +{ + U8 RequestFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U16 LMID; /* 0x04 */ + U16 IoIndex; /* 0x06 */ +} MPI2_SCSI_TARGET_REQUEST_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_SCSI_TARGET_REQUEST_DESCRIPTOR, + Mpi2SCSITargetRequestDescriptor_t, + MPI2_POINTER pMpi2SCSITargetRequestDescriptor_t; + +/* RAID Accelerator Request Descriptor */ +typedef struct _MPI2_RAID_ACCEL_REQUEST_DESCRIPTOR +{ + U8 RequestFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U16 LMID; /* 0x04 */ + U16 Reserved; /* 0x06 */ +} MPI2_RAID_ACCEL_REQUEST_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_RAID_ACCEL_REQUEST_DESCRIPTOR, + Mpi2RAIDAcceleratorRequestDescriptor_t, + MPI2_POINTER pMpi2RAIDAcceleratorRequestDescriptor_t; + +/* Default Reply Descriptor */ +typedef struct _MPI2_DEFAULT_REPLY_DESCRIPTOR +{ + U8 ReplyFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 DescriptorTypeDependent1; /* 0x02 */ + U32 DescriptorTypeDependent2; /* 0x04 */ +} MPI2_DEFAULT_REPLY_DESCRIPTOR, MPI2_POINTER PTR_MPI2_DEFAULT_REPLY_DESCRIPTOR, + Mpi2DefaultReplyDescriptor_t, MPI2_POINTER pMpi2DefaultReplyDescriptor_t; + +/* Address Reply Descriptor */ +typedef struct _MPI2_ADDRESS_REPLY_DESCRIPTOR +{ + U8 ReplyFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U32 ReplyFrameAddress; /* 0x04 */ +} MPI2_ADDRESS_REPLY_DESCRIPTOR, MPI2_POINTER PTR_MPI2_ADDRESS_REPLY_DESCRIPTOR, + Mpi2AddressReplyDescriptor_t, MPI2_POINTER pMpi2AddressReplyDescriptor_t; + +/* SCSI IO Success Reply Descriptor */ +typedef struct _MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR +{ + U8 ReplyFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U16 TaskTag; /* 0x04 */ + U16 Reserved1; /* 0x06 */ +} MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR, + Mpi2SCSIIOSuccessReplyDescriptor_t, + MPI2_POINTER pMpi2SCSIIOSuccessReplyDescriptor_t; + +/* TargetAssist Success Reply Descriptor */ +typedef struct _MPI2_TARGETASSIST_SUCCESS_REPLY_DESCRIPTOR +{ + U8 ReplyFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U8 
SequenceNumber; /* 0x04 */ + U8 Reserved1; /* 0x05 */ + U16 IoIndex; /* 0x06 */ +} MPI2_TARGETASSIST_SUCCESS_REPLY_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_TARGETASSIST_SUCCESS_REPLY_DESCRIPTOR, + Mpi2TargetAssistSuccessReplyDescriptor_t, + MPI2_POINTER pMpi2TargetAssistSuccessReplyDescriptor_t; + +/* Target Command Buffer Reply Descriptor */ +typedef struct _MPI2_TARGET_COMMAND_BUFFER_REPLY_DESCRIPTOR +{ + U8 ReplyFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U8 VP_ID; /* 0x02 */ + U8 Flags; /* 0x03 */ + U16 InitiatorDevHandle; /* 0x04 */ + U16 IoIndex; /* 0x06 */ +} MPI2_TARGET_COMMAND_BUFFER_REPLY_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_TARGET_COMMAND_BUFFER_REPLY_DESCRIPTOR, + Mpi2TargetCommandBufferReplyDescriptor_t, + MPI2_POINTER pMpi2TargetCommandBufferReplyDescriptor_t; + +/* RAID Accelerator Success Reply Descriptor */ +typedef struct _MPI2_RAID_ACCELERATOR_SUCCESS_REPLY_DESCRIPTOR +{ + U8 ReplyFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U32 Reserved; /* 0x04 */ +} MPI2_RAID_ACCELERATOR_SUCCESS_REPLY_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_RAID_ACCELERATOR_SUCCESS_REPLY_DESCRIPTOR, + Mpi2RAIDAcceleratorSuccessReplyDescriptor_t, + MPI2_POINTER pMpi2RAIDAcceleratorSuccessReplyDescriptor_t; + +/* union of Reply Descriptors */ +typedef union _MPI2_REPLY_DESCRIPTORS_UNION +{ + MPI2_DEFAULT_REPLY_DESCRIPTOR Default; + MPI2_ADDRESS_REPLY_DESCRIPTOR AddressReply; + MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR SCSIIOSuccess; + MPI2_TARGETASSIST_SUCCESS_REPLY_DESCRIPTOR TargetAssistSuccess; + MPI2_TARGET_COMMAND_BUFFER_REPLY_DESCRIPTOR TargetCommandBuffer; + MPI2_RAID_ACCELERATOR_SUCCESS_REPLY_DESCRIPTOR RAIDAcceleratorSuccess; + U64 Words; +} MPI2_REPLY_DESCRIPTORS_UNION, MPI2_POINTER PTR_MPI2_REPLY_DESCRIPTORS_UNION, + Mpi2ReplyDescriptorsUnion_t, MPI2_POINTER pMpi2ReplyDescriptorsUnion_t; + +/* IOCInit Request message */ +typedef struct _MPI2_IOC_INIT_REQUEST +{ + U8 WhoInit; /* 0x00 */ + U8 Reserved1; /* 0x01 */ + U8 ChainOffset; /* 0x02 */ + U8 Function; /* 0x03 */ + U16 Reserved2; /* 0x04 */ + U8 Reserved3; /* 0x06 */ + U8 MsgFlags; /* 0x07 */ + U8 VP_ID; /* 0x08 */ + U8 VF_ID; /* 0x09 */ + U16 Reserved4; /* 0x0A */ + U16 MsgVersion; /* 0x0C */ + U16 HeaderVersion; /* 0x0E */ + U32 Reserved5; /* 0x10 */ + U16 Reserved6; /* 0x14 */ + U8 Reserved7; /* 0x16 */ + U8 HostMSIxVectors; /* 0x17 */ + U16 Reserved8; /* 0x18 */ + U16 SystemRequestFrameSize; /* 0x1A */ + U16 ReplyDescriptorPostQueueDepth; /* 0x1C */ + U16 ReplyFreeQueueDepth; /* 0x1E */ + U32 SenseBufferAddressHigh; /* 0x20 */ + U32 SystemReplyAddressHigh; /* 0x24 */ + U64 SystemRequestFrameBaseAddress; /* 0x28 */ + U64 ReplyDescriptorPostQueueAddress; /* 0x30 */ + U64 ReplyFreeQueueAddress; /* 0x38 */ + U64 TimeStamp; /* 0x40 */ +} MPI2_IOC_INIT_REQUEST, MPI2_POINTER PTR_MPI2_IOC_INIT_REQUEST, + Mpi2IOCInitRequest_t, MPI2_POINTER pMpi2IOCInitRequest_t; + + +typedef struct _MR_DEV_HANDLE_INFO { + + /* Send bitmap of LDs that are idle with respect to FP */ + U16 curDevHdl; + + /* bitmap of valid device handles. */ + U8 validHandles; + U8 reserved; + /* 0x04 dev handles for all the paths. 
*/ + U16 devHandle[2]; +} MR_DEV_HANDLE_INFO; /* 0x08, Total Size */ + +typedef struct _MR_ARRAY_INFO { + U16 pd[MAX_RAIDMAP_ROW_SIZE]; +} MR_ARRAY_INFO; /* 0x40, Total Size */ + +typedef struct _MR_QUAD_ELEMENT { + U64 logStart; /* 0x00 */ + U64 logEnd; /* 0x08 */ + U64 offsetInSpan; /* 0x10 */ + U32 diff; /* 0x18 */ + U32 reserved1; /* 0x1C */ +} MR_QUAD_ELEMENT; /* 0x20, Total size */ + +typedef struct _MR_SPAN_INFO { + U32 noElements; /* 0x00 */ + U32 reserved1; /* 0x04 */ + MR_QUAD_ELEMENT quads[MAX_RAIDMAP_SPAN_DEPTH]; /* 0x08 */ +} MR_SPAN_INFO; /* 0x108, Total size */ + +typedef struct _MR_LD_SPAN_ { /* SPAN structure */ + /* 0x00, starting block number in array */ + U64 startBlk; + + /* 0x08, number of blocks */ + U64 numBlks; + + /* 0x10, array reference */ + U16 arrayRef; + + U8 reserved[6]; /* 0x12 */ +} MR_LD_SPAN; /* 0x18, Total Size */ + +typedef struct _MR_SPAN_BLOCK_INFO { + /* number of rows/span */ + U64 num_rows; + + MR_LD_SPAN span; /* 0x08 */ + MR_SPAN_INFO block_span_info; /* 0x20 */ +} MR_SPAN_BLOCK_INFO; /* 0x128, Total Size */ + +typedef struct _MR_LD_RAID { + struct { + U32 fpCapable :1; + U32 reserved5 :3; + U32 ldPiMode :4; + U32 pdPiMode :4; + + /* FDE or controller encryption (MR_LD_ENCRYPTION_TYPE) */ + U32 encryptionType :8; + + U32 fpWriteCapable :1; + U32 fpReadCapable :1; + U32 fpWriteAcrossStripe:1; + U32 fpReadAcrossStripe:1; + U32 reserved4 :8; + } capability; /* 0x00 */ + U32 reserved6; + U64 size; /* 0x08, LD size in blocks */ + U8 spanDepth; /* 0x10, Total Number of Spans */ + U8 level; /* 0x11, RAID level */ + /* 0x12, shift-count to get stripe size (0=512, 1=1K, 7=64K, etc.) */ + U8 stripeShift; + U8 rowSize; /* 0x13, number of disks in a row */ + /* 0x14, number of data disks in a row */ + U8 rowDataSize; + U8 writeMode; /* 0x15, WRITE_THROUGH or WRITE_BACK */ + + /* 0x16, To differentiate between RAID1 and RAID1E */ + U8 PRL; + + U8 SRL; /* 0x17 */ + U16 targetId; /* 0x18, ld Target Id. */ + + /* 0x1a, state of ld, state corresponds to MR_LD_STATE */ + U8 ldState; + + /* 0x1b, Pre calculate region type requests based on MFC etc.. */ + U8 regTypeReqOnWrite; + + U8 modFactor; /* 0x1c, same as rowSize */ + /* + * 0x1d, region lock type used for read, valid only if + * regTypeOnReadIsValid=1 + */ + U8 regTypeReqOnRead; + U16 seqNum; /* 0x1e, LD sequence number */ + + struct { + /* This LD requires sync command before completing */ + U32 ldSyncRequired:1; + U32 reserved:31; + } flags; /* 0x20 */ + + U8 reserved3[0x5C]; /* 0x24 */ +} MR_LD_RAID; /* 0x80, Total Size */ + +typedef struct _MR_LD_SPAN_MAP { + MR_LD_RAID ldRaid; /* 0x00 */ + + /* 0x80, needed for GET_ARM() - R0/1/5 only. */ + U8 dataArmMap[MAX_RAIDMAP_ROW_SIZE]; + + MR_SPAN_BLOCK_INFO spanBlock[MAX_RAIDMAP_SPAN_DEPTH]; /* 0xA0 */ +} MR_LD_SPAN_MAP; /* 0x9E0 */ + +typedef struct _MR_FW_RAID_MAP { + /* total size of this structure, including this field */ + U32 totalSize; + union { + /* Simple method of version checking variables */ + struct { + U32 maxLd; + U32 maxSpanDepth; + U32 maxRowSize; + U32 maxPdCount; + U32 maxArrays; + } validationInfo; + U32 version[5]; + U32 reserved1[5]; + } u1; + + U32 ldCount; /* count of lds */ + U32 Reserved1; + + /* + * 0x20 This doesn't correspond to + * FW Ld Tgt Id to LD, but will purge. For example: if tgt Id is 4 + * and FW LD is 2, and there is only one LD, FW will populate the + * array like this. [0xFF, 0xFF, 0xFF, 0xFF, 0x0.....]. This is to + * help reduce the entire strcture size if there are few LDs or + * driver is looking info for 1 LD only. 
+ */ + U8 ldTgtIdToLd[MAX_RAIDMAP_LOGICAL_DRIVES+ \ + MAX_RAIDMAP_VIEWS]; /* 0x20 */ + /* timeout value used by driver in FP IOs */ + U8 fpPdIoTimeoutSec; + U8 reserved2[7]; + MR_ARRAY_INFO arMapInfo[MAX_RAIDMAP_ARRAYS]; /* 0x00a8 */ + MR_DEV_HANDLE_INFO devHndlInfo[MAX_RAIDMAP_PHYSICAL_DEVICES]; + + /* 0x28a8-[0 -MAX_RAIDMAP_LOGICAL_DRIVES+MAX_RAIDMAP_VIEWS+1]; */ + MR_LD_SPAN_MAP ldSpanMap[1]; +}MR_FW_RAID_MAP; /* 0x3288, Total Size */ + +typedef struct _LD_TARGET_SYNC { + U8 ldTargetId; + U8 reserved; + U16 seqNum; +} LD_TARGET_SYNC; + +#pragma pack() + +struct IO_REQUEST_INFO { + U64 ldStartBlock; + U32 numBlocks; + U16 ldTgtId; + U8 isRead; + U16 devHandle; + U64 pdBlock; + U8 fpOkForIo; + U8 ldPI; +}; + +#endif /* _FUSION_H_ */ diff --git a/usr/src/uts/common/io/mr_sas/ld_pd_map.c b/usr/src/uts/common/io/mr_sas/ld_pd_map.c new file mode 100644 index 0000000000..8fac4e7b5a --- /dev/null +++ b/usr/src/uts/common/io/mr_sas/ld_pd_map.c @@ -0,0 +1,536 @@ +/* + * ********************************************************************** + * + * ld_pd_map.c + * + * Solaris MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2012, LSI Logic Corporation. + * All rights reserved. + * + * Version: + * Author: + * Swaminathan K S + * Arun Chandrashekhar + * Manju R + * Rasheed + * Shakeel Bukhari + * + * + * This module contains functions for device drivers + * to get pd-ld mapping information. + * + * ********************************************************************** + */ + +#include <sys/scsi/scsi.h> +#include "mr_sas.h" +#include "ld_pd_map.h" + +/* + * This function will check if FAST IO is possible on this logical drive + * by checking the EVENT information available in the driver + */ +#define MR_LD_STATE_OPTIMAL 3 +#define ABS_DIFF(a, b) (((a) > (b)) ? 
((a) - (b)) : ((b) - (a))) + +static void mr_update_load_balance_params(MR_FW_RAID_MAP_ALL *, + PLD_LOAD_BALANCE_INFO); + +#define FALSE 0 +#define TRUE 1 + +typedef U64 REGION_KEY; +typedef U32 REGION_LEN; +extern int debug_level_g; + + +MR_LD_RAID +*MR_LdRaidGet(U32 ld, MR_FW_RAID_MAP_ALL *map) +{ + return (&map->raidMap.ldSpanMap[ld].ldRaid); +} + +U16 +MR_GetLDTgtId(U32 ld, MR_FW_RAID_MAP_ALL *map) +{ + return (map->raidMap.ldSpanMap[ld].ldRaid.targetId); +} + + +static MR_SPAN_BLOCK_INFO * +MR_LdSpanInfoGet(U32 ld, MR_FW_RAID_MAP_ALL *map) +{ + return (&map->raidMap.ldSpanMap[ld].spanBlock[0]); +} + +static U8 +MR_LdDataArmGet(U32 ld, U32 armIdx, MR_FW_RAID_MAP_ALL *map) +{ + return (map->raidMap.ldSpanMap[ld].dataArmMap[armIdx]); +} + +static U16 +MR_ArPdGet(U32 ar, U32 arm, MR_FW_RAID_MAP_ALL *map) +{ + return (map->raidMap.arMapInfo[ar].pd[arm]); +} + +static U16 +MR_LdSpanArrayGet(U32 ld, U32 span, MR_FW_RAID_MAP_ALL *map) +{ + return (map->raidMap.ldSpanMap[ld].spanBlock[span].span.arrayRef); +} + +static U16 +MR_PdDevHandleGet(U32 pd, MR_FW_RAID_MAP_ALL *map) +{ + return (map->raidMap.devHndlInfo[pd].curDevHdl); +} + +U16 +MR_TargetIdToLdGet(U32 ldTgtId, MR_FW_RAID_MAP_ALL *map) +{ + return (map->raidMap.ldTgtIdToLd[ldTgtId]); +} + +U16 +MR_CheckDIF(U32 ldTgtId, MR_FW_RAID_MAP_ALL *map) +{ + MR_LD_RAID *raid; + U32 ld; + + ld = MR_TargetIdToLdGet(ldTgtId, map); + + if (ld >= MAX_LOGICAL_DRIVES) { + return (FALSE); + } + + raid = MR_LdRaidGet(ld, map); + + return (raid->capability.ldPiMode == 0x8); +} + +static MR_LD_SPAN * +MR_LdSpanPtrGet(U32 ld, U32 span, MR_FW_RAID_MAP_ALL *map) +{ + return (&map->raidMap.ldSpanMap[ld].spanBlock[span].span); +} + +/* + * This function will validate Map info data provided by FW + */ +U8 +MR_ValidateMapInfo(MR_FW_RAID_MAP_ALL *map, PLD_LOAD_BALANCE_INFO lbInfo) +{ + MR_FW_RAID_MAP *pFwRaidMap = &map->raidMap; + U32 fwsize = sizeof (MR_FW_RAID_MAP) - sizeof (MR_LD_SPAN_MAP) + + (sizeof (MR_LD_SPAN_MAP) * pFwRaidMap->ldCount); + + if (pFwRaidMap->totalSize != fwsize) { + + con_log(CL_ANN1, (CE_NOTE, + "map info structure size 0x%x is " + "not matching with ld count\n", fwsize)); + /* sizeof (foo) returns size_t, which is *LONG*. */ + con_log(CL_ANN1, (CE_NOTE, "span map 0x%x total size 0x%x\n",\ + (int)sizeof (MR_LD_SPAN_MAP), pFwRaidMap->totalSize)); + + return (0); + } + + mr_update_load_balance_params(map, lbInfo); + + return (1); +} + +U32 +MR_GetSpanBlock(U32 ld, U64 row, U64 *span_blk, MR_FW_RAID_MAP_ALL *map, + int *div_error) +{ + MR_SPAN_BLOCK_INFO *pSpanBlock = MR_LdSpanInfoGet(ld, map); + MR_QUAD_ELEMENT *qe; + MR_LD_RAID *raid = MR_LdRaidGet(ld, map); + U32 span, j; + + for (span = 0; span < raid->spanDepth; span++, pSpanBlock++) { + for (j = 0; j < pSpanBlock->block_span_info.noElements; j++) { + qe = &pSpanBlock->block_span_info.quads[j]; + if (qe->diff == 0) { + *div_error = 1; + return (span); + } + if (qe->logStart <= row && row <= qe->logEnd && + (((row - qe->logStart) % qe->diff)) == 0) { + if (span_blk != NULL) { + U64 blk; + blk = ((row - qe->logStart) / + (qe->diff)); + + blk = (blk + qe->offsetInSpan) << + raid->stripeShift; + *span_blk = blk; + } + return (span); + } + } + } + return (span); +} + + +/* + * ************************************************************* + * + * This routine calculates the arm, span and block for + * the specified stripe and reference in stripe. 
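+ * (A strip number here is the LD block address shifted right by stripeShift, + * i.e. divided by the strip size in blocks; the reference in stripe is the + * low-order remainder, the offset within that strip.)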
+ * + * Inputs : + * + * ld - Logical drive number + * stripRow - Stripe number + * stripRef - Reference in stripe + * + * Outputs : + * + * span - Span number + * block - Absolute Block number in the physical disk + */ +U8 +MR_GetPhyParams(struct mrsas_instance *instance, U32 ld, U64 stripRow, + U16 stripRef, U64 *pdBlock, U16 *pDevHandle, + MPI2_SCSI_IO_VENDOR_UNIQUE *pRAID_Context, MR_FW_RAID_MAP_ALL *map) +{ + MR_LD_RAID *raid = MR_LdRaidGet(ld, map); + U32 pd, arRef; + U8 physArm, span; + U64 row; + int error_code = 0; + U8 retval = TRUE; + U32 rowMod; + U32 armQ; + U32 arm; + + row = (stripRow / raid->rowDataSize); + + if (raid->level == 6) { + U32 logArm = (stripRow % (raid->rowDataSize)); + + if (raid->rowSize == 0) { + return (FALSE); + } + rowMod = (row % (raid->rowSize)); + armQ = raid->rowSize-1-rowMod; + arm = armQ+1+logArm; + if (arm >= raid->rowSize) + arm -= raid->rowSize; + physArm = (U8)arm; + } else { + if (raid->modFactor == 0) + return (FALSE); + physArm = MR_LdDataArmGet(ld, + (stripRow % (raid->modFactor)), map); + } + if (raid->spanDepth == 1) { + span = 0; + *pdBlock = row << raid->stripeShift; + } else + span = (U8)MR_GetSpanBlock(ld, row, pdBlock, map, &error_code); + + if (error_code == 1) + return (FALSE); + + /* Get the array on which this span is present. */ + arRef = MR_LdSpanArrayGet(ld, span, map); + /* Get the Pd. */ + pd = MR_ArPdGet(arRef, physArm, map); + /* Get dev handle from Pd. */ + if (pd != MR_PD_INVALID) { + *pDevHandle = MR_PdDevHandleGet(pd, map); + } else { + *pDevHandle = MR_PD_INVALID; /* set dev handle as invalid. */ + if ((raid->level >= 5) && + ((instance->device_id != PCI_DEVICE_ID_LSI_INVADER) || + (instance->device_id == PCI_DEVICE_ID_LSI_INVADER && + raid->regTypeReqOnRead != REGION_TYPE_UNUSED))) { + pRAID_Context->regLockFlags = REGION_TYPE_EXCLUSIVE; + } else if (raid->level == 1) { + /* Get Alternate Pd. */ + pd = MR_ArPdGet(arRef, physArm + 1, map); + /* Get dev handle from Pd. */ + if (pd != MR_PD_INVALID) + *pDevHandle = MR_PdDevHandleGet(pd, map); + } + } + + *pdBlock += stripRef + MR_LdSpanPtrGet(ld, span, map)->startBlk; + + pRAID_Context->spanArm = (span << RAID_CTX_SPANARM_SPAN_SHIFT) | + physArm; + + return (retval); +} + + + +/* + * *********************************************************************** + * + * MR_BuildRaidContext function + * + * This function will initiate command processing. The start/end row and strip + * information is calculated then the lock is acquired. + * This function will return 0 if region lock + * was acquired OR return num strips ??? 
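+ *
+ * For example, with a 128-block strip (stripeShift == 7) and
+ * rowDataSize == 2, a 300-block I/O starting at LBA 1000 covers
+ * strips 7 through 10 (num_strips == 4) and rows 3 through 5
+ * (numRows == 3).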
+ */
+
+U8
+MR_BuildRaidContext(struct mrsas_instance *instance,
+    struct IO_REQUEST_INFO *io_info, MPI2_SCSI_IO_VENDOR_UNIQUE *pRAID_Context,
+    MR_FW_RAID_MAP_ALL *map)
+{
+	MR_LD_RAID *raid;
+	U32 ld, stripSize, stripe_mask;
+	U64 endLba, endStrip, endRow;
+	U64 start_row, start_strip;
+	REGION_KEY regStart;
+	REGION_LEN regSize;
+	U8 num_strips, numRows;
+	U16 ref_in_start_stripe;
+	U16 ref_in_end_stripe;
+
+	U64 ldStartBlock;
+	U32 numBlocks, ldTgtId;
+	U8 isRead;
+	U8 retval = 0;
+
+	ldStartBlock = io_info->ldStartBlock;
+	numBlocks = io_info->numBlocks;
+	ldTgtId = io_info->ldTgtId;
+	isRead = io_info->isRead;
+
+	if (map == NULL) {
+		io_info->fpOkForIo = FALSE;
+		return (FALSE);
+	}
+
+	ld = MR_TargetIdToLdGet(ldTgtId, map);
+
+	if (ld >= MAX_LOGICAL_DRIVES) {
+		io_info->fpOkForIo = FALSE;
+		return (FALSE);
+	}
+
+	raid = MR_LdRaidGet(ld, map);
+
+	stripSize = 1 << raid->stripeShift;
+	stripe_mask = stripSize-1;
+	/*
+	 * calculate starting row and stripe, and number of strips and rows
+	 */
+	start_strip = ldStartBlock >> raid->stripeShift;
+	ref_in_start_stripe = (U16)(ldStartBlock & stripe_mask);
+	endLba = ldStartBlock + numBlocks - 1;
+	ref_in_end_stripe = (U16)(endLba & stripe_mask);
+	endStrip = endLba >> raid->stripeShift;
+	num_strips = (U8)(endStrip - start_strip + 1);
+	/* Check to make sure we are not dividing by zero */
+	if (raid->rowDataSize == 0)
+		return (FALSE);
+	start_row = (start_strip / raid->rowDataSize);
+	endRow = (endStrip / raid->rowDataSize);
+	/* get the row count */
+	numRows = (U8)(endRow - start_row + 1);
+
+	/*
+	 * calculate region info.
+	 */
+	regStart = start_row << raid->stripeShift;
+	regSize = stripSize;
+
+	/* Check if we can send this I/O via FastPath */
+	if (raid->capability.fpCapable) {
+		if (isRead)
+			io_info->fpOkForIo = (raid->capability.fpReadCapable &&
+			    ((num_strips == 1) ||
+			    raid->capability.fpReadAcrossStripe));
+		else
+			io_info->fpOkForIo =
+			    (raid->capability.fpWriteCapable &&
+			    ((num_strips == 1) ||
+			    raid->capability.fpWriteAcrossStripe));
+	} else
+		io_info->fpOkForIo = FALSE;
+
+
+	/*
+	 * Check for DIF support
+	 */
+	if (!raid->capability.ldPiMode) {
+		io_info->ldPI = FALSE;
+	} else {
+		io_info->ldPI = TRUE;
+	}
+
+	if (numRows == 1) {
+		if (num_strips == 1) {
+			regStart += ref_in_start_stripe;
+			regSize = numBlocks;
+		}
+	} else {
+		if (start_strip == (start_row + 1) * raid->rowDataSize - 1) {
+			regStart += ref_in_start_stripe;
+			regSize = stripSize - ref_in_start_stripe;
+		}
+
+		if (numRows > 2) {
+			regSize += (numRows-2) << raid->stripeShift;
+		}
+
+		if (endStrip == endRow*raid->rowDataSize) {
+			regSize += ref_in_end_stripe+1;
+		} else {
+			regSize += stripSize;
+		}
+	}
+
+	pRAID_Context->timeoutValue = map->raidMap.fpPdIoTimeoutSec;
+
+	if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) {
+		pRAID_Context->regLockFlags = (isRead) ?
+		    raid->regTypeReqOnRead : raid->regTypeReqOnWrite;
+	} else {
+		pRAID_Context->regLockFlags = (isRead) ?
+		    REGION_TYPE_SHARED_READ : raid->regTypeReqOnWrite;
+	}
+
+	pRAID_Context->ldTargetId = raid->targetId;
+	pRAID_Context->regLockRowLBA = regStart;
+	pRAID_Context->regLockLength = regSize;
+	pRAID_Context->configSeqNum = raid->seqNum;
+
+	/*
+	 * Get Phy Params only if FP capable,
+	 * or else leave it to MR firmware to do the calculation.
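+	 * If MR_GetPhyParams() below resolves the strip to an invalid
+	 * physical drive (MR_PD_INVALID), fpOkForIo is cleared again and
+	 * the I/O is left to the firmware.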
+ */ + if (io_info->fpOkForIo) { + /* if fast path possible then get the physical parameters */ + retval = MR_GetPhyParams(instance, ld, start_strip, + ref_in_start_stripe, &io_info->pdBlock, + &io_info->devHandle, pRAID_Context, map); + + /* If IO on an invalid Pd, then FP is not possible. */ + if (io_info->devHandle == MR_PD_INVALID) + io_info->fpOkForIo = FALSE; + + return (retval); + + } else if (isRead) { + uint_t stripIdx; + + for (stripIdx = 0; stripIdx < num_strips; stripIdx++) { + if (!MR_GetPhyParams(instance, ld, + start_strip + stripIdx, ref_in_start_stripe, + &io_info->pdBlock, &io_info->devHandle, + pRAID_Context, map)) { + return (TRUE); + } + } + } + return (TRUE); +} + + +void +mr_update_load_balance_params(MR_FW_RAID_MAP_ALL *map, + PLD_LOAD_BALANCE_INFO lbInfo) +{ + int ldCount; + U16 ld; + MR_LD_RAID *raid; + + for (ldCount = 0; ldCount < MAX_LOGICAL_DRIVES; ldCount++) { + ld = MR_TargetIdToLdGet(ldCount, map); + + if (ld >= MAX_LOGICAL_DRIVES) { + con_log(CL_ANN1, + (CE_NOTE, "mrsas: ld=%d Invalid ld \n", ld)); + continue; + } + + raid = MR_LdRaidGet(ld, map); + + /* Two drive Optimal RAID 1 */ + if ((raid->level == 1) && (raid->rowSize == 2) && + (raid->spanDepth == 1) && + raid->ldState == MR_LD_STATE_OPTIMAL) { + U32 pd, arRef; + + lbInfo[ldCount].loadBalanceFlag = 1; + + /* Get the array on which this span is present. */ + arRef = MR_LdSpanArrayGet(ld, 0, map); + + pd = MR_ArPdGet(arRef, 0, map); /* Get the Pd. */ + /* Get dev handle from Pd. */ + lbInfo[ldCount].raid1DevHandle[0] = + MR_PdDevHandleGet(pd, map); + + pd = MR_ArPdGet(arRef, 1, map); /* Get the Pd. */ + /* Get dev handle from Pd. */ + lbInfo[ldCount].raid1DevHandle[1] = + MR_PdDevHandleGet(pd, map); + con_log(CL_ANN1, (CE_NOTE, + "mrsas: ld=%d load balancing enabled \n", ldCount)); + } else { + lbInfo[ldCount].loadBalanceFlag = 0; + } + } +} + + +U8 +megasas_get_best_arm(PLD_LOAD_BALANCE_INFO lbInfo, U8 arm, U64 block, + U32 count) +{ + U16 pend0, pend1; + U64 diff0, diff1; + U8 bestArm; + + /* get the pending cmds for the data and mirror arms */ + pend0 = lbInfo->scsi_pending_cmds[0]; + pend1 = lbInfo->scsi_pending_cmds[1]; + + /* Determine the disk whose head is nearer to the req. block */ + diff0 = ABS_DIFF(block, lbInfo->last_accessed_block[0]); + diff1 = ABS_DIFF(block, lbInfo->last_accessed_block[1]); + bestArm = (diff0 <= diff1 ? 0 : 1); + + if ((bestArm == arm && pend0 > pend1 + 16) || + (bestArm != arm && pend1 > pend0 + 16)) { + bestArm ^= 1; + } + + /* Update the last accessed block on the correct pd */ + lbInfo->last_accessed_block[bestArm] = block + count - 1; + return (bestArm); +} + +U16 +get_updated_dev_handle(PLD_LOAD_BALANCE_INFO lbInfo, + struct IO_REQUEST_INFO *io_info) +{ + U8 arm, old_arm; + U16 devHandle; + + old_arm = lbInfo->raid1DevHandle[0] == io_info->devHandle ? 0 : 1; + + /* get best new arm */ + arm = megasas_get_best_arm(lbInfo, old_arm, io_info->ldStartBlock, + io_info->numBlocks); + + devHandle = lbInfo->raid1DevHandle[arm]; + + lbInfo->scsi_pending_cmds[arm]++; + + return (devHandle); +} diff --git a/usr/src/uts/common/io/mr_sas/ld_pd_map.h b/usr/src/uts/common/io/mr_sas/ld_pd_map.h new file mode 100644 index 0000000000..dc6f0ce957 --- /dev/null +++ b/usr/src/uts/common/io/mr_sas/ld_pd_map.h @@ -0,0 +1,249 @@ +/* + * ld_pd_map.h + * + * Solaris MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2012, LSI Logic Corporation. + * All rights reserved. 
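+ *
+ * This header declares the RAID map, RAID context and load-balancing
+ * structures used by the LD/PD mapping code in ld_pd_map.c.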
+ *
+ * Version:
+ * Author:
+ *	Swaminathan K S
+ *	Arun Chandrashekhar
+ *	Manju R
+ *	Rasheed
+ *	Shakeel Bukhari
+ */
+
+#ifndef	_LD_PD_MAP
+#define	_LD_PD_MAP
+#include <sys/scsi/scsi.h>
+#include "fusion.h"
+
+struct mrsas_instance;	/* This will be defined in mr_sas.h */
+
+/* raid->write_mode; raid->read_ahead; dcmd->state */
+/* Write through */
+#define	WRITE_THROUGH	0
+/* Delayed Write */
+#define	WRITE_BACK	1
+
+/* SCSI CDB definitions */
+#define	READ_6		0x08
+#define	READ_16		0x88
+#define	READ_10		0x28
+#define	READ_12		0xA8
+#define	WRITE_16	0x8A
+#define	WRITE_10	0x2A
+
+/* maximum disks per array */
+#define	MAX_ROW_SIZE	32
+/* maximum spans per logical drive */
+#define	MAX_SPAN_DEPTH	8
+#define	MEGASAS_LOAD_BALANCE_FLAG	0x1
+#define	MR_DEFAULT_IO_TIMEOUT	20
+
+
+union desc_value {
+	U64 word;
+	struct {
+		U32 low;
+		U32 high;
+	} u1;
+};
+
+typedef struct _LD_LOAD_BALANCE_INFO
+{
+	U8 loadBalanceFlag;
+	U8 reserved1;
+	U16 raid1DevHandle[2];
+	U16 scsi_pending_cmds[2];
+	U64 last_accessed_block[2];
+} LD_LOAD_BALANCE_INFO, *PLD_LOAD_BALANCE_INFO;
+
+#pragma pack(1)
+typedef struct _MR_FW_RAID_MAP_ALL {
+	MR_FW_RAID_MAP raidMap;
+	MR_LD_SPAN_MAP ldSpanMap[MAX_LOGICAL_DRIVES - 1];
+} MR_FW_RAID_MAP_ALL;
+
+/*
+ * Raid Context structure which describes MegaRAID specific IO Parameters
+ * This resides at offset 0x60 where the SGL normally starts in MPT IO Frames
+ */
+typedef struct _MPI2_SCSI_IO_VENDOR_UNIQUE {
+	U8 nsegType;	/* 0x00 nseg[7:4], Type[3:0] */
+	U8 resvd0;	/* 0x01 */
+	U16 timeoutValue;	/* 0x02 -0x03 */
+	U8 regLockFlags;	/* 0x04 */
+	U8 reservedForHw1;	/* 0x05 */
+	U16 ldTargetId;	/* 0x06 - 0x07 */
+	U64 regLockRowLBA;	/* 0x08 - 0x0F */
+	U32 regLockLength;	/* 0x10 - 0x13 */
+	U16 nextLMId;	/* 0x14 - 0x15 */
+	U8 extStatus;	/* 0x16 */
+	U8 status;	/* 0x17 status */
+	U8 RAIDFlags;	/* 0x18 resvd[7:6], ioSubType[5:4], */
+			/* resvd[3:1], preferredCpu[0] */
+	U8 numSGE;	/* 0x19 numSge; not including chain entries */
+	U16 configSeqNum;	/* 0x1A -0x1B */
+	U8 spanArm;	/* 0x1C span[7:5], arm[4:0] */
+	U8 resvd2[3];	/* 0x1D-0x1f */
+} MPI2_SCSI_IO_VENDOR_UNIQUE, MPI25_SCSI_IO_VENDOR_UNIQUE;
+
+#define	RAID_CTX_SPANARM_ARM_SHIFT	(0)
+#define	RAID_CTX_SPANARM_ARM_MASK	(0x1f)
+
+#define	RAID_CTX_SPANARM_SPAN_SHIFT	(5)
+#define	RAID_CTX_SPANARM_SPAN_MASK	(0xE0)
+
+
+/*
+ * RAID SCSI IO Request Message
+ * Total SGE count will be one less
+ * than _MPI2_SCSI_IO_REQUEST
+ */
+typedef struct _MPI2_RAID_SCSI_IO_REQUEST
+{
+	uint16_t DevHandle;	/* 0x00 */
+	uint8_t ChainOffset;	/* 0x02 */
+	uint8_t Function;	/* 0x03 */
+	uint16_t Reserved1;	/* 0x04 */
+	uint8_t Reserved2;	/* 0x06 */
+	uint8_t MsgFlags;	/* 0x07 */
+	uint8_t VP_ID;	/* 0x08 */
+	uint8_t VF_ID;	/* 0x09 */
+	uint16_t Reserved3;	/* 0x0A */
+	uint32_t SenseBufferLowAddress;	/* 0x0C */
+	uint16_t SGLFlags;	/* 0x10 */
+	uint8_t SenseBufferLength;	/* 0x12 */
+	uint8_t Reserved4;	/* 0x13 */
+	uint8_t SGLOffset0;	/* 0x14 */
+	uint8_t SGLOffset1;	/* 0x15 */
+	uint8_t SGLOffset2;	/* 0x16 */
+	uint8_t SGLOffset3;	/* 0x17 */
+	uint32_t SkipCount;	/* 0x18 */
+	uint32_t DataLength;	/* 0x1C */
+	uint32_t BidirectionalDataLength;	/* 0x20 */
+	uint16_t IoFlags;	/* 0x24 */
+	uint16_t EEDPFlags;	/* 0x26 */
+	uint32_t EEDPBlockSize;	/* 0x28 */
+	uint32_t SecondaryReferenceTag;	/* 0x2C */
+	uint16_t SecondaryApplicationTag;	/* 0x30 */
+	uint16_t ApplicationTagTranslationMask;	/* 0x32 */
+	uint8_t LUN[8];	/* 0x34 */
+	uint32_t Control;	/* 0x3C */
+	Mpi2ScsiIoCdb_t CDB;	/* 0x40 */
+	MPI2_SCSI_IO_VENDOR_UNIQUE RaidContext;	/* 0x60 */
+
Mpi2SGEIOUnion_t SGL; /* 0x80 */ +} MPI2_RAID_SCSI_IO_REQUEST, MPI2_POINTER PTR_MPI2_RAID_SCSI_IO_REQUEST, +Mpi2RaidSCSIIORequest_t, MPI2_POINTER pMpi2RaidSCSIIORequest_t; + +/* + * define region lock types + */ +typedef enum _REGION_TYPE { + REGION_TYPE_UNUSED = 0, /* lock is currently not active */ + REGION_TYPE_SHARED_READ = 1, /* shared lock (for reads) */ + REGION_TYPE_SHARED_WRITE = 2, + REGION_TYPE_EXCLUSIVE = 3 /* exclusive lock (for writes) */ +} REGION_TYPE; + + +#define DM_PATH_MAXPATH 2 +#define DM_PATH_FIRSTPATH 0 +#define DM_PATH_SECONDPATH 1 + +/* declare valid Region locking values */ +typedef enum _REGION_LOCK { + REGION_LOCK_BYPASS = 0, + /* for RAID 6 single-drive failure */ + REGION_LOCK_UNCOND_SHARED_READ = 1, + REGION_LOCK_UNCOND_SHARED_WRITE = 2, + REGION_LOCK_UNCOND_SHARED_OTHER = 3, + REGION_LOCK_UNCOND_SHARED_EXCLUSIVE = 0xFF +} REGION_LOCK; + + +struct mrsas_init_frame2 { + uint8_t cmd; /* 00h */ + uint8_t reserved_0; /* 01h */ + uint8_t cmd_status; /* 02h */ + + uint8_t reserved_1; /* 03h */ + uint32_t reserved_2; /* 04h */ + + uint32_t context; /* 08h */ + uint32_t pad_0; /* 0Ch */ + + uint16_t flags; /* 10h */ + uint16_t reserved_3; /* 12h */ + uint32_t data_xfer_len; /* 14h */ + + uint32_t queue_info_new_phys_addr_lo; /* 18h */ + uint32_t queue_info_new_phys_addr_hi; /* 1Ch */ + uint32_t queue_info_old_phys_addr_lo; /* 20h */ + uint32_t queue_info_old_phys_addr_hi; /* 24h */ + uint64_t driverversion; /* 28h */ + uint32_t reserved_4[4]; /* 30h */ +}; + + +/* + * Request descriptor types + */ +#define MPI2_REQ_DESCRIPT_FLAGS_LD_IO 0x7 +#define MPI2_REQ_DESCRIPT_FLAGS_MFA 0x1 +#define MPI2_REQ_DESCRIPT_FLAGS_NO_LOCK 0x2 + +#define MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT 1 + + +/* + * MPT RAID MFA IO Descriptor. + */ +typedef struct _MR_RAID_MFA_IO_DESCRIPTOR { + uint32_t RequestFlags : 8; + uint32_t MessageAddress1 : 24; /* bits 31:8 */ + uint32_t MessageAddress2; /* bits 61:32 */ +} MR_RAID_MFA_IO_REQUEST_DESCRIPTOR, +*PMR_RAID_MFA_IO_REQUEST_DESCRIPTOR; + +/* union of Request Descriptors */ +typedef union _MRSAS_REQUEST_DESCRIPTOR_UNION +{ + MPI2_DEFAULT_REQUEST_DESCRIPTOR Default; + MPI2_HIGH_PRIORITY_REQUEST_DESCRIPTOR HighPriority; + MPI2_SCSI_IO_REQUEST_DESCRIPTOR SCSIIO; + MPI2_SCSI_TARGET_REQUEST_DESCRIPTOR SCSITarget; + MPI2_RAID_ACCEL_REQUEST_DESCRIPTOR RAIDAccelerator; + MR_RAID_MFA_IO_REQUEST_DESCRIPTOR MFAIo; + U64 Words; +} MRSAS_REQUEST_DESCRIPTOR_UNION; + +#pragma pack() + +enum { + MRSAS_SCSI_VARIABLE_LENGTH_CMD = 0x7F, + MRSAS_SCSI_SERVICE_ACTION_READ32 = 0x9, + MRSAS_SCSI_SERVICE_ACTION_WRITE32 = 0xB, + MRSAS_SCSI_ADDL_CDB_LEN = 0x18, + MRSAS_RD_WR_PROTECT = 0x20, + MRSAS_EEDPBLOCKSIZE = 512 +}; + + +#define IEEE_SGE_FLAGS_ADDR_MASK (0x03) +#define IEEE_SGE_FLAGS_SYSTEM_ADDR (0x00) +#define IEEE_SGE_FLAGS_IOCDDR_ADDR (0x01) +#define IEEE_SGE_FLAGS_IOCPLB_ADDR (0x02) +#define IEEE_SGE_FLAGS_IOCPLBNTA_ADDR (0x03) +#define IEEE_SGE_FLAGS_CHAIN_ELEMENT (0x80) +#define IEEE_SGE_FLAGS_END_OF_LIST (0x40) + + +U8 MR_ValidateMapInfo(MR_FW_RAID_MAP_ALL *map, PLD_LOAD_BALANCE_INFO lbInfo); +U16 MR_CheckDIF(U32, MR_FW_RAID_MAP_ALL *); +U8 MR_BuildRaidContext(struct mrsas_instance *, struct IO_REQUEST_INFO *, + MPI2_SCSI_IO_VENDOR_UNIQUE *, MR_FW_RAID_MAP_ALL *); + +#endif /* _LD_PD_MAP */ diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.c b/usr/src/uts/common/io/mr_sas/mr_sas.c index 922fc78f8d..05fecff694 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas.c +++ b/usr/src/uts/common/io/mr_sas/mr_sas.c @@ -1,16 +1,17 @@ /* * mr_sas.c: source for mr_sas driver * - 
* MegaRAID device driver for SAS2.0 controllers - * Copyright (c) 2008-2010, LSI Logic Corporation. + * Solaris MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2012, LSI Logic Corporation. * All rights reserved. * * Version: * Author: + * Swaminathan K S * Arun Chandrashekhar * Manju R - * Rajesh Prabhakaran - * Seokmann Ju + * Rasheed + * Shakeel Bukhari * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -43,6 +44,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Bayard G. Bell. All rights reserved. + * Copyright 2012 Nexenta System, Inc. All rights reserved. */ #include <sys/types.h> @@ -83,29 +85,158 @@ */ static void *mrsas_state = NULL; static volatile boolean_t mrsas_relaxed_ordering = B_TRUE; -static volatile int debug_level_g = CL_NONE; -static volatile int msi_enable = 1; +volatile int debug_level_g = CL_NONE; +static volatile int msi_enable = 1; static volatile int ctio_enable = 1; /* Default Timeout value to issue online controller reset */ -static volatile int debug_timeout_g = 0xB4; +volatile int debug_timeout_g = 0xF0; /* 0xB4; */ /* Simulate consecutive firmware fault */ static volatile int debug_fw_faults_after_ocr_g = 0; - #ifdef OCRDEBUG /* Simulate three consecutive timeout for an IO */ static volatile int debug_consecutive_timeout_after_ocr_g = 0; #endif +#if 0 +/* Enable OCR on firmware fault */ +static volatile int debug_support_ocr_isr_g = 0; +#endif #pragma weak scsi_hba_open #pragma weak scsi_hba_close #pragma weak scsi_hba_ioctl -static ddi_dma_attr_t mrsas_generic_dma_attr = { +/* Local static prototypes. */ +static int mrsas_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int mrsas_attach(dev_info_t *, ddi_attach_cmd_t); +#ifdef __sparc +static int mrsas_reset(dev_info_t *, ddi_reset_cmd_t); +#else +static int mrsas_quiesce(dev_info_t *); +#endif +static int mrsas_detach(dev_info_t *, ddi_detach_cmd_t); +static int mrsas_open(dev_t *, int, int, cred_t *); +static int mrsas_close(dev_t, int, int, cred_t *); +static int mrsas_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); + +static int mrsas_tran_tgt_init(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static struct scsi_pkt *mrsas_tran_init_pkt(struct scsi_address *, register + struct scsi_pkt *, struct buf *, int, int, int, int, + int (*)(), caddr_t); +static int mrsas_tran_start(struct scsi_address *, + register struct scsi_pkt *); +static int mrsas_tran_abort(struct scsi_address *, struct scsi_pkt *); +static int mrsas_tran_reset(struct scsi_address *, int); +#if 0 +static int mrsas_tran_bus_reset(dev_info_t *, int); +#endif +static int mrsas_tran_getcap(struct scsi_address *, char *, int); +static int mrsas_tran_setcap(struct scsi_address *, char *, int, int); +static void mrsas_tran_destroy_pkt(struct scsi_address *, + struct scsi_pkt *); +static void mrsas_tran_dmafree(struct scsi_address *, struct scsi_pkt *); +static void mrsas_tran_sync_pkt(struct scsi_address *, struct scsi_pkt *); +static int mrsas_tran_quiesce(dev_info_t *dip); +static int mrsas_tran_unquiesce(dev_info_t *dip); +static uint_t mrsas_isr(); +static uint_t mrsas_softintr(); +static void mrsas_undo_resources(dev_info_t *, struct mrsas_instance *); +static struct mrsas_cmd *get_mfi_pkt(struct mrsas_instance *); +static void return_mfi_pkt(struct mrsas_instance *, + struct mrsas_cmd *); + +static void 
free_space_for_mfi(struct mrsas_instance *); +static uint32_t read_fw_status_reg_ppc(struct mrsas_instance *); +static void issue_cmd_ppc(struct mrsas_cmd *, struct mrsas_instance *); +static int issue_cmd_in_poll_mode_ppc(struct mrsas_instance *, + struct mrsas_cmd *); +static int issue_cmd_in_sync_mode_ppc(struct mrsas_instance *, + struct mrsas_cmd *); +static void enable_intr_ppc(struct mrsas_instance *); +static void disable_intr_ppc(struct mrsas_instance *); +static int intr_ack_ppc(struct mrsas_instance *); +static void flush_cache(struct mrsas_instance *instance); +void display_scsi_inquiry(caddr_t); +static int start_mfi_aen(struct mrsas_instance *instance); +static int handle_drv_ioctl(struct mrsas_instance *instance, + struct mrsas_ioctl *ioctl, int mode); +static int handle_mfi_ioctl(struct mrsas_instance *instance, + struct mrsas_ioctl *ioctl, int mode); +static int handle_mfi_aen(struct mrsas_instance *instance, + struct mrsas_aen *aen); +static struct mrsas_cmd *build_cmd(struct mrsas_instance *, + struct scsi_address *, struct scsi_pkt *, uchar_t *); +static int alloc_additional_dma_buffer(struct mrsas_instance *); +static void complete_cmd_in_sync_mode(struct mrsas_instance *, + struct mrsas_cmd *); +static int mrsas_kill_adapter(struct mrsas_instance *); +static int mrsas_issue_init_mfi(struct mrsas_instance *); +static int mrsas_reset_ppc(struct mrsas_instance *); +static uint32_t mrsas_initiate_ocr_if_fw_is_faulty(struct mrsas_instance *); +static int wait_for_outstanding(struct mrsas_instance *instance); +static int register_mfi_aen(struct mrsas_instance *instance, + uint32_t seq_num, uint32_t class_locale_word); +static int issue_mfi_pthru(struct mrsas_instance *instance, struct + mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); +static int issue_mfi_dcmd(struct mrsas_instance *instance, struct + mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); +static int issue_mfi_smp(struct mrsas_instance *instance, struct + mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); +static int issue_mfi_stp(struct mrsas_instance *instance, struct + mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); +static int abort_aen_cmd(struct mrsas_instance *instance, + struct mrsas_cmd *cmd_to_abort); + +static void mrsas_rem_intrs(struct mrsas_instance *instance); +static int mrsas_add_intrs(struct mrsas_instance *instance, int intr_type); + +static void mrsas_tran_tgt_free(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static int mrsas_tran_bus_config(dev_info_t *, uint_t, + ddi_bus_config_op_t, void *, dev_info_t **); +static int mrsas_parse_devname(char *, int *, int *); +static int mrsas_config_all_devices(struct mrsas_instance *); +static int mrsas_config_ld(struct mrsas_instance *, uint16_t, + uint8_t, dev_info_t **); +static int mrsas_name_node(dev_info_t *, char *, int); +static void mrsas_issue_evt_taskq(struct mrsas_eventinfo *); +static void free_additional_dma_buffer(struct mrsas_instance *); +static void io_timeout_checker(void *); +static void mrsas_fm_init(struct mrsas_instance *); +static void mrsas_fm_fini(struct mrsas_instance *); + +static struct mrsas_function_template mrsas_function_template_ppc = { + .read_fw_status_reg = read_fw_status_reg_ppc, + .issue_cmd = issue_cmd_ppc, + .issue_cmd_in_sync_mode = issue_cmd_in_sync_mode_ppc, + .issue_cmd_in_poll_mode = issue_cmd_in_poll_mode_ppc, + .enable_intr = enable_intr_ppc, + .disable_intr = disable_intr_ppc, + .intr_ack = intr_ack_ppc, + .init_adapter = mrsas_init_adapter_ppc +/* 
.reset_adapter = mrsas_reset_adapter_ppc */ +}; + + +static struct mrsas_function_template mrsas_function_template_fusion = { + .read_fw_status_reg = tbolt_read_fw_status_reg, + .issue_cmd = tbolt_issue_cmd, + .issue_cmd_in_sync_mode = tbolt_issue_cmd_in_sync_mode, + .issue_cmd_in_poll_mode = tbolt_issue_cmd_in_poll_mode, + .enable_intr = tbolt_enable_intr, + .disable_intr = tbolt_disable_intr, + .intr_ack = tbolt_intr_ack, + .init_adapter = mrsas_init_adapter_tbolt +/* .reset_adapter = mrsas_reset_adapter_tbolt */ +}; + + +ddi_dma_attr_t mrsas_generic_dma_attr = { DMA_ATTR_V0, /* dma_attr_version */ 0, /* low DMA address range */ 0xFFFFFFFFU, /* high DMA address range */ - 0xFFFFFFFFU, /* DMA counter register */ + 0xFFFFFFFFU, /* DMA counter register */ 8, /* DMA address alignment */ 0x07, /* DMA burstsizes */ 1, /* min DMA size */ @@ -119,6 +250,12 @@ static ddi_dma_attr_t mrsas_generic_dma_attr = { int32_t mrsas_max_cap_maxxfer = 0x1000000; /* + * Fix for: Thunderbolt controller IO timeout when IO write size is 1MEG, + * Limit size to 256K + */ +uint32_t mrsas_tbolt_max_cap_maxxfer = (512 * 512); + +/* * cb_ops contains base level routines */ static struct cb_ops mrsas_cb_ops = { @@ -153,19 +290,20 @@ static struct dev_ops mrsas_ops = { nulldev, /* probe */ mrsas_attach, /* attach */ mrsas_detach, /* detach */ -#ifdef __sparc +#ifdef __sparc mrsas_reset, /* reset */ #else /* __sparc */ nodev, -#endif /* __sparc */ +#endif /* __sparc */ &mrsas_cb_ops, /* char/block ops */ NULL, /* bus ops */ NULL, /* power */ -#ifdef __sparc +#ifdef __sparc ddi_quiesce_not_needed #else /* __sparc */ - mrsas_quiesce /* quiesce */ + mrsas_quiesce /* quiesce */ #endif /* __sparc */ + }; static struct modldrv modldrv = { @@ -188,14 +326,28 @@ static struct ddi_device_acc_attr endian_attr = { }; +unsigned int enable_fp = 1; + + /* * ************************************************************************** * - * * - * common entry points - for loadable kernel modules * - * * + * * + * common entry points - for loadable kernel modules * + * * * ************************************************************************** * */ +/* + * _init - initialize a loadable module + * @void + * + * The driver should perform any one-time resource allocation or data + * initialization during driver loading in _init(). For example, the driver + * should initialize any mutexes global to the driver in this routine. + * The driver should not, however, use _init() to allocate or initialize + * anything that has to do with a particular instance of the device. + * Per-instance initialization must be done in attach(). + */ int _init(void) { @@ -207,12 +359,12 @@ _init(void) sizeof (struct mrsas_instance), 0); if (ret != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, "mr_sas: could not init state")); + cmn_err(CE_WARN, "mr_sas: could not init state"); return (ret); } if ((ret = scsi_hba_init(&modlinkage)) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, "mr_sas: could not init scsi hba")); + cmn_err(CE_WARN, "mr_sas: could not init scsi hba"); ddi_soft_state_fini(&mrsas_state); return (ret); } @@ -220,7 +372,7 @@ _init(void) ret = mod_install(&modlinkage); if (ret != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, "mr_sas: mod_install failed")); + cmn_err(CE_WARN, "mr_sas: mod_install failed"); scsi_hba_fini(&modlinkage); ddi_soft_state_fini(&mrsas_state); } @@ -228,6 +380,13 @@ _init(void) return (ret); } +/* + * _info - returns information about a loadable module. + * @void + * + * _info() is called to return module information. 
This is a typical entry + * point that does predefined role. It simply calls mod_info(). + */ int _info(struct modinfo *modinfop) { @@ -236,6 +395,13 @@ _info(struct modinfo *modinfop) return (mod_info(&modlinkage, modinfop)); } +/* + * _fini - prepare a loadable module for unloading + * @void + * + * In _fini(), the driver should release any resources that were allocated in + * _init(). The driver must remove itself from the system module list. + */ int _fini(void) { @@ -243,12 +409,17 @@ _fini(void) con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); - if ((ret = mod_remove(&modlinkage)) != DDI_SUCCESS) + if ((ret = mod_remove(&modlinkage)) != DDI_SUCCESS) { + con_log(CL_ANN1, + (CE_WARN, "_fini: mod_remove() failed, error 0x%X", ret)); return (ret); + } scsi_hba_fini(&modlinkage); + con_log(CL_DLEVEL1, (CE_NOTE, "_fini: scsi_hba_fini() done.")); ddi_soft_state_fini(&mrsas_state); + con_log(CL_DLEVEL1, (CE_NOTE, "_fini: ddi_soft_state_fini() done.")); return (ret); } @@ -256,24 +427,41 @@ _fini(void) /* * ************************************************************************** * - * * - * common entry points - for autoconfiguration * - * * + * * + * common entry points - for autoconfiguration * + * * * ************************************************************************** * */ - +/* + * attach - adds a device to the system as part of initialization + * @dip: + * @cmd: + * + * The kernel calls a driver's attach() entry point to attach an instance of + * a device (for MegaRAID, it is instance of a controller) or to resume + * operation for an instance of a device that has been suspended or has been + * shut down by the power management framework + * The attach() entry point typically includes the following types of + * processing: + * - allocate a soft-state structure for the device instance (for MegaRAID, + * controller instance) + * - initialize per-instance mutexes + * - initialize condition variables + * - register the device's interrupts (for MegaRAID, controller's interrupts) + * - map the registers and memory of the device instance (for MegaRAID, + * controller instance) + * - create minor device nodes for the device instance (for MegaRAID, + * controller instance) + * - report that the device instance (for MegaRAID, controller instance) has + * attached + */ static int mrsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { int instance_no; int nregs; - uint8_t added_isr_f = 0; - uint8_t added_soft_isr_f = 0; - uint8_t create_devctl_node_f = 0; - uint8_t create_scsi_node_f = 0; - uint8_t create_ioc_node_f = 0; - uint8_t tran_alloc_f = 0; - uint8_t irq; + int i = 0; + uint8_t irq; uint16_t vendor_id; uint16_t device_id; uint16_t subsysvid; @@ -284,7 +472,7 @@ mrsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) char *data; scsi_hba_tran_t *tran; - ddi_dma_attr_t tran_dma_attr; + ddi_dma_attr_t tran_dma_attr; struct mrsas_instance *instance; con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); @@ -298,481 +486,533 @@ mrsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) * check to see whether this device is in a DMA-capable slot. 
*/ if (ddi_slaveonly(dip) == DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, + cmn_err(CE_WARN, "mr_sas%d: Device in slave-only slot, unused", - instance_no)); + instance_no); return (DDI_FAILURE); } switch (cmd) { - case DDI_ATTACH: - con_log(CL_DLEVEL1, (CE_NOTE, "mr_sas: DDI_ATTACH")); - /* allocate the soft state for the instance */ - if (ddi_soft_state_zalloc(mrsas_state, instance_no) - != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "mr_sas%d: Failed to allocate soft state", - instance_no)); + case DDI_ATTACH: + /* allocate the soft state for the instance */ + if (ddi_soft_state_zalloc(mrsas_state, instance_no) + != DDI_SUCCESS) { + cmn_err(CE_WARN, + "mr_sas%d: Failed to allocate soft state", + instance_no); + return (DDI_FAILURE); + } - return (DDI_FAILURE); - } + instance = (struct mrsas_instance *)ddi_get_soft_state + (mrsas_state, instance_no); - instance = (struct mrsas_instance *)ddi_get_soft_state - (mrsas_state, instance_no); + if (instance == NULL) { + cmn_err(CE_WARN, + "mr_sas%d: Bad soft state", instance_no); + ddi_soft_state_free(mrsas_state, instance_no); + return (DDI_FAILURE); + } - if (instance == NULL) { - con_log(CL_ANN, (CE_WARN, - "mr_sas%d: Bad soft state", instance_no)); + bzero(instance, sizeof (struct mrsas_instance)); - ddi_soft_state_free(mrsas_state, instance_no); + instance->unroll.softs = 1; - return (DDI_FAILURE); - } + /* Setup the PCI configuration space handles */ + if (pci_config_setup(dip, &instance->pci_handle) != + DDI_SUCCESS) { + cmn_err(CE_WARN, + "mr_sas%d: pci config setup failed ", + instance_no); + + ddi_soft_state_free(mrsas_state, instance_no); + return (DDI_FAILURE); + } + if (instance->pci_handle == NULL) { + cmn_err(CE_WARN, + "mr_sas%d: pci config setup failed ", + instance_no); + ddi_soft_state_free(mrsas_state, instance_no); + return (DDI_FAILURE); + } - bzero((caddr_t)instance, - sizeof (struct mrsas_instance)); - instance->func_ptr = kmem_zalloc( - sizeof (struct mrsas_func_ptr), KM_SLEEP); - ASSERT(instance->func_ptr); - /* Setup the PCI configuration space handles */ - if (pci_config_setup(dip, &instance->pci_handle) != - DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "mr_sas%d: pci config setup failed ", - instance_no)); + if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS) { + cmn_err(CE_WARN, + "mr_sas: failed to get registers."); - kmem_free(instance->func_ptr, - sizeof (struct mrsas_func_ptr)); - ddi_soft_state_free(mrsas_state, instance_no); + pci_config_teardown(&instance->pci_handle); + ddi_soft_state_free(mrsas_state, instance_no); + return (DDI_FAILURE); + } - return (DDI_FAILURE); - } + vendor_id = pci_config_get16(instance->pci_handle, + PCI_CONF_VENID); + device_id = pci_config_get16(instance->pci_handle, + PCI_CONF_DEVID); - if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "mr_sas: failed to get registers.")); + subsysvid = pci_config_get16(instance->pci_handle, + PCI_CONF_SUBVENID); + subsysid = pci_config_get16(instance->pci_handle, + PCI_CONF_SUBSYSID); - pci_config_teardown(&instance->pci_handle); - kmem_free(instance->func_ptr, - sizeof (struct mrsas_func_ptr)); - ddi_soft_state_free(mrsas_state, instance_no); + pci_config_put16(instance->pci_handle, PCI_CONF_COMM, + (pci_config_get16(instance->pci_handle, + PCI_CONF_COMM) | PCI_COMM_ME)); + irq = pci_config_get8(instance->pci_handle, + PCI_CONF_ILINE); - return (DDI_FAILURE); - } + con_log(CL_DLEVEL1, (CE_CONT, "mr_sas%d: " + "0x%x:0x%x 0x%x:0x%x, irq:%d drv-ver:%s", + instance_no, vendor_id, device_id, subsysvid, + subsysid, irq, 
MRSAS_VERSION)); - vendor_id = pci_config_get16(instance->pci_handle, - PCI_CONF_VENID); - device_id = pci_config_get16(instance->pci_handle, - PCI_CONF_DEVID); + /* enable bus-mastering */ + command = pci_config_get16(instance->pci_handle, + PCI_CONF_COMM); - subsysvid = pci_config_get16(instance->pci_handle, - PCI_CONF_SUBVENID); - subsysid = pci_config_get16(instance->pci_handle, - PCI_CONF_SUBSYSID); + if (!(command & PCI_COMM_ME)) { + command |= PCI_COMM_ME; - pci_config_put16(instance->pci_handle, PCI_CONF_COMM, - (pci_config_get16(instance->pci_handle, - PCI_CONF_COMM) | PCI_COMM_ME)); - irq = pci_config_get8(instance->pci_handle, - PCI_CONF_ILINE); + pci_config_put16(instance->pci_handle, + PCI_CONF_COMM, command); + con_log(CL_ANN, (CE_CONT, "mr_sas%d: " + "enable bus-mastering", instance_no)); + } else { con_log(CL_DLEVEL1, (CE_CONT, "mr_sas%d: " - "0x%x:0x%x 0x%x:0x%x, irq:%d drv-ver:%s", - instance_no, vendor_id, device_id, subsysvid, - subsysid, irq, MRSAS_VERSION)); + "bus-mastering already set", instance_no)); + } - /* enable bus-mastering */ - command = pci_config_get16(instance->pci_handle, - PCI_CONF_COMM); + /* initialize function pointers */ + switch (device_id) { + case PCI_DEVICE_ID_LSI_TBOLT: + case PCI_DEVICE_ID_LSI_INVADER: + con_log(CL_ANN, (CE_NOTE, + "mr_sas: 2208 T.B. device detected")); - if (!(command & PCI_COMM_ME)) { - command |= PCI_COMM_ME; + instance->func_ptr = + &mrsas_function_template_fusion; + instance->tbolt = 1; + break; - pci_config_put16(instance->pci_handle, - PCI_CONF_COMM, command); + case PCI_DEVICE_ID_LSI_2108VDE: + case PCI_DEVICE_ID_LSI_2108V: + con_log(CL_ANN, (CE_NOTE, + "mr_sas: 2108 Liberator device detected")); - con_log(CL_ANN, (CE_CONT, "mr_sas%d: " - "enable bus-mastering", instance_no)); - } else { - con_log(CL_DLEVEL1, (CE_CONT, "mr_sas%d: " - "bus-mastering already set", instance_no)); - } + instance->func_ptr = + &mrsas_function_template_ppc; + break; - /* initialize function pointers */ - if ((device_id == PCI_DEVICE_ID_LSI_2108VDE) || - (device_id == PCI_DEVICE_ID_LSI_2108V)) { - con_log(CL_DLEVEL1, (CE_CONT, "mr_sas%d: " - "2108V/DE detected", instance_no)); - instance->func_ptr->read_fw_status_reg = - read_fw_status_reg_ppc; - instance->func_ptr->issue_cmd = issue_cmd_ppc; - instance->func_ptr->issue_cmd_in_sync_mode = - issue_cmd_in_sync_mode_ppc; - instance->func_ptr->issue_cmd_in_poll_mode = - issue_cmd_in_poll_mode_ppc; - instance->func_ptr->enable_intr = - enable_intr_ppc; - instance->func_ptr->disable_intr = - disable_intr_ppc; - instance->func_ptr->intr_ack = intr_ack_ppc; - } else { - con_log(CL_ANN, (CE_WARN, - "mr_sas: Invalid device detected")); + default: + cmn_err(CE_WARN, + "mr_sas: Invalid device detected"); - pci_config_teardown(&instance->pci_handle); - kmem_free(instance->func_ptr, - sizeof (struct mrsas_func_ptr)); - ddi_soft_state_free(mrsas_state, instance_no); + pci_config_teardown(&instance->pci_handle); + ddi_soft_state_free(mrsas_state, instance_no); + return (DDI_FAILURE); + } - return (DDI_FAILURE); + instance->baseaddress = pci_config_get32( + instance->pci_handle, PCI_CONF_BASE0); + instance->baseaddress &= 0x0fffc; + + instance->dip = dip; + instance->vendor_id = vendor_id; + instance->device_id = device_id; + instance->subsysvid = subsysvid; + instance->subsysid = subsysid; + instance->instance = instance_no; + + /* Initialize FMA */ + instance->fm_capabilities = ddi_prop_get_int( + DDI_DEV_T_ANY, instance->dip, DDI_PROP_DONTPASS, + "fm-capable", DDI_FM_EREPORT_CAPABLE | + 
DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE + | DDI_FM_ERRCB_CAPABLE); + + mrsas_fm_init(instance); + + /* Setup register map */ + if ((ddi_dev_regsize(instance->dip, + REGISTER_SET_IO_2108, ®length) != DDI_SUCCESS) || + reglength < MINIMUM_MFI_MEM_SZ) { + goto fail_attach; + } + if (reglength > DEFAULT_MFI_MEM_SZ) { + reglength = DEFAULT_MFI_MEM_SZ; + con_log(CL_DLEVEL1, (CE_NOTE, + "mr_sas: register length to map is 0x%lx bytes", + reglength)); + } + if (ddi_regs_map_setup(instance->dip, + REGISTER_SET_IO_2108, &instance->regmap, 0, + reglength, &endian_attr, &instance->regmap_handle) + != DDI_SUCCESS) { + cmn_err(CE_WARN, + "mr_sas: couldn't map control registers"); + goto fail_attach; + } + if (instance->regmap_handle == NULL) { + cmn_err(CE_WARN, + "mr_sas: couldn't map control registers"); + goto fail_attach; + } + + instance->unroll.regs = 1; + + /* + * Disable Interrupt Now. + * Setup Software interrupt + */ + instance->func_ptr->disable_intr(instance); + + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, + "mrsas-enable-msi", &data) == DDI_SUCCESS) { + if (strncmp(data, "no", 3) == 0) { + msi_enable = 0; + con_log(CL_ANN1, (CE_WARN, + "msi_enable = %d disabled", msi_enable)); } + ddi_prop_free(data); + } - instance->baseaddress = pci_config_get32( - instance->pci_handle, PCI_CONF_BASE0); - instance->baseaddress &= 0x0fffc; - - instance->dip = dip; - instance->vendor_id = vendor_id; - instance->device_id = device_id; - instance->subsysvid = subsysvid; - instance->subsysid = subsysid; - instance->instance = instance_no; - - /* Initialize FMA */ - instance->fm_capabilities = ddi_prop_get_int( - DDI_DEV_T_ANY, instance->dip, DDI_PROP_DONTPASS, - "fm-capable", DDI_FM_EREPORT_CAPABLE | - DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE - | DDI_FM_ERRCB_CAPABLE); - - mrsas_fm_init(instance); - - /* Initialize Interrupts */ - if ((ddi_dev_regsize(instance->dip, - REGISTER_SET_IO_2108, ®length) != DDI_SUCCESS) || - reglength < MINIMUM_MFI_MEM_SZ) { - return (DDI_FAILURE); + con_log(CL_DLEVEL1, (CE_NOTE, "msi_enable = %d", msi_enable)); + + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, + "mrsas-enable-fp", &data) == DDI_SUCCESS) { + if (strncmp(data, "no", 3) == 0) { + enable_fp = 0; + cmn_err(CE_NOTE, + "enable_fp = %d, Fast-Path disabled.\n", + enable_fp); + } + + ddi_prop_free(data); + } + + cmn_err(CE_NOTE, "enable_fp = %d\n", enable_fp); + + /* Check for all supported interrupt types */ + if (ddi_intr_get_supported_types( + dip, &intr_types) != DDI_SUCCESS) { + cmn_err(CE_WARN, + "ddi_intr_get_supported_types() failed"); + goto fail_attach; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "ddi_intr_get_supported_types() ret: 0x%x", intr_types)); + + /* Initialize and Setup Interrupt handler */ + if (msi_enable && (intr_types & DDI_INTR_TYPE_MSIX)) { + if (mrsas_add_intrs(instance, DDI_INTR_TYPE_MSIX) != + DDI_SUCCESS) { + cmn_err(CE_WARN, + "MSIX interrupt query failed"); + goto fail_attach; } - if (reglength > DEFAULT_MFI_MEM_SZ) { - reglength = DEFAULT_MFI_MEM_SZ; - con_log(CL_DLEVEL1, (CE_NOTE, - "mr_sas: register length to map is " - "0x%lx bytes", reglength)); + instance->intr_type = DDI_INTR_TYPE_MSIX; + } else if (msi_enable && (intr_types & DDI_INTR_TYPE_MSI)) { + if (mrsas_add_intrs(instance, DDI_INTR_TYPE_MSI) != + DDI_SUCCESS) { + cmn_err(CE_WARN, + "MSI interrupt query failed"); + goto fail_attach; } - if (ddi_regs_map_setup(instance->dip, - REGISTER_SET_IO_2108, &instance->regmap, 0, - reglength, &endian_attr, &instance->regmap_handle) - != DDI_SUCCESS) { - con_log(CL_ANN, 
(CE_NOTE, - "mr_sas: couldn't map control registers")); + instance->intr_type = DDI_INTR_TYPE_MSI; + } else if (intr_types & DDI_INTR_TYPE_FIXED) { + msi_enable = 0; + if (mrsas_add_intrs(instance, DDI_INTR_TYPE_FIXED) != + DDI_SUCCESS) { + cmn_err(CE_WARN, + "FIXED interrupt query failed"); goto fail_attach; } + instance->intr_type = DDI_INTR_TYPE_FIXED; + } else { + cmn_err(CE_WARN, "Device cannot " + "suppport either FIXED or MSI/X " + "interrupts"); + goto fail_attach; + } - /* - * Disable Interrupt Now. - * Setup Software interrupt - */ - instance->func_ptr->disable_intr(instance); + instance->unroll.intr = 1; - if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, - "mrsas-enable-msi", &data) == DDI_SUCCESS) { - if (strncmp(data, "no", 3) == 0) { - msi_enable = 0; - con_log(CL_ANN1, (CE_WARN, - "msi_enable = %d disabled", - msi_enable)); - } - ddi_prop_free(data); + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, + "mrsas-enable-ctio", &data) == DDI_SUCCESS) { + if (strncmp(data, "no", 3) == 0) { + ctio_enable = 0; + con_log(CL_ANN1, (CE_WARN, + "ctio_enable = %d disabled", ctio_enable)); } + ddi_prop_free(data); + } - con_log(CL_DLEVEL1, (CE_WARN, "msi_enable = %d", - msi_enable)); + con_log(CL_DLEVEL1, (CE_WARN, "ctio_enable = %d", ctio_enable)); - /* Check for all supported interrupt types */ - if (ddi_intr_get_supported_types( - dip, &intr_types) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "ddi_intr_get_supported_types() failed")); - goto fail_attach; - } + /* setup the mfi based low level driver */ + if (mrsas_init_adapter(instance) != DDI_SUCCESS) { + cmn_err(CE_WARN, "mr_sas: " + "could not initialize the low level driver"); - con_log(CL_DLEVEL1, (CE_NOTE, - "ddi_intr_get_supported_types() ret: 0x%x", - intr_types)); + goto fail_attach; + } - /* Initialize and Setup Interrupt handler */ - if (msi_enable && (intr_types & DDI_INTR_TYPE_MSIX)) { - if (mrsas_add_intrs(instance, - DDI_INTR_TYPE_MSIX) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "MSIX interrupt query failed")); - goto fail_attach; - } - instance->intr_type = DDI_INTR_TYPE_MSIX; - } else if (msi_enable && (intr_types & - DDI_INTR_TYPE_MSI)) { - if (mrsas_add_intrs(instance, - DDI_INTR_TYPE_MSI) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "MSI interrupt query failed")); - goto fail_attach; - } - instance->intr_type = DDI_INTR_TYPE_MSI; - } else if (intr_types & DDI_INTR_TYPE_FIXED) { - msi_enable = 0; - if (mrsas_add_intrs(instance, - DDI_INTR_TYPE_FIXED) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "FIXED interrupt query failed")); - goto fail_attach; - } - instance->intr_type = DDI_INTR_TYPE_FIXED; - } else { - con_log(CL_ANN, (CE_WARN, "Device cannot " - "suppport either FIXED or MSI/X " - "interrupts")); - goto fail_attach; - } + /* Initialize all Mutex */ + INIT_LIST_HEAD(&instance->completed_pool_list); + mutex_init(&instance->completed_pool_mtx, + "completed_pool_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); - added_isr_f = 1; + mutex_init(&instance->sync_map_mtx, + "sync_map_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); - if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, - "mrsas-enable-ctio", &data) == DDI_SUCCESS) { - if (strncmp(data, "no", 3) == 0) { - ctio_enable = 0; - con_log(CL_ANN1, (CE_WARN, - "ctio_enable = %d disabled", - ctio_enable)); - } - ddi_prop_free(data); - } + mutex_init(&instance->app_cmd_pool_mtx, + "app_cmd_pool_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); - con_log(CL_DLEVEL1, (CE_WARN, "ctio_enable = %d", - ctio_enable)); + 
mutex_init(&instance->config_dev_mtx, "config_dev_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); - /* setup the mfi based low level driver */ - if (init_mfi(instance) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, "mr_sas: " - "could not initialize the low level driver")); + mutex_init(&instance->cmd_pend_mtx, "cmd_pend_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); - goto fail_attach; - } + mutex_init(&instance->ocr_flags_mtx, "ocr_flags_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); - /* Initialize all Mutex */ - INIT_LIST_HEAD(&instance->completed_pool_list); - mutex_init(&instance->completed_pool_mtx, - "completed_pool_mtx", MUTEX_DRIVER, - DDI_INTR_PRI(instance->intr_pri)); + mutex_init(&instance->int_cmd_mtx, "int_cmd_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + cv_init(&instance->int_cmd_cv, NULL, CV_DRIVER, NULL); - mutex_init(&instance->app_cmd_pool_mtx, - "app_cmd_pool_mtx", MUTEX_DRIVER, - DDI_INTR_PRI(instance->intr_pri)); + mutex_init(&instance->cmd_pool_mtx, "cmd_pool_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + + mutex_init(&instance->reg_write_mtx, "reg_write_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); - mutex_init(&instance->cmd_pend_mtx, "cmd_pend_mtx", - MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + if (instance->tbolt) { + mutex_init(&instance->cmd_app_pool_mtx, + "cmd_app_pool_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); - mutex_init(&instance->ocr_flags_mtx, "ocr_flags_mtx", - MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + mutex_init(&instance->chip_mtx, + "chip_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); - mutex_init(&instance->int_cmd_mtx, "int_cmd_mtx", - MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); - cv_init(&instance->int_cmd_cv, NULL, CV_DRIVER, NULL); + } - mutex_init(&instance->cmd_pool_mtx, "cmd_pool_mtx", - MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + instance->unroll.mutexs = 1; - instance->timeout_id = (timeout_id_t)-1; + instance->timeout_id = (timeout_id_t)-1; - /* Register our soft-isr for highlevel interrupts. */ - instance->isr_level = instance->intr_pri; + /* Register our soft-isr for highlevel interrupts. 
*/ + instance->isr_level = instance->intr_pri; + if (!(instance->tbolt)) { if (instance->isr_level == HIGH_LEVEL_INTR) { - if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, + if (ddi_add_softintr(dip, + DDI_SOFTINT_HIGH, &instance->soft_intr_id, NULL, NULL, mrsas_softintr, (caddr_t)instance) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - " Software ISR did not register")); + cmn_err(CE_WARN, + "Software ISR did not register"); goto fail_attach; } - added_soft_isr_f = 1; - } - - /* Allocate a transport structure */ - tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP); + instance->unroll.soft_isr = 1; - if (tran == NULL) { - con_log(CL_ANN, (CE_WARN, - "scsi_hba_tran_alloc failed")); - goto fail_attach; } + } + + instance->softint_running = 0; - tran_alloc_f = 1; + /* Allocate a transport structure */ + tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP); - instance->tran = tran; + if (tran == NULL) { + cmn_err(CE_WARN, + "scsi_hba_tran_alloc failed"); + goto fail_attach; + } - tran->tran_hba_private = instance; - tran->tran_tgt_init = mrsas_tran_tgt_init; - tran->tran_tgt_probe = scsi_hba_probe; - tran->tran_tgt_free = mrsas_tran_tgt_free; + instance->tran = tran; + instance->unroll.tran = 1; + + tran->tran_hba_private = instance; + tran->tran_tgt_init = mrsas_tran_tgt_init; + tran->tran_tgt_probe = scsi_hba_probe; + tran->tran_tgt_free = mrsas_tran_tgt_free; + if (instance->tbolt) { + tran->tran_init_pkt = + mrsas_tbolt_tran_init_pkt; + tran->tran_start = + mrsas_tbolt_tran_start; + } else { tran->tran_init_pkt = mrsas_tran_init_pkt; tran->tran_start = mrsas_tran_start; - tran->tran_abort = mrsas_tran_abort; - tran->tran_reset = mrsas_tran_reset; - tran->tran_getcap = mrsas_tran_getcap; - tran->tran_setcap = mrsas_tran_setcap; - tran->tran_destroy_pkt = mrsas_tran_destroy_pkt; - tran->tran_dmafree = mrsas_tran_dmafree; - tran->tran_sync_pkt = mrsas_tran_sync_pkt; - tran->tran_bus_config = mrsas_tran_bus_config; - - if (mrsas_relaxed_ordering) - mrsas_generic_dma_attr.dma_attr_flags |= - DDI_DMA_RELAXED_ORDERING; - - - tran_dma_attr = mrsas_generic_dma_attr; - tran_dma_attr.dma_attr_sgllen = instance->max_num_sge; - - /* Attach this instance of the hba */ - if (scsi_hba_attach_setup(dip, &tran_dma_attr, tran, 0) - != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "scsi_hba_attach failed")); + } + tran->tran_abort = mrsas_tran_abort; + tran->tran_reset = mrsas_tran_reset; + tran->tran_getcap = mrsas_tran_getcap; + tran->tran_setcap = mrsas_tran_setcap; + tran->tran_destroy_pkt = mrsas_tran_destroy_pkt; + tran->tran_dmafree = mrsas_tran_dmafree; + tran->tran_sync_pkt = mrsas_tran_sync_pkt; + tran->tran_quiesce = mrsas_tran_quiesce; + tran->tran_unquiesce = mrsas_tran_unquiesce; + tran->tran_bus_config = mrsas_tran_bus_config; + + if (mrsas_relaxed_ordering) + mrsas_generic_dma_attr.dma_attr_flags |= + DDI_DMA_RELAXED_ORDERING; + + + tran_dma_attr = mrsas_generic_dma_attr; + tran_dma_attr.dma_attr_sgllen = instance->max_num_sge; + + /* Attach this instance of the hba */ + if (scsi_hba_attach_setup(dip, &tran_dma_attr, tran, 0) + != DDI_SUCCESS) { + cmn_err(CE_WARN, + "scsi_hba_attach failed"); + + goto fail_attach; + } + instance->unroll.tranSetup = 1; + con_log(CL_ANN1, + (CE_CONT, "scsi_hba_attach_setup() done.")); + + /* create devctl node for cfgadm command */ + if (ddi_create_minor_node(dip, "devctl", + S_IFCHR, INST2DEVCTL(instance_no), + DDI_NT_SCSI_NEXUS, 0) == DDI_FAILURE) { + cmn_err(CE_WARN, + "mr_sas: failed to create devctl node."); + + goto fail_attach; + } - goto fail_attach; - } + 
instance->unroll.devctl = 1; - /* create devctl node for cfgadm command */ - if (ddi_create_minor_node(dip, "devctl", - S_IFCHR, INST2DEVCTL(instance_no), - DDI_NT_SCSI_NEXUS, 0) == DDI_FAILURE) { - con_log(CL_ANN, (CE_WARN, - "mr_sas: failed to create devctl node.")); + /* create scsi node for cfgadm command */ + if (ddi_create_minor_node(dip, "scsi", S_IFCHR, + INST2SCSI(instance_no), DDI_NT_SCSI_ATTACHMENT_POINT, 0) == + DDI_FAILURE) { + cmn_err(CE_WARN, + "mr_sas: failed to create scsi node."); - goto fail_attach; - } + goto fail_attach; + } - create_devctl_node_f = 1; + instance->unroll.scsictl = 1; - /* create scsi node for cfgadm command */ - if (ddi_create_minor_node(dip, "scsi", S_IFCHR, - INST2SCSI(instance_no), - DDI_NT_SCSI_ATTACHMENT_POINT, 0) == - DDI_FAILURE) { - con_log(CL_ANN, (CE_WARN, - "mr_sas: failed to create scsi node.")); + (void) sprintf(instance->iocnode, "%d:lsirdctl", + instance_no); - goto fail_attach; - } + /* + * Create a node for applications + * for issuing ioctl to the driver. + */ + if (ddi_create_minor_node(dip, instance->iocnode, + S_IFCHR, INST2LSIRDCTL(instance_no), DDI_PSEUDO, 0) == + DDI_FAILURE) { + cmn_err(CE_WARN, + "mr_sas: failed to create ioctl node."); - create_scsi_node_f = 1; + goto fail_attach; + } - (void) sprintf(instance->iocnode, "%d:lsirdctl", - instance_no); + instance->unroll.ioctl = 1; - /* - * Create a node for applications - * for issuing ioctl to the driver. - */ - if (ddi_create_minor_node(dip, instance->iocnode, - S_IFCHR, INST2LSIRDCTL(instance_no), - DDI_PSEUDO, 0) == DDI_FAILURE) { - con_log(CL_ANN, (CE_WARN, - "mr_sas: failed to create ioctl node.")); + /* Create a taskq to handle dr events */ + if ((instance->taskq = ddi_taskq_create(dip, + "mrsas_dr_taskq", 1, TASKQ_DEFAULTPRI, 0)) == NULL) { + cmn_err(CE_WARN, + "mr_sas: failed to create taskq "); + instance->taskq = NULL; + goto fail_attach; + } + instance->unroll.taskq = 1; + con_log(CL_ANN1, (CE_CONT, "ddi_taskq_create() done.")); - goto fail_attach; - } + /* enable interrupt */ + instance->func_ptr->enable_intr(instance); - create_ioc_node_f = 1; + /* initiate AEN */ + if (start_mfi_aen(instance)) { + cmn_err(CE_WARN, + "mr_sas: failed to initiate AEN."); + goto fail_attach; + } + instance->unroll.aenPend = 1; + con_log(CL_ANN1, + (CE_CONT, "AEN started for instance %d.", instance_no)); - /* Create a taskq to handle dr events */ - if ((instance->taskq = ddi_taskq_create(dip, - "mrsas_dr_taskq", 1, - TASKQ_DEFAULTPRI, 0)) == NULL) { - con_log(CL_ANN, (CE_WARN, - "mr_sas: failed to create taskq ")); - instance->taskq = NULL; - goto fail_attach; - } + /* Finally! We are on the air. */ + ddi_report_dev(dip); - /* enable interrupt */ - instance->func_ptr->enable_intr(instance); + /* FMA handle checking. 
*/ + if (mrsas_check_acc_handle(instance->regmap_handle) != + DDI_SUCCESS) { + goto fail_attach; + } + if (mrsas_check_acc_handle(instance->pci_handle) != + DDI_SUCCESS) { + goto fail_attach; + } - /* initiate AEN */ - if (start_mfi_aen(instance)) { - con_log(CL_ANN, (CE_WARN, - "mr_sas: failed to initiate AEN.")); - goto fail_initiate_aen; + instance->mr_ld_list = + kmem_zalloc(MRDRV_MAX_LD * sizeof (struct mrsas_ld), + KM_SLEEP); + if (instance->mr_ld_list == NULL) { + cmn_err(CE_WARN, "mr_sas attach(): " + "failed to allocate ld_list array"); + goto fail_attach; + } + instance->unroll.ldlist_buff = 1; + +#ifdef PDSUPPORT + if (instance->tbolt) { + instance->mr_tbolt_pd_max = MRSAS_TBOLT_PD_TGT_MAX; + instance->mr_tbolt_pd_list = + kmem_zalloc(MRSAS_TBOLT_GET_PD_MAX(instance) * + sizeof (struct mrsas_tbolt_pd), KM_SLEEP); + ASSERT(instance->mr_tbolt_pd_list); + for (i = 0; i < instance->mr_tbolt_pd_max; i++) { + instance->mr_tbolt_pd_list[i].lun_type = + MRSAS_TBOLT_PD_LUN; + instance->mr_tbolt_pd_list[i].dev_id = + (uint8_t)i; } - con_log(CL_DLEVEL1, (CE_NOTE, - "AEN started for instance %d.", instance_no)); - - /* Finally! We are on the air. */ - ddi_report_dev(dip); - - if (mrsas_check_acc_handle(instance->regmap_handle) != - DDI_SUCCESS) { - goto fail_attach; - } - if (mrsas_check_acc_handle(instance->pci_handle) != - DDI_SUCCESS) { - goto fail_attach; - } - instance->mr_ld_list = - kmem_zalloc(MRDRV_MAX_LD * sizeof (struct mrsas_ld), - KM_SLEEP); - break; - case DDI_PM_RESUME: - con_log(CL_ANN, (CE_NOTE, - "mr_sas: DDI_PM_RESUME")); - break; - case DDI_RESUME: - con_log(CL_ANN, (CE_NOTE, - "mr_sas: DDI_RESUME")); - break; - default: - con_log(CL_ANN, (CE_WARN, - "mr_sas: invalid attach cmd=%x", cmd)); - return (DDI_FAILURE); + instance->unroll.pdlist_buff = 1; + } +#endif + break; + case DDI_PM_RESUME: + con_log(CL_ANN, (CE_NOTE, "mr_sas: DDI_PM_RESUME")); + break; + case DDI_RESUME: + con_log(CL_ANN, (CE_NOTE, "mr_sas: DDI_RESUME")); + break; + default: + con_log(CL_ANN, + (CE_WARN, "mr_sas: invalid attach cmd=%x", cmd)); + return (DDI_FAILURE); } + + cmn_err(CE_NOTE, "mrsas_attach() return SUCCESS instance_num %d", + instance_no); return (DDI_SUCCESS); -fail_initiate_aen: fail_attach: - if (create_devctl_node_f) { - ddi_remove_minor_node(dip, "devctl"); - } - - if (create_scsi_node_f) { - ddi_remove_minor_node(dip, "scsi"); - } - - if (create_ioc_node_f) { - ddi_remove_minor_node(dip, instance->iocnode); - } - - if (tran_alloc_f) { - scsi_hba_tran_free(tran); - } - - - if (added_soft_isr_f) { - ddi_remove_softintr(instance->soft_intr_id); - } - - if (added_isr_f) { - mrsas_rem_intrs(instance); - } - if (instance && instance->taskq) { - ddi_taskq_destroy(instance->taskq); - } + mrsas_undo_resources(dip, instance); mrsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); @@ -780,15 +1020,30 @@ fail_attach: mrsas_fm_fini(instance); pci_config_teardown(&instance->pci_handle); - ddi_soft_state_free(mrsas_state, instance_no); - con_log(CL_ANN, (CE_NOTE, - "mr_sas: return failure from mrsas_attach")); + con_log(CL_ANN, (CE_WARN, "mr_sas: return failure from mrsas_attach")); + + cmn_err(CE_WARN, "mrsas_attach() return FAILURE instance_num %d", + instance_no); return (DDI_FAILURE); } +/* + * getinfo - gets device information + * @dip: + * @cmd: + * @arg: + * @resultp: + * + * The system calls getinfo() to obtain configuration information that only + * the driver knows. 
The mapping of minor numbers to device instance is + * entirely under the control of the driver. The system sometimes needs to ask + * the driver which device a particular dev_t represents. + * Given the device number return the devinfo pointer from the scsi_device + * structure. + */ /*ARGSUSED*/ static int mrsas_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) @@ -827,6 +1082,19 @@ mrsas_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) return (rval); } +/* + * detach - detaches a device from the system + * @dip: pointer to the device's dev_info structure + * @cmd: type of detach + * + * A driver's detach() entry point is called to detach an instance of a device + * that is bound to the driver. The entry point is called with the instance of + * the device node to be detached and with DDI_DETACH, which is specified as + * the cmd argument to the entry point. + * This routine is called during driver unload. We free all the allocated + * resources and call the corresponding LLD so that it can also release all + * its resources. + */ static int mrsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { @@ -834,7 +1102,8 @@ mrsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) struct mrsas_instance *instance; - con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + con_log(CL_ANN, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + /* CONSTCOND */ ASSERT(NO_COMPETING_THREADS); @@ -845,9 +1114,9 @@ mrsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) instance_no); if (!instance) { - con_log(CL_ANN, (CE_WARN, + cmn_err(CE_WARN, "mr_sas:%d could not get instance in detach", - instance_no)); + instance_no); return (DDI_FAILURE); } @@ -858,84 +1127,253 @@ mrsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) instance->subsysvid, instance->subsysid)); switch (cmd) { - case DDI_DETACH: - con_log(CL_ANN, (CE_NOTE, - "mrsas_detach: DDI_DETACH")); + case DDI_DETACH: + con_log(CL_ANN, (CE_NOTE, + "mrsas_detach: DDI_DETACH")); + + mutex_enter(&instance->config_dev_mtx); + if (instance->timeout_id != (timeout_id_t)-1) { + mutex_exit(&instance->config_dev_mtx); + (void) untimeout(instance->timeout_id); + instance->timeout_id = (timeout_id_t)-1; + mutex_enter(&instance->config_dev_mtx); + instance->unroll.timer = 0; + } + mutex_exit(&instance->config_dev_mtx); - if (scsi_hba_detach(dip) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "mr_sas:%d failed to detach", - instance_no)); + if (instance->unroll.tranSetup == 1) { + if (scsi_hba_detach(dip) != DDI_SUCCESS) { + cmn_err(CE_WARN, + "mr_sas2%d: failed to detach", + instance_no); + return (DDI_FAILURE); + } + instance->unroll.tranSetup = 0; + con_log(CL_ANN1, + (CE_CONT, "scsi_hba_dettach() done.")); + } + + flush_cache(instance); + + mrsas_undo_resources(dip, instance); + + mrsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + ddi_soft_state_free(mrsas_state, instance_no); + break; + case DDI_PM_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "mrsas_detach: DDI_PM_SUSPEND")); + + break; + case DDI_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "mrsas_detach: DDI_SUSPEND")); + + break; + default: + con_log(CL_ANN, (CE_WARN, + "invalid detach command:0x%x", cmd)); return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + + +static void +mrsas_undo_resources(dev_info_t *dip, struct mrsas_instance *instance) +{ + int instance_no; + + con_log(CL_ANN, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + + instance_no = ddi_get_instance(dip); + + + if (instance->unroll.ioctl == 1) { + ddi_remove_minor_node(dip, 
instance->iocnode); + instance->unroll.ioctl = 0; + } + + if (instance->unroll.scsictl == 1) { + ddi_remove_minor_node(dip, "scsi"); + instance->unroll.scsictl = 0; + } + + if (instance->unroll.devctl == 1) { + ddi_remove_minor_node(dip, "devctl"); + instance->unroll.devctl = 0; + } + + if (instance->unroll.tranSetup == 1) { + if (scsi_hba_detach(dip) != DDI_SUCCESS) { + cmn_err(CE_WARN, + "mr_sas2%d: failed to detach", instance_no); + return; /* DDI_FAILURE */ } + instance->unroll.tranSetup = 0; + con_log(CL_ANN1, (CE_CONT, "scsi_hba_dettach() done.")); + } + if (instance->unroll.tran == 1) { scsi_hba_tran_free(instance->tran); + instance->unroll.tran = 0; + con_log(CL_ANN1, (CE_CONT, "scsi_hba_tran_free() done.")); + } - flush_cache(instance); + if (instance->unroll.syncCmd == 1) { + if (instance->tbolt) { + if (abort_syncmap_cmd(instance, + instance->map_update_cmd)) { + cmn_err(CE_WARN, "mrsas_detach: " + "failed to abort previous syncmap command"); + } - if (abort_aen_cmd(instance, instance->aen_cmd)) { - con_log(CL_ANN, (CE_WARN, "mrsas_detach: " - "failed to abort prevous AEN command")); + instance->unroll.syncCmd = 0; + con_log(CL_ANN1, (CE_CONT, "sync cmd aborted, done.")); + } + } - return (DDI_FAILURE); + if (instance->unroll.aenPend == 1) { + if (abort_aen_cmd(instance, instance->aen_cmd)) + cmn_err(CE_WARN, "mrsas_detach: " + "failed to abort prevous AEN command"); + + instance->unroll.aenPend = 0; + con_log(CL_ANN1, (CE_CONT, "aen cmd aborted, done.")); + /* This means the controller is fully initialzed and running */ + /* Shutdown should be a last command to controller. */ + /* shutdown_controller(); */ + } + + + if (instance->unroll.timer == 1) { + if (instance->timeout_id != (timeout_id_t)-1) { + (void) untimeout(instance->timeout_id); + instance->timeout_id = (timeout_id_t)-1; + + instance->unroll.timer = 0; } + } - instance->func_ptr->disable_intr(instance); + instance->func_ptr->disable_intr(instance); - if (instance->isr_level == HIGH_LEVEL_INTR) { - ddi_remove_softintr(instance->soft_intr_id); + + if (instance->unroll.mutexs == 1) { + mutex_destroy(&instance->cmd_pool_mtx); + mutex_destroy(&instance->app_cmd_pool_mtx); + mutex_destroy(&instance->cmd_pend_mtx); + mutex_destroy(&instance->completed_pool_mtx); + mutex_destroy(&instance->sync_map_mtx); + mutex_destroy(&instance->int_cmd_mtx); + cv_destroy(&instance->int_cmd_cv); + mutex_destroy(&instance->config_dev_mtx); + mutex_destroy(&instance->ocr_flags_mtx); + mutex_destroy(&instance->reg_write_mtx); + + if (instance->tbolt) { + mutex_destroy(&instance->cmd_app_pool_mtx); + mutex_destroy(&instance->chip_mtx); } + instance->unroll.mutexs = 0; + con_log(CL_ANN1, (CE_CONT, "Destroy mutex & cv, done.")); + } + + + if (instance->unroll.soft_isr == 1) { + ddi_remove_softintr(instance->soft_intr_id); + instance->unroll.soft_isr = 0; + } + + if (instance->unroll.intr == 1) { mrsas_rem_intrs(instance); + instance->unroll.intr = 0; + } + + if (instance->unroll.taskq == 1) { if (instance->taskq) { ddi_taskq_destroy(instance->taskq); + instance->unroll.taskq = 0; } - kmem_free(instance->mr_ld_list, MRDRV_MAX_LD - * sizeof (struct mrsas_ld)); - free_space_for_mfi(instance); - mrsas_fm_fini(instance); + } + + /* + * free dma memory allocated for + * cmds/frames/queues/driver version etc + */ + if (instance->unroll.verBuff == 1) { + (void) mrsas_free_dma_obj(instance, instance->drv_ver_dma_obj); + instance->unroll.verBuff = 0; + } - pci_config_teardown(&instance->pci_handle); + if (instance->unroll.pdlist_buff == 1) { + if 
(instance->mr_tbolt_pd_list != NULL) { + kmem_free(instance->mr_tbolt_pd_list, + MRSAS_TBOLT_GET_PD_MAX(instance) * + sizeof (struct mrsas_tbolt_pd)); + } - kmem_free(instance->func_ptr, - sizeof (struct mrsas_func_ptr)); + instance->mr_tbolt_pd_list = NULL; + instance->unroll.pdlist_buff = 0; + } - if (instance->timeout_id != (timeout_id_t)-1) { - (void) untimeout(instance->timeout_id); - instance->timeout_id = (timeout_id_t)-1; + if (instance->unroll.ldlist_buff == 1) { + if (instance->mr_ld_list != NULL) { + kmem_free(instance->mr_ld_list, MRDRV_MAX_LD + * sizeof (struct mrsas_ld)); } - ddi_soft_state_free(mrsas_state, instance_no); - break; - case DDI_PM_SUSPEND: - con_log(CL_ANN, (CE_NOTE, - "mrsas_detach: DDI_PM_SUSPEND")); - break; - case DDI_SUSPEND: - con_log(CL_ANN, (CE_NOTE, - "mrsas_detach: DDI_SUSPEND")); + instance->mr_ld_list = NULL; + instance->unroll.ldlist_buff = 0; + } - break; - default: - con_log(CL_ANN, (CE_WARN, - "invalid detach command:0x%x", cmd)); - return (DDI_FAILURE); + if (instance->tbolt) { + if (instance->unroll.alloc_space_mpi2 == 1) { + free_space_for_mpi2(instance); + instance->unroll.alloc_space_mpi2 = 0; + } + } else { + if (instance->unroll.alloc_space_mfi == 1) { + free_space_for_mfi(instance); + instance->unroll.alloc_space_mfi = 0; + } } - return (DDI_SUCCESS); + if (instance->unroll.regs == 1) { + ddi_regs_map_free(&instance->regmap_handle); + instance->unroll.regs = 0; + con_log(CL_ANN1, (CE_CONT, "ddi_regs_map_free() done.")); + } } + + /* * ************************************************************************** * - * * - * common entry points - for character driver types * - * * + * * + * common entry points - for character driver types * + * * * ************************************************************************** * */ -static int +/* + * open - gets access to a device + * @dev: + * @openflags: + * @otyp: + * @credp: + * + * Access to a device by one or more application programs is controlled + * through the open() and close() entry points. The primary function of + * open() is to verify that the open request is allowed. + */ +static int mrsas_open(dev_t *dev, int openflags, int otyp, cred_t *credp) { int rval = 0; @@ -968,7 +1406,17 @@ mrsas_open(dev_t *dev, int openflags, int otyp, cred_t *credp) return (rval); } -static int +/* + * close - gives up access to a device + * @dev: + * @openflags: + * @otyp: + * @credp: + * + * close() should perform any cleanup necessary to finish using the minor + * device, and prepare the device (and driver) to be opened again. + */ +static int mrsas_close(dev_t dev, int openflags, int otyp, cred_t *credp) { int rval = 0; @@ -984,6 +1432,23 @@ mrsas_close(dev_t dev, int openflags, int otyp, cred_t *credp) return (rval); } +/* + * ioctl - performs a range of I/O commands for character drivers + * @dev: + * @cmd: + * @arg: + * @mode: + * @credp: + * @rvalp: + * + * ioctl() routine must make sure that user data is copied into or out of the + * kernel address space explicitly using copyin(), copyout(), ddi_copyin(), + * and ddi_copyout(), as appropriate. + * This is a wrapper routine to serialize access to the actual ioctl routine. + * ioctl() should return 0 on success, or the appropriate error number. The + * driver may also set the value returned to the calling process through rvalp. 
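To make the copyin/copyout rule just described concrete, a hedged fragment follows; struct my_ioc is an invented payload type for illustration, not a structure from this driver.

static int
example_ioctl_copy(intptr_t arg, int mode)
{
	struct my_ioc {
		uint32_t opcode;
		uint32_t status;
	} ioc;

	/* copy the user argument in; mode preserves FKIOCTL/model information */
	if (ddi_copyin((void *)arg, &ioc, sizeof (ioc), mode) != 0)
		return (EFAULT);

	ioc.status = 0;			/* ... act on ioc.opcode here ... */

	/* copy the result back out to the caller's buffer */
	if (ddi_copyout(&ioc, (void *)arg, sizeof (ioc), mode) != 0)
		return (EFAULT);

	return (0);
}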
+ */ + static int mrsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) @@ -1005,7 +1470,12 @@ mrsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, ioctl = (struct mrsas_ioctl *)kmem_zalloc(sizeof (struct mrsas_ioctl), KM_SLEEP); - ASSERT(ioctl); + if (ioctl == NULL) { + /* Failed to allocate memory for ioctl */ + con_log(CL_ANN, (CE_WARN, "mr_sas_ioctl: " + "failed to allocate memory for ioctl")); + return (ENXIO); + } switch ((uint_t)cmd) { case MRSAS_IOCTL_FIRMWARE: @@ -1032,6 +1502,9 @@ mrsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, break; case MRSAS_IOCTL_AEN: + con_log(CL_ANN, + (CE_NOTE, "mrsas_ioctl: IOCTL Register AEN.\n")); + if (ddi_copyin((void *) arg, &aen, sizeof (struct mrsas_aen), mode)) { con_log(CL_ANN, (CE_WARN, @@ -1064,12 +1537,19 @@ mrsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, /* * ************************************************************************** * - * * - * common entry points - for block driver types * - * * + * * + * common entry points - for block driver types * + * * * ************************************************************************** * */ -#ifdef __sparc +#ifdef __sparc +/* + * reset - TBD + * @dip: + * @cmd: + * + * TBD + */ /*ARGSUSED*/ static int mrsas_reset(dev_info_t *dip, ddi_reset_cmd_t cmd) @@ -1092,7 +1572,7 @@ mrsas_reset(dev_info_t *dip, ddi_reset_cmd_t cmd) instance->func_ptr->disable_intr(instance); - con_log(CL_ANN1, (CE_NOTE, "flushing cache for instance %d", + con_log(CL_ANN1, (CE_CONT, "flushing cache for instance %d", instance_no)); flush_cache(instance); @@ -1130,14 +1610,26 @@ mrsas_quiesce(dev_info_t *dip) "failed to abort prevous AEN command QUIESCE")); } + if (instance->tbolt) { + if (abort_syncmap_cmd(instance, + instance->map_update_cmd)) { + cmn_err(CE_WARN, + "mrsas_detach: failed to abort " + "previous syncmap command"); + return (DDI_FAILURE); + } + } + instance->func_ptr->disable_intr(instance); - con_log(CL_ANN1, (CE_NOTE, "flushing cache for instance %d", + con_log(CL_ANN1, (CE_CONT, "flushing cache for instance %d", instance_no)); flush_cache(instance); if (wait_for_outstanding(instance)) { + con_log(CL_ANN1, + (CE_CONT, "wait_for_outstanding: return FAIL.\n")); return (DDI_FAILURE); } return (DDI_SUCCESS); @@ -1146,11 +1638,24 @@ mrsas_quiesce(dev_info_t *dip) /* * ************************************************************************** * - * * - * entry points (SCSI HBA) * - * * + * * + * entry points (SCSI HBA) * + * * * ************************************************************************** * */ +/* + * tran_tgt_init - initialize a target device instance + * @hba_dip: + * @tgt_dip: + * @tran: + * @sd: + * + * The tran_tgt_init() entry point enables the HBA to allocate and initialize + * any per-target resources. tran_tgt_init() also enables the HBA to qualify + * the device's address as valid and supportable for that particular HBA. + * By returning DDI_FAILURE, the instance of the target driver for that device + * is not probed or attached. 
+ */ /*ARGSUSED*/ static int mrsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, @@ -1159,32 +1664,61 @@ mrsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, struct mrsas_instance *instance; uint16_t tgt = sd->sd_address.a_target; uint8_t lun = sd->sd_address.a_lun; + dev_info_t *child = NULL; - con_log(CL_ANN1, (CE_NOTE, "mrsas_tgt_init target %d lun %d", + con_log(CL_DLEVEL2, (CE_NOTE, "mrsas_tgt_init target %d lun %d", tgt, lun)); instance = ADDR2MR(&sd->sd_address); if (ndi_dev_is_persistent_node(tgt_dip) == 0) { - (void) ndi_merge_node(tgt_dip, mrsas_name_node); - ddi_set_name_addr(tgt_dip, NULL); - - con_log(CL_ANN1, (CE_NOTE, "mrsas_tgt_init in " - "ndi_dev_is_persistent_node DDI_FAILURE t = %d l = %d", - tgt, lun)); + /* + * If no persistent node exists, we don't allow .conf node + * to be created. + */ + if ((child = mrsas_find_child(instance, tgt, lun)) != NULL) { + con_log(CL_DLEVEL2, + (CE_NOTE, "mrsas_tgt_init find child =" + " %p t = %d l = %d", (void *)child, tgt, lun)); + if (ndi_merge_node(tgt_dip, mrsas_name_node) != + DDI_SUCCESS) + /* Create this .conf node */ + return (DDI_SUCCESS); + } + con_log(CL_DLEVEL2, (CE_NOTE, "mrsas_tgt_init in ndi_per " + "DDI_FAILURE t = %d l = %d", tgt, lun)); return (DDI_FAILURE); + } - con_log(CL_ANN1, (CE_NOTE, "mrsas_tgt_init dev_dip %p tgt_dip %p", + con_log(CL_DLEVEL2, (CE_NOTE, "mrsas_tgt_init dev_dip %p tgt_dip %p", (void *)instance->mr_ld_list[tgt].dip, (void *)tgt_dip)); if (tgt < MRDRV_MAX_LD && lun == 0) { if (instance->mr_ld_list[tgt].dip == NULL && strcmp(ddi_driver_name(sd->sd_dev), "sd") == 0) { + mutex_enter(&instance->config_dev_mtx); instance->mr_ld_list[tgt].dip = tgt_dip; instance->mr_ld_list[tgt].lun_type = MRSAS_LD_LUN; + instance->mr_ld_list[tgt].flag = MRDRV_TGT_VALID; + mutex_exit(&instance->config_dev_mtx); + } + } + +#ifdef PDSUPPORT + else if (instance->tbolt) { + if (instance->mr_tbolt_pd_list[tgt].dip == NULL) { + mutex_enter(&instance->config_dev_mtx); + instance->mr_tbolt_pd_list[tgt].dip = tgt_dip; + instance->mr_tbolt_pd_list[tgt].flag = + MRDRV_TGT_VALID; + mutex_exit(&instance->config_dev_mtx); + con_log(CL_ANN1, (CE_NOTE, "mrsas_tran_tgt_init:" + "t%xl%x", tgt, lun)); } } +#endif + return (DDI_SUCCESS); } @@ -1199,16 +1733,29 @@ mrsas_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, instance = ADDR2MR(&sd->sd_address); - con_log(CL_ANN1, (CE_NOTE, "tgt_free t = %d l = %d", tgt, lun)); + con_log(CL_DLEVEL2, (CE_NOTE, "tgt_free t = %d l = %d", tgt, lun)); if (tgt < MRDRV_MAX_LD && lun == 0) { if (instance->mr_ld_list[tgt].dip == tgt_dip) { + mutex_enter(&instance->config_dev_mtx); instance->mr_ld_list[tgt].dip = NULL; + mutex_exit(&instance->config_dev_mtx); } } + +#ifdef PDSUPPORT + else if (instance->tbolt) { + mutex_enter(&instance->config_dev_mtx); + instance->mr_tbolt_pd_list[tgt].dip = NULL; + mutex_exit(&instance->config_dev_mtx); + con_log(CL_ANN1, (CE_NOTE, "tgt_free: Setting dip = NULL" + "for tgt:%x", tgt)); + } +#endif + } -static dev_info_t * +dev_info_t * mrsas_find_child(struct mrsas_instance *instance, uint16_t tgt, uint8_t lun) { dev_info_t *child = NULL; @@ -1219,6 +1766,11 @@ mrsas_find_child(struct mrsas_instance *instance, uint16_t tgt, uint8_t lun) for (child = ddi_get_child(instance->dip); child; child = ddi_get_next_sibling(child)) { + /* XXX KEBE ASKS - why was this added?! 
*/ + if (ndi_dev_is_persistent_node(child) == 0) { + continue; + } + if (mrsas_name_node(child, tmp, MAXNAMELEN) != DDI_SUCCESS) { continue; @@ -1228,11 +1780,17 @@ mrsas_find_child(struct mrsas_instance *instance, uint16_t tgt, uint8_t lun) break; } } - con_log(CL_ANN1, (CE_NOTE, "mrsas_find_child: return child = %p", + con_log(CL_DLEVEL2, (CE_NOTE, "mrsas_find_child: return child = %p", (void *)child)); return (child); } +/* + * mrsas_name_node - + * @dip: + * @name: + * @len: + */ static int mrsas_name_node(dev_info_t *dip, char *name, int len) { @@ -1240,14 +1798,14 @@ mrsas_name_node(dev_info_t *dip, char *name, int len) tgt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "target", -1); - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_DLEVEL2, (CE_NOTE, "mrsas_name_node: dip %p tgt %d", (void *)dip, tgt)); if (tgt == -1) { return (DDI_FAILURE); } lun = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "lun", -1); - con_log(CL_ANN1, + con_log(CL_DLEVEL2, (CE_NOTE, "mrsas_name_node: tgt %d lun %d", tgt, lun)); if (lun == -1) { return (DDI_FAILURE); @@ -1256,6 +1814,26 @@ mrsas_name_node(dev_info_t *dip, char *name, int len) return (DDI_SUCCESS); } +/* + * tran_init_pkt - allocate & initialize a scsi_pkt structure + * @ap: + * @pkt: + * @bp: + * @cmdlen: + * @statuslen: + * @tgtlen: + * @flags: + * @callback: + * + * The tran_init_pkt() entry point allocates and initializes a scsi_pkt + * structure and DMA resources for a target driver request. The + * tran_init_pkt() entry point is called when the target driver calls the + * SCSA function scsi_init_pkt(). Each call of the tran_init_pkt() entry point + * is a request to perform one or more of three possible services: + * - allocation and initialization of a scsi_pkt structure + * - allocation of DMA resources for data transfer + * - reallocation of DMA resources for the next portion of the data transfer + */ static struct scsi_pkt * mrsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt, struct buf *bp, int cmdlen, int statuslen, int tgtlen, @@ -1265,7 +1843,7 @@ mrsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt, struct mrsas_instance *instance; struct scsi_pkt *new_pkt; - con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + con_log(CL_DLEVEL1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); instance = ADDR2MR(ap); @@ -1327,14 +1905,31 @@ mrsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt, return (pkt); } +/* + * tran_start - transport a SCSI command to the addressed target + * @ap: + * @pkt: + * + * The tran_start() entry point for a SCSI HBA driver is called to transport a + * SCSI command to the addressed target. The SCSI command is described + * entirely within the scsi_pkt structure, which the target driver allocated + * through the HBA driver's tran_init_pkt() entry point. If the command + * involves a data transfer, DMA resources must also have been allocated for + * the scsi_pkt structure. 
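From the target-driver side of this contract the packet is handed over with scsi_transport(), and the return values listed just below are what that call reports back. A hedged sketch of the caller's handling, assuming a fully prepared pkt:

static void
example_submit(struct scsi_pkt *pkt)
{
	switch (scsi_transport(pkt)) {
	case TRAN_ACCEPT:
		/* command queued; completion arrives via pkt->pkt_comp */
		break;
	case TRAN_BUSY:
		/* HBA queue full; a real target driver delays and resubmits */
		break;
	default:
		/* TRAN_BADPKT, TRAN_FATAL_ERROR, ...: fail the command */
		break;
	}
}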
+ * + * Return Values : + * TRAN_BUSY - request queue is full, no more free scbs + * TRAN_ACCEPT - pkt has been submitted to the instance + */ static int mrsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) { - uchar_t cmd_done = 0; + uchar_t cmd_done = 0; struct mrsas_instance *instance = ADDR2MR(ap); struct mrsas_cmd *cmd; + con_log(CL_DLEVEL1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); if (instance->deadadapter == 1) { con_log(CL_ANN1, (CE_WARN, "mrsas_tran_start: return TRAN_FATAL_ERROR " @@ -1347,12 +1942,12 @@ mrsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) } if (instance->adapterresetinprogress) { - con_log(CL_ANN1, (CE_NOTE, "Reset flag set, " + con_log(CL_ANN1, (CE_NOTE, "mrsas_tran_start: Reset flag set, " "returning mfi_pkt and setting TRAN_BUSY\n")); return (TRAN_BUSY); } - con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d:SCSI CDB[0]=0x%x time:%x", + con_log(CL_ANN1, (CE_CONT, "chkpnt:%s:%d:SCSI CDB[0]=0x%x time:%x", __func__, __LINE__, pkt->pkt_cdbp[0], pkt->pkt_time)); pkt->pkt_reason = CMD_CMPLT; @@ -1394,16 +1989,16 @@ mrsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) /* Synchronize the Cmd frame for the controller */ (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0, DDI_DMA_SYNC_FORDEV); - con_log(CL_ANN1, (CE_NOTE, "Push SCSI CDB[0]=0x%x" + con_log(CL_ANN, (CE_CONT, "issue_cmd_ppc: SCSI CDB[0]=0x%x" "cmd->index:%x\n", pkt->pkt_cdbp[0], cmd->index)); instance->func_ptr->issue_cmd(cmd, instance); } else { struct mrsas_header *hdr = &cmd->frame->hdr; - cmd->sync_cmd = MRSAS_TRUE; + /* cmd->sync_cmd = MRSAS_TRUE; */ /* KEBE asks, inherit? */ - instance->func_ptr-> issue_cmd_in_poll_mode(instance, cmd); + instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd); pkt->pkt_reason = CMD_CMPLT; pkt->pkt_statistics = 0; @@ -1416,7 +2011,8 @@ mrsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) break; case MFI_STAT_SCSI_DONE_WITH_ERROR: - + con_log(CL_ANN, (CE_CONT, + "mrsas_tran_start: scsi done with error")); pkt->pkt_reason = CMD_CMPLT; pkt->pkt_statistics = 0; @@ -1424,6 +2020,8 @@ mrsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) break; case MFI_STAT_DEVICE_NOT_FOUND: + con_log(CL_ANN, (CE_CONT, + "mrsas_tran_start: device not found error")); pkt->pkt_reason = CMD_DEV_GONE; pkt->pkt_statistics = STAT_DISCON; break; @@ -1446,6 +2044,19 @@ mrsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) return (TRAN_ACCEPT); } +/* + * tran_abort - Abort any commands that are currently in transport + * @ap: + * @pkt: + * + * The tran_abort() entry point for a SCSI HBA driver is called to abort any + * commands that are currently in transport for a particular target. This entry + * point is called when a target driver calls scsi_abort(). The tran_abort() + * entry point should attempt to abort the command denoted by the pkt + * parameter. If the pkt parameter is NULL, tran_abort() should attempt to + * abort all outstanding commands in the transport layer for the particular + * target or logical unit. + */ /*ARGSUSED*/ static int mrsas_tran_abort(struct scsi_address *ap, struct scsi_pkt *pkt) @@ -1457,18 +2068,80 @@ mrsas_tran_abort(struct scsi_address *ap, struct scsi_pkt *pkt) return (DDI_FAILURE); } +/* + * tran_reset - reset either the SCSI bus or target + * @ap: + * @level: + * + * The tran_reset() entry point for a SCSI HBA driver is called to reset either + * the SCSI bus or a particular SCSI target device. 
This entry point is called + * when a target driver calls scsi_reset(). The tran_reset() entry point must + * reset the SCSI bus if level is RESET_ALL. If level is RESET_TARGET, just the + * particular target or logical unit must be reset. + */ /*ARGSUSED*/ static int mrsas_tran_reset(struct scsi_address *ap, int level) { + struct mrsas_instance *instance = ADDR2MR(ap); + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); - /* reset command not supported by H/W */ + if (wait_for_outstanding(instance)) { + con_log(CL_ANN1, + (CE_CONT, "wait_for_outstanding: return FAIL.\n")); + return (DDI_FAILURE); + } else { + return (DDI_SUCCESS); + } +} - return (DDI_FAILURE); +#if 0 +/* + * tran_bus_reset - reset the SCSI bus + * @dip: + * @level: + * + * The tran_bus_reset() vector in the scsi_hba_tran structure should be + * initialized during the HBA driver's attach(). The vector should point to + * an HBA entry point that is to be called when a user initiates a bus reset. + * Implementation is hardware specific. If the HBA driver cannot reset the + * SCSI bus without affecting the targets, the driver should fail RESET_BUS + * or not initialize this vector. + */ +/*ARGSUSED*/ +static int +mrsas_tran_bus_reset(dev_info_t *dip, int level) +{ + int instance_no = ddi_get_instance(dip); + + struct mrsas_instance *instance = ddi_get_soft_state(mrsas_state, + instance_no); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + if (wait_for_outstanding(instance)) { + con_log(CL_ANN1, + (CE_CONT, "wait_for_outstanding: return FAIL.\n")); + return (DDI_FAILURE); + } else { + return (DDI_SUCCESS); + } } +#endif +/* + * tran_getcap - get one of a set of SCSA-defined capabilities + * @ap: + * @cap: + * @whom: + * + * The target driver can request the current setting of the capability for a + * particular target by setting the whom parameter to nonzero. A whom value of + * zero indicates a request for the current setting of the general capability + * for the SCSI bus or for adapter hardware. The tran_getcap() should return -1 + * for undefined capabilities or the current value of the requested capability. + */ /*ARGSUSED*/ static int mrsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) @@ -1477,7 +2150,7 @@ mrsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) struct mrsas_instance *instance = ADDR2MR(ap); - con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + con_log(CL_DLEVEL2, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); /* we do allow inquiring about capabilities for other targets */ if (cap == NULL) { @@ -1486,8 +2159,13 @@ mrsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) switch (scsi_hba_lookup_capstr(cap)) { case SCSI_CAP_DMA_MAX: - /* Limit to 16MB max transfer */ - rval = mrsas_max_cap_maxxfer; + if (instance->tbolt) { + /* Limit to 256k max transfer */ + rval = mrsas_tbolt_max_cap_maxxfer; + } else { + /* Limit to 16MB max transfer */ + rval = mrsas_max_cap_maxxfer; + } break; case SCSI_CAP_MSG_OUT: rval = 1; @@ -1536,13 +2214,29 @@ mrsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) return (rval); } +/* + * tran_setcap - set one of a set of SCSA-defined capabilities + * @ap: + * @cap: + * @value: + * @whom: + * + * The target driver might request that the new value be set for a particular + * target by setting the whom parameter to nonzero. A whom value of zero + * means that request is to set the new value for the SCSI bus or for adapter + * hardware in general. 
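Seen from the target driver, these capability entry points are reached through scsi_ifgetcap() and scsi_ifsetcap(). A hedged example using standard SCSA capability strings (nothing here is specific to mr_sas):

static int
example_caps(struct scsi_device *sd)
{
	/* whom = 1: ask about this particular target, not the bus in general */
	int dma_max = scsi_ifgetcap(&sd->sd_address, "dma-max", 1);

	/* try to enable tagged queueing: 1 = set, 0 = refused, -1 = undefined */
	(void) scsi_ifsetcap(&sd->sd_address, "tagged-qing", 1, 1);

	return (dma_max);	/* -1 if the HBA does not define the capability */
}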
+ * The tran_setcap() should return the following values as appropriate: + * - -1 for undefined capabilities + * - 0 if the HBA driver cannot set the capability to the requested value + * - 1 if the HBA driver is able to set the capability to the requested value + */ /*ARGSUSED*/ static int mrsas_tran_setcap(struct scsi_address *ap, char *cap, int value, int whom) { int rval = 1; - con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + con_log(CL_DLEVEL2, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); /* We don't allow setting capabilities for other targets */ if (cap == NULL || whom == 0) { @@ -1584,12 +2278,25 @@ mrsas_tran_setcap(struct scsi_address *ap, char *cap, int value, int whom) return (rval); } +/* + * tran_destroy_pkt - deallocate scsi_pkt structure + * @ap: + * @pkt: + * + * The tran_destroy_pkt() entry point is the HBA driver function that + * deallocates scsi_pkt structures. The tran_destroy_pkt() entry point is + * called when the target driver calls scsi_destroy_pkt(). The + * tran_destroy_pkt() entry point must free any DMA resources that have been + * allocated for the packet. An implicit DMA synchronization occurs if the + * DMA resources are freed and any cached data remains after the completion + * of the transfer. + */ static void mrsas_tran_destroy_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) { struct scsa_cmd *acmd = PKT2CMD(pkt); - con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + con_log(CL_DLEVEL2, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); if (acmd->cmd_flags & CFLAG_DMAVALID) { acmd->cmd_flags &= ~CFLAG_DMAVALID; @@ -1605,6 +2312,18 @@ mrsas_tran_destroy_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) scsi_hba_pkt_free(ap, pkt); } +/* + * tran_dmafree - deallocates DMA resources + * @ap: + * @pkt: + * + * The tran_dmafree() entry point deallocates DMAQ resources that have been + * allocated for a scsi_pkt structure. The tran_dmafree() entry point is + * called when the target driver calls scsi_dmafree(). The tran_dmafree() must + * free only DMA resources allocated for a scsi_pkt structure, not the + * scsi_pkt itself. When DMA resources are freed, a DMA synchronization is + * implicitly performed. + */ /*ARGSUSED*/ static void mrsas_tran_dmafree(struct scsi_address *ap, struct scsi_pkt *pkt) @@ -1624,6 +2343,19 @@ mrsas_tran_dmafree(struct scsi_address *ap, struct scsi_pkt *pkt) } } +/* + * tran_sync_pkt - synchronize the DMA object allocated + * @ap: + * @pkt: + * + * The tran_sync_pkt() entry point synchronizes the DMA object allocated for + * the scsi_pkt structure before or after a DMA transfer. The tran_sync_pkt() + * entry point is called when the target driver calls scsi_sync_pkt(). If the + * data transfer direction is a DMA read from device to memory, tran_sync_pkt() + * must synchronize the CPU's view of the data. If the data transfer direction + * is a DMA write from memory to device, tran_sync_pkt() must synchronize the + * device's view of the data. 
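Taken together, tran_init_pkt(), tran_start(), tran_sync_pkt(), tran_dmafree() and tran_destroy_pkt() form the HBA half of a packet's lifecycle; the target-driver half is what drives them. A compressed, hypothetical sketch of that caller's side (error handling and CDB contents are elided):

static void
example_pkt_lifecycle(struct scsi_address *ap, struct buf *bp)
{
	struct scsi_pkt *pkt;

	/* allocate pkt, CDB, status and DMA resources (reaches tran_init_pkt) */
	pkt = scsi_init_pkt(ap, NULL, bp, CDB_GROUP1, 1, 0, 0, NULL_FUNC, NULL);
	if (pkt == NULL)
		return;

	/* a real driver fills pkt->pkt_cdbp (e.g. with scsi_setup_cdb()) here */

	if (scsi_transport(pkt) != TRAN_ACCEPT) {	/* reaches tran_start */
		scsi_destroy_pkt(pkt);			/* reaches tran_destroy_pkt */
		return;
	}

	/* after completion, before reading data DMA'd in from the device */
	scsi_sync_pkt(pkt);				/* reaches tran_sync_pkt */

	scsi_dmafree(pkt);				/* reaches tran_dmafree */
	scsi_destroy_pkt(pkt);				/* frees the pkt itself */
}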
+ */ /*ARGSUSED*/ static void mrsas_tran_sync_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) @@ -1639,6 +2371,25 @@ mrsas_tran_sync_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) } } +/*ARGSUSED*/ +static int +mrsas_tran_quiesce(dev_info_t *dip) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + return (1); +} + +/*ARGSUSED*/ +static int +mrsas_tran_unquiesce(dev_info_t *dip) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + return (1); +} + + /* * mrsas_isr(caddr_t) * @@ -1654,15 +2405,29 @@ mrsas_isr(struct mrsas_instance *instance) uint32_t producer; uint32_t consumer; uint32_t context; + int retval; struct mrsas_cmd *cmd; struct mrsas_header *hdr; struct scsi_pkt *pkt; + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); ASSERT(instance); - if ((instance->intr_type == DDI_INTR_TYPE_FIXED) && - !instance->func_ptr->intr_ack(instance)) { - return (DDI_INTR_UNCLAIMED); + if (instance->tbolt) { + mutex_enter(&instance->chip_mtx); + if ((instance->intr_type == DDI_INTR_TYPE_FIXED) && + !(instance->func_ptr->intr_ack(instance))) { + mutex_exit(&instance->chip_mtx); + return (DDI_INTR_UNCLAIMED); + } + retval = mr_sas_tbolt_process_outstanding_cmd(instance); + mutex_exit(&instance->chip_mtx); + return (retval); + } else { + if ((instance->intr_type == DDI_INTR_TYPE_FIXED) && + !instance->func_ptr->intr_ack(instance)) { + return (DDI_INTR_UNCLAIMED); + } } (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, @@ -1681,7 +2446,7 @@ mrsas_isr(struct mrsas_instance *instance) #ifdef OCRDEBUG if (debug_consecutive_timeout_after_ocr_g == 1) { con_log(CL_ANN1, (CE_NOTE, - "simulating consecutive timeout after ocr")); + "simulating consecutive timeout after ocr")); return (DDI_INTR_CLAIMED); } #endif @@ -1694,10 +2459,10 @@ mrsas_isr(struct mrsas_instance *instance) consumer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, instance->consumer); - con_log(CL_ANN1, (CE_NOTE, " producer %x consumer %x ", + con_log(CL_ANN, (CE_CONT, " producer %x consumer %x ", producer, consumer)); if (producer == consumer) { - con_log(CL_ANN1, (CE_WARN, "producer = consumer case")); + con_log(CL_ANN, (CE_WARN, "producer == consumer case")); DTRACE_PROBE2(isr_pc_err, uint32_t, producer, uint32_t, consumer); mutex_exit(&instance->cmd_pend_mtx); @@ -1711,10 +2476,10 @@ mrsas_isr(struct mrsas_instance *instance) cmd = instance->cmd_list[context]; if (cmd->sync_cmd == MRSAS_TRUE) { - hdr = (struct mrsas_header *)&cmd->frame->hdr; - if (hdr) { - mlist_del_init(&cmd->list); - } + hdr = (struct mrsas_header *)&cmd->frame->hdr; + if (hdr) { + mlist_del_init(&cmd->list); + } } else { pkt = cmd->pkt; if (pkt) { @@ -1761,9 +2526,9 @@ mrsas_isr(struct mrsas_instance *instance) /* * ************************************************************************** * - * * - * libraries * - * * + * * + * libraries * + * * * ************************************************************************** * */ /* @@ -1779,7 +2544,7 @@ mrsas_isr(struct mrsas_instance *instance) static struct mrsas_cmd * get_mfi_pkt(struct mrsas_instance *instance) { - mlist_t *head = &instance->cmd_pool_list; + mlist_t *head = &instance->cmd_pool_list; struct mrsas_cmd *cmd = NULL; mutex_enter(&instance->cmd_pool_mtx); @@ -1793,6 +2558,7 @@ get_mfi_pkt(struct mrsas_instance *instance) cmd->pkt = NULL; cmd->retry_count_for_ocr = 0; cmd->drv_pkt_time = 0; + } mutex_exit(&instance->cmd_pool_mtx); @@ -1812,8 +2578,12 @@ get_mfi_app_pkt(struct mrsas_instance *instance) cmd = 
mlist_entry(head->next, struct mrsas_cmd, list); mlist_del_init(head->next); } - if (cmd != NULL) + if (cmd != NULL) { cmd->pkt = NULL; + cmd->retry_count_for_ocr = 0; + cmd->drv_pkt_time = 0; + } + mutex_exit(&instance->app_cmd_pool_mtx); return (cmd); @@ -1842,12 +2612,12 @@ return_mfi_app_pkt(struct mrsas_instance *instance, struct mrsas_cmd *cmd) mutex_exit(&instance->app_cmd_pool_mtx); } -static void +void push_pending_mfi_pkt(struct mrsas_instance *instance, struct mrsas_cmd *cmd) { struct scsi_pkt *pkt; struct mrsas_header *hdr; - con_log(CL_ANN1, (CE_NOTE, "push_pending_pkt(): Called\n")); + con_log(CL_DLEVEL2, (CE_NOTE, "push_pending_pkt(): Called\n")); mutex_enter(&instance->cmd_pend_mtx); ASSERT(mutex_owned(&instance->cmd_pend_mtx)); mlist_del_init(&cmd->list); @@ -1861,15 +2631,15 @@ push_pending_mfi_pkt(struct mrsas_instance *instance, struct mrsas_cmd *cmd) "time %llx", (void *)cmd, cmd->index, gethrtime())); - /* Wait for specified interval */ + /* Wait for specified interval */ cmd->drv_pkt_time = ddi_get16( cmd->frame_dma_obj.acc_handle, &hdr->timeout); if (cmd->drv_pkt_time < debug_timeout_g) cmd->drv_pkt_time = (uint16_t)debug_timeout_g; - con_log(CL_ANN1, (CE_CONT, - "push_pending_pkt(): " - "Called IO Timeout Value %x\n", - cmd->drv_pkt_time)); + con_log(CL_ANN1, (CE_CONT, + "push_pending_pkt(): " + "Called IO Timeout Value %x\n", + cmd->drv_pkt_time)); } if (hdr && instance->timeout_id == (timeout_id_t)-1) { instance->timeout_id = timeout(io_timeout_checker, @@ -1893,9 +2663,10 @@ push_pending_mfi_pkt(struct mrsas_instance *instance, struct mrsas_cmd *cmd) } mutex_exit(&instance->cmd_pend_mtx); + } -static int +int mrsas_print_pending_cmds(struct mrsas_instance *instance) { mlist_t *head = &instance->cmd_pend_list; @@ -1903,47 +2674,73 @@ mrsas_print_pending_cmds(struct mrsas_instance *instance) struct mrsas_cmd *cmd = NULL; struct mrsas_header *hdr; unsigned int flag = 1; - struct scsi_pkt *pkt; - con_log(CL_ANN1, (CE_NOTE, - "mrsas_print_pending_cmds(): Called")); + int saved_level; + int cmd_count = 0; + + + saved_level = debug_level_g; + debug_level_g = CL_ANN1; + + cmn_err(CE_NOTE, "mrsas_print_pending_cmds(): Called\n"); + while (flag) { mutex_enter(&instance->cmd_pend_mtx); tmp = tmp->next; if (tmp == head) { mutex_exit(&instance->cmd_pend_mtx); flag = 0; + con_log(CL_ANN1, (CE_CONT, "mrsas_print_pending_cmds():" + " NO MORE CMDS PENDING....\n")); break; } else { cmd = mlist_entry(tmp, struct mrsas_cmd, list); mutex_exit(&instance->cmd_pend_mtx); if (cmd) { if (cmd->sync_cmd == MRSAS_TRUE) { - hdr = (struct mrsas_header *)&cmd->frame->hdr; + hdr = (struct mrsas_header *) + &cmd->frame->hdr; if (hdr) { - con_log(CL_ANN1, (CE_CONT, - "print: cmd %p index %x hdr %p", - (void *)cmd, cmd->index, - (void *)hdr)); + con_log(CL_ANN1, (CE_CONT, + "print: cmd %p index 0x%x " + "drv_pkt_time 0x%x (NO-PKT)" + " hdr %p\n", (void *)cmd, + cmd->index, + cmd->drv_pkt_time, + (void *)hdr)); } } else { pkt = cmd->pkt; if (pkt) { con_log(CL_ANN1, (CE_CONT, - "print: cmd %p index %x " - "pkt %p", (void *)cmd, cmd->index, - (void *)pkt)); + "print: cmd %p index 0x%x " + "drv_pkt_time 0x%x pkt %p \n", + (void *)cmd, cmd->index, + cmd->drv_pkt_time, (void *)pkt)); } } + + if (++cmd_count == 1) { + mrsas_print_cmd_details(instance, cmd, + 0xDD); + } else { + mrsas_print_cmd_details(instance, cmd, + 1); + } + } } } - con_log(CL_ANN1, (CE_NOTE, "mrsas_print_pending_cmds(): Done\n")); + con_log(CL_ANN1, (CE_CONT, "mrsas_print_pending_cmds(): Done\n")); + + + debug_level_g = saved_level; 
+ return (DDI_SUCCESS); } -static int +int mrsas_complete_pending_cmds(struct mrsas_instance *instance) { @@ -1968,7 +2765,7 @@ mrsas_complete_pending_cmds(struct mrsas_instance *instance) = CMD_DEV_GONE; pkt->pkt_statistics = STAT_DISCON; - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "fail and posting to scsa " "cmd %p index %x" " pkt %p " @@ -1980,7 +2777,7 @@ mrsas_complete_pending_cmds(struct mrsas_instance *instance) } else { /* for DCMDS */ if (cmd->sync_cmd == MRSAS_TRUE) { hdr = (struct mrsas_header *)&cmd->frame->hdr; - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "posting invalid status to application " "cmd %p index %x" " hdr %p " @@ -1993,22 +2790,92 @@ mrsas_complete_pending_cmds(struct mrsas_instance *instance) } mlist_del_init(&cmd->list); } else { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "mrsas_complete_pending_cmds:" "NULL command\n")); } - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "mrsas_complete_pending_cmds:" "looping for more commands\n")); } mutex_exit(&instance->cmd_pend_mtx); - con_log(CL_ANN1, (CE_NOTE, "mrsas_complete_pending_cmds(): DONE\n")); + con_log(CL_ANN1, (CE_CONT, "mrsas_complete_pending_cmds(): DONE\n")); return (DDI_SUCCESS); } +void +mrsas_print_cmd_details(struct mrsas_instance *instance, struct mrsas_cmd *cmd, + int detail) +{ + struct scsi_pkt *pkt = cmd->pkt; + Mpi2RaidSCSIIORequest_t *scsi_io = cmd->scsi_io_request; + int i; + int saved_level; + ddi_acc_handle_t acc_handle = + instance->mpi2_frame_pool_dma_obj.acc_handle; -static int + if (detail == 0xDD) { + saved_level = debug_level_g; + debug_level_g = CL_ANN1; + } + + + if (instance->tbolt) { + con_log(CL_ANN1, (CE_CONT, "print_cmd_details: cmd %p " + "cmd->index 0x%x SMID 0x%x timer 0x%x sec\n", + (void *)cmd, cmd->index, cmd->SMID, cmd->drv_pkt_time)); + } else { + con_log(CL_ANN1, (CE_CONT, "print_cmd_details: cmd %p " + "cmd->index 0x%x timer 0x%x sec\n", + (void *)cmd, cmd->index, cmd->drv_pkt_time)); + } + + if (pkt) { + con_log(CL_ANN1, (CE_CONT, "scsi_pkt CDB[0]=0x%x", + pkt->pkt_cdbp[0])); + } else { + con_log(CL_ANN1, (CE_CONT, "NO-PKT")); + } + + if ((detail == 0xDD) && instance->tbolt) { + con_log(CL_ANN1, (CE_CONT, "RAID_SCSI_IO_REQUEST\n")); + con_log(CL_ANN1, (CE_CONT, "DevHandle=0x%X Function=0x%X " + "IoFlags=0x%X SGLFlags=0x%X DataLength=0x%X\n", + ddi_get16(acc_handle, &scsi_io->DevHandle), + ddi_get8(acc_handle, &scsi_io->Function), + ddi_get16(acc_handle, &scsi_io->IoFlags), + ddi_get16(acc_handle, &scsi_io->SGLFlags), + ddi_get32(acc_handle, &scsi_io->DataLength))); + + for (i = 0; i < 32; i++) { + con_log(CL_ANN1, (CE_CONT, "CDB[%d]=0x%x ", i, + ddi_get8(acc_handle, &scsi_io->CDB.CDB32[i]))); + } + + con_log(CL_ANN1, (CE_CONT, "RAID-CONTEXT\n")); + con_log(CL_ANN1, (CE_CONT, "status=0x%X extStatus=0x%X " + "ldTargetId=0x%X timeoutValue=0x%X regLockFlags=0x%X " + "RAIDFlags=0x%X regLockRowLBA=0x%" PRIu64 + " regLockLength=0x%X spanArm=0x%X\n", + ddi_get8(acc_handle, &scsi_io->RaidContext.status), + ddi_get8(acc_handle, &scsi_io->RaidContext.extStatus), + ddi_get16(acc_handle, &scsi_io->RaidContext.ldTargetId), + ddi_get16(acc_handle, &scsi_io->RaidContext.timeoutValue), + ddi_get8(acc_handle, &scsi_io->RaidContext.regLockFlags), + ddi_get8(acc_handle, &scsi_io->RaidContext.RAIDFlags), + ddi_get64(acc_handle, &scsi_io->RaidContext.regLockRowLBA), + ddi_get32(acc_handle, &scsi_io->RaidContext.regLockLength), + ddi_get8(acc_handle, &scsi_io->RaidContext.spanArm))); + } + + if (detail == 0xDD) { + debug_level_g = 
saved_level; + } +} + + +int mrsas_issue_pending_cmds(struct mrsas_instance *instance) { mlist_t *head = &instance->cmd_pend_list; @@ -2023,53 +2890,79 @@ mrsas_issue_pending_cmds(struct mrsas_instance *instance) tmp = tmp->next; mutex_exit(&instance->cmd_pend_mtx); if (cmd) { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "mrsas_issue_pending_cmds(): " - "Got a cmd: cmd:%p\n", (void *)cmd)); + "Got a cmd: cmd %p index 0x%x drv_pkt_time 0x%x ", + (void *)cmd, cmd->index, cmd->drv_pkt_time)); + + /* Reset command timeout value */ + if (cmd->drv_pkt_time < debug_timeout_g) + cmd->drv_pkt_time = (uint16_t)debug_timeout_g; + cmd->retry_count_for_ocr++; - con_log(CL_ANN1, (CE_NOTE, - "mrsas_issue_pending_cmds(): " - "cmd retry count = %d\n", - cmd->retry_count_for_ocr)); + + cmn_err(CE_CONT, "cmd retry count = %d\n", + cmd->retry_count_for_ocr); + if (cmd->retry_count_for_ocr > IO_RETRY_COUNT) { - con_log(CL_ANN1, (CE_NOTE, + cmn_err(CE_WARN, "mrsas_issue_pending_cmds(): " + "cmd->retry_count exceeded limit >%d\n", + IO_RETRY_COUNT); + mrsas_print_cmd_details(instance, cmd, 0xDD); + + cmn_err(CE_WARN, "mrsas_issue_pending_cmds():" - "Calling Kill Adapter\n")); - (void) mrsas_kill_adapter(instance); + "Calling KILL Adapter\n"); + if (instance->tbolt) + mrsas_tbolt_kill_adapter(instance); + else + (void) mrsas_kill_adapter(instance); return (DDI_FAILURE); } + pkt = cmd->pkt; if (pkt) { - con_log(CL_ANN1, (CE_NOTE, - "PENDING ISSUE: cmd %p index %x " + con_log(CL_ANN1, (CE_CONT, + "PENDING PKT-CMD ISSUE: cmd %p index %x " "pkt %p time %llx", (void *)cmd, cmd->index, (void *)pkt, gethrtime())); + } else { + cmn_err(CE_CONT, + "mrsas_issue_pending_cmds(): NO-PKT, " + "cmd %p index 0x%x drv_pkt_time 0x%x ", + (void *)cmd, cmd->index, cmd->drv_pkt_time); } + + if (cmd->sync_cmd == MRSAS_TRUE) { + cmn_err(CE_CONT, "mrsas_issue_pending_cmds(): " + "SYNC_CMD == TRUE \n"); instance->func_ptr->issue_cmd_in_sync_mode( instance, cmd); } else { instance->func_ptr->issue_cmd(cmd, instance); } } else { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "mrsas_issue_pending_cmds: NULL command\n")); } - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "mrsas_issue_pending_cmds:" "looping for more commands")); } - con_log(CL_ANN1, (CE_NOTE, "mrsas_issue_pending_cmds(): DONE\n")); + con_log(CL_ANN1, (CE_CONT, "mrsas_issue_pending_cmds(): DONE\n")); return (DDI_SUCCESS); } + + /* * destroy_mfi_frame_pool */ -static void +void destroy_mfi_frame_pool(struct mrsas_instance *instance) { int i; @@ -2078,7 +2971,8 @@ destroy_mfi_frame_pool(struct mrsas_instance *instance) struct mrsas_cmd *cmd; /* return all frames to pool */ - for (i = 0; i < max_cmd+1; i++) { + + for (i = 0; i < max_cmd; i++) { cmd = instance->cmd_list[i]; @@ -2093,7 +2987,7 @@ destroy_mfi_frame_pool(struct mrsas_instance *instance) /* * create_mfi_frame_pool */ -static int +int create_mfi_frame_pool(struct mrsas_instance *instance) { int i = 0; @@ -2103,11 +2997,10 @@ create_mfi_frame_pool(struct mrsas_instance *instance) uint32_t sgl_sz; uint32_t tot_frame_size; struct mrsas_cmd *cmd; + int retval = DDI_SUCCESS; max_cmd = instance->max_fw_cmds; - sge_sz = sizeof (struct mrsas_sge_ieee); - /* calculated the number of 64byte frames required for SGL */ sgl_sz = sge_sz * instance->max_num_sge; tot_frame_size = sgl_sz + MRMFI_FRAME_SIZE + SENSE_LENGTH; @@ -2115,7 +3008,7 @@ create_mfi_frame_pool(struct mrsas_instance *instance) con_log(CL_DLEVEL3, (CE_NOTE, "create_mfi_frame_pool: " "sgl_sz %x tot_frame_size %x", sgl_sz, 
tot_frame_size)); - while (i < max_cmd+1) { + while (i < max_cmd) { cmd = instance->cmd_list[i]; cmd->frame_dma_obj.size = tot_frame_size; @@ -2125,14 +3018,14 @@ create_mfi_frame_pool(struct mrsas_instance *instance) cmd->frame_dma_obj.dma_attr.dma_attr_sgllen = 1; cmd->frame_dma_obj.dma_attr.dma_attr_align = 64; - cookie_cnt = mrsas_alloc_dma_obj(instance, &cmd->frame_dma_obj, (uchar_t)DDI_STRUCTURE_LE_ACC); if (cookie_cnt == -1 || cookie_cnt > 1) { - con_log(CL_ANN, (CE_WARN, - "create_mfi_frame_pool: could not alloc.")); - return (DDI_FAILURE); + cmn_err(CE_WARN, + "create_mfi_frame_pool: could not alloc."); + retval = DDI_FAILURE; + goto mrsas_undo_frame_pool; } bzero(cmd->frame_dma_obj.buffer, tot_frame_size); @@ -2150,10 +3043,10 @@ create_mfi_frame_pool(struct mrsas_instance *instance) tot_frame_size - SENSE_LENGTH; if (!cmd->frame || !cmd->sense) { - con_log(CL_ANN, (CE_NOTE, - "mr_sas: pci_pool_alloc failed")); - - return (ENOMEM); + cmn_err(CE_WARN, + "mr_sas: pci_pool_alloc failed"); + retval = ENOMEM; + goto mrsas_undo_frame_pool; } ddi_put32(cmd->frame_dma_obj.acc_handle, @@ -2165,6 +3058,12 @@ create_mfi_frame_pool(struct mrsas_instance *instance) } return (DDI_SUCCESS); + +mrsas_undo_frame_pool: + if (i > 0) + destroy_mfi_frame_pool(instance); + + return (retval); } /* @@ -2207,8 +3106,8 @@ alloc_additional_dma_buffer(struct mrsas_instance *instance) if (mrsas_alloc_dma_obj(instance, &instance->mfi_internal_dma_obj, (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { - con_log(CL_ANN, (CE_WARN, - "mr_sas: could not alloc reply queue")); + cmn_err(CE_WARN, + "mr_sas: could not alloc reply queue"); return (DDI_FAILURE); } @@ -2240,9 +3139,9 @@ alloc_additional_dma_buffer(struct mrsas_instance *instance) if (mrsas_alloc_dma_obj(instance, &instance->mfi_evt_detail_obj, (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { - con_log(CL_ANN, (CE_WARN, "alloc_additional_dma_buffer: " - "could not allocate data transfer buffer.")); - return (DDI_FAILURE); + cmn_err(CE_WARN, "alloc_additional_dma_buffer: " + "could not allocate data transfer buffer."); + goto mrsas_undo_internal_buff; } bzero(instance->mfi_evt_detail_obj.buffer, @@ -2251,53 +3150,70 @@ alloc_additional_dma_buffer(struct mrsas_instance *instance) instance->mfi_evt_detail_obj.status |= DMA_OBJ_ALLOCATED; return (DDI_SUCCESS); + +mrsas_undo_internal_buff: + if (instance->mfi_internal_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->mfi_internal_dma_obj); + instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED; + } + + return (DDI_FAILURE); } -/* - * free_space_for_mfi - */ -static void -free_space_for_mfi(struct mrsas_instance *instance) + +void +mrsas_free_cmd_pool(struct mrsas_instance *instance) { int i; - uint32_t max_cmd = instance->max_fw_cmds; + uint32_t max_cmd; + size_t sz; /* already freed */ if (instance->cmd_list == NULL) { return; } - free_additional_dma_buffer(instance); + max_cmd = instance->max_fw_cmds; - /* first free the MFI frame pool */ - destroy_mfi_frame_pool(instance); + /* size of cmd_list array */ + sz = sizeof (struct mrsas_cmd *) * max_cmd; - /* free all the commands in the cmd_list */ - for (i = 0; i < instance->max_fw_cmds+1; i++) { - kmem_free(instance->cmd_list[i], - sizeof (struct mrsas_cmd)); + /* First free each cmd */ + for (i = 0; i < max_cmd; i++) { + if (instance->cmd_list[i] != NULL) { + kmem_free(instance->cmd_list[i], + sizeof (struct mrsas_cmd)); + } instance->cmd_list[i] = NULL; } - /* free the cmd_list buffer itself */ - kmem_free(instance->cmd_list, - sizeof 
(struct mrsas_cmd *) * (max_cmd+1)); + /* Now, free cmd_list array */ + if (instance->cmd_list != NULL) + kmem_free(instance->cmd_list, sz); instance->cmd_list = NULL; INIT_LIST_HEAD(&instance->cmd_pool_list); - INIT_LIST_HEAD(&instance->app_cmd_pool_list); INIT_LIST_HEAD(&instance->cmd_pend_list); + if (instance->tbolt) { + INIT_LIST_HEAD(&instance->cmd_app_pool_list); + } else { + INIT_LIST_HEAD(&instance->app_cmd_pool_list); + } + } + /* - * alloc_space_for_mfi + * mrsas_alloc_cmd_pool */ -static int -alloc_space_for_mfi(struct mrsas_instance *instance) +int +mrsas_alloc_cmd_pool(struct mrsas_instance *instance) { int i; + int count; uint32_t max_cmd; uint32_t reserve_cmd; size_t sz; @@ -2305,9 +3221,11 @@ alloc_space_for_mfi(struct mrsas_instance *instance) struct mrsas_cmd *cmd; max_cmd = instance->max_fw_cmds; + con_log(CL_ANN1, (CE_NOTE, "mrsas_alloc_cmd_pool: " + "max_cmd %x", max_cmd)); - /* reserve 1 more slot for flush_cache */ - sz = sizeof (struct mrsas_cmd *) * (max_cmd+1); + + sz = sizeof (struct mrsas_cmd *) * max_cmd; /* * instance->cmd_list is an array of struct mrsas_cmd pointers. @@ -2315,54 +3233,127 @@ alloc_space_for_mfi(struct mrsas_instance *instance) * commands. */ instance->cmd_list = kmem_zalloc(sz, KM_SLEEP); - ASSERT(instance->cmd_list); + if (instance->cmd_list == NULL) { + con_log(CL_NONE, (CE_WARN, + "Failed to allocate memory for cmd_list")); + return (DDI_FAILURE); + } - for (i = 0; i < max_cmd+1; i++) { - instance->cmd_list[i] = kmem_zalloc(sizeof (struct mrsas_cmd), - KM_SLEEP); - ASSERT(instance->cmd_list[i]); + /* create a frame pool and assign one frame to each cmd */ + for (count = 0; count < max_cmd; count++) { + instance->cmd_list[count] = + kmem_zalloc(sizeof (struct mrsas_cmd), KM_SLEEP); + if (instance->cmd_list[count] == NULL) { + con_log(CL_NONE, (CE_WARN, + "Failed to allocate memory for mrsas_cmd")); + goto mrsas_undo_cmds; + } } + /* add all the commands to command pool */ + INIT_LIST_HEAD(&instance->cmd_pool_list); INIT_LIST_HEAD(&instance->cmd_pend_list); - /* add all the commands to command pool (instance->cmd_pool) */ - reserve_cmd = APP_RESERVE_CMDS; INIT_LIST_HEAD(&instance->app_cmd_pool_list); - for (i = 0; i < reserve_cmd-1; i++) { - cmd = instance->cmd_list[i]; - cmd->index = i; + + reserve_cmd = MRSAS_APP_RESERVED_CMDS; + + for (i = 0; i < reserve_cmd; i++) { + cmd = instance->cmd_list[i]; + cmd->index = i; mlist_add_tail(&cmd->list, &instance->app_cmd_pool_list); } - /* - * reserve slot instance->cmd_list[APP_RESERVE_CMDS-1] - * for abort_aen_cmd - */ + + for (i = reserve_cmd; i < max_cmd; i++) { - cmd = instance->cmd_list[i]; - cmd->index = i; + cmd = instance->cmd_list[i]; + cmd->index = i; mlist_add_tail(&cmd->list, &instance->cmd_pool_list); } - /* single slot for flush_cache won't be added in command pool */ - cmd = instance->cmd_list[max_cmd]; - cmd->index = i; + return (DDI_SUCCESS); - /* create a frame pool and assign one frame to each cmd */ - if (create_mfi_frame_pool(instance)) { - con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); +mrsas_undo_cmds: + if (count > 0) { + /* free each cmd */ + for (i = 0; i < count; i++) { + if (instance->cmd_list[i] != NULL) { + kmem_free(instance->cmd_list[i], + sizeof (struct mrsas_cmd)); + } + instance->cmd_list[i] = NULL; + } + } + +mrsas_undo_cmd_list: + if (instance->cmd_list != NULL) + kmem_free(instance->cmd_list, sz); + instance->cmd_list = NULL; + + return (DDI_FAILURE); +} + + +/* + * free_space_for_mfi + */ +static void +free_space_for_mfi(struct mrsas_instance 
*instance) +{ + + /* already freed */ + if (instance->cmd_list == NULL) { + return; + } + + /* Free additional dma buffer */ + free_additional_dma_buffer(instance); + + /* Free the MFI frame pool */ + destroy_mfi_frame_pool(instance); + + /* Free all the commands in the cmd_list */ + /* Free the cmd_list buffer itself */ + mrsas_free_cmd_pool(instance); +} + +/* + * alloc_space_for_mfi + */ +static int +alloc_space_for_mfi(struct mrsas_instance *instance) +{ + /* Allocate command pool (memory for cmd_list & individual commands) */ + if (mrsas_alloc_cmd_pool(instance)) { + cmn_err(CE_WARN, "error creating cmd pool"); return (DDI_FAILURE); } - /* create a frame pool and assign one frame to each cmd */ + /* Allocate MFI Frame pool */ + if (create_mfi_frame_pool(instance)) { + cmn_err(CE_WARN, "error creating frame DMA pool"); + goto mfi_undo_cmd_pool; + } + + /* Allocate additional DMA buffer */ if (alloc_additional_dma_buffer(instance)) { - con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); - return (DDI_FAILURE); + cmn_err(CE_WARN, "error creating frame DMA pool"); + goto mfi_undo_frame_pool; } return (DDI_SUCCESS); + +mfi_undo_frame_pool: + destroy_mfi_frame_pool(instance); + +mfi_undo_cmd_pool: + mrsas_free_cmd_pool(instance); + + return (DDI_FAILURE); } + /* * get_ctrl_info */ @@ -2376,7 +3367,11 @@ get_ctrl_info(struct mrsas_instance *instance, struct mrsas_dcmd_frame *dcmd; struct mrsas_ctrl_info *ci; - cmd = get_mfi_pkt(instance); + if (instance->tbolt) { + cmd = get_raid_msg_mfi_pkt(instance); + } else { + cmd = get_mfi_pkt(instance); + } if (!cmd) { con_log(CL_ANN, (CE_WARN, @@ -2385,7 +3380,7 @@ get_ctrl_info(struct mrsas_instance *instance, uint16_t, instance->max_fw_cmds); return (DDI_FAILURE); } - cmd->retry_count_for_ocr = 0; + /* Clear the frame buffer and assign back the context id */ (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, @@ -2396,8 +3391,8 @@ get_ctrl_info(struct mrsas_instance *instance, ci = (struct mrsas_ctrl_info *)instance->internal_buf; if (!ci) { - con_log(CL_ANN, (CE_WARN, - "Failed to alloc mem for ctrl info")); + cmn_err(CE_WARN, + "Failed to alloc mem for ctrl info"); return_mfi_pkt(instance, cmd); return (DDI_FAILURE); } @@ -2425,33 +3420,40 @@ get_ctrl_info(struct mrsas_instance *instance, cmd->frame_count = 1; - if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { - ret = 0; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } - ctrl_info->max_request_size = ddi_get32( - cmd->frame_dma_obj.acc_handle, &ci->max_request_size); + if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + ret = 0; - ctrl_info->ld_present_count = ddi_get16( - cmd->frame_dma_obj.acc_handle, &ci->ld_present_count); + ctrl_info->max_request_size = ddi_get32( + cmd->frame_dma_obj.acc_handle, &ci->max_request_size); - ctrl_info->properties.on_off_properties = - ddi_get32(cmd->frame_dma_obj.acc_handle, - &ci->properties.on_off_properties); + ctrl_info->ld_present_count = ddi_get16( + cmd->frame_dma_obj.acc_handle, &ci->ld_present_count); - ddi_rep_get8(cmd->frame_dma_obj.acc_handle, - (uint8_t *)(ctrl_info->product_name), - (uint8_t *)(ci->product_name), 80 * sizeof (char), - DDI_DEV_AUTOINCR); - /* should get more members of ci with ddi_get when needed */ + ctrl_info->properties.on_off_properties = ddi_get32( + cmd->frame_dma_obj.acc_handle, + &ci->properties.on_off_properties); + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, + (uint8_t 
*)(ctrl_info->product_name), + (uint8_t *)(ci->product_name), 80 * sizeof (char), + DDI_DEV_AUTOINCR); + /* should get more members of ci with ddi_get when needed */ } else { - con_log(CL_ANN, (CE_WARN, "get_ctrl_info: Ctrl info failed")); + cmn_err(CE_WARN, "get_ctrl_info: Ctrl info failed"); ret = -1; } if (mrsas_common_check(instance, cmd) != DDI_SUCCESS) { ret = -1; } - return_mfi_pkt(instance, cmd); + if (instance->tbolt) { + return_raid_msg_mfi_pkt(instance, cmd); + } else { + return_mfi_pkt(instance, cmd); + } return (ret); } @@ -2468,7 +3470,13 @@ abort_aen_cmd(struct mrsas_instance *instance, struct mrsas_cmd *cmd; struct mrsas_abort_frame *abort_fr; - cmd = instance->cmd_list[APP_RESERVE_CMDS-1]; + con_log(CL_ANN1, (CE_NOTE, "chkpnt: abort_aen:%d", __LINE__)); + + if (instance->tbolt) { + cmd = get_raid_msg_mfi_pkt(instance); + } else { + cmd = get_mfi_pkt(instance); + } if (!cmd) { con_log(CL_ANN1, (CE_WARN, @@ -2477,7 +3485,7 @@ abort_aen_cmd(struct mrsas_instance *instance, uint16_t, instance->max_fw_cmds); return (DDI_FAILURE); } - cmd->retry_count_for_ocr = 0; + /* Clear the frame buffer and assign back the context id */ (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, @@ -2500,9 +3508,13 @@ abort_aen_cmd(struct mrsas_instance *instance, instance->aen_cmd->abort_aen = 1; - cmd->sync_cmd = MRSAS_TRUE; + /* cmd->sync_cmd = MRSAS_TRUE; */ /* KEBE ASKS, inherit? */ cmd->frame_count = 1; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { con_log(CL_ANN1, (CE_WARN, "abort_aen_cmd: issue_cmd_in_poll_mode failed")); @@ -2514,49 +3526,27 @@ abort_aen_cmd(struct mrsas_instance *instance, instance->aen_cmd->abort_aen = 1; instance->aen_cmd = 0; + if (instance->tbolt) { + return_raid_msg_mfi_pkt(instance, cmd); + } else { + return_mfi_pkt(instance, cmd); + } + atomic_add_16(&instance->fw_outstanding, (-1)); return (ret); } -/* - * init_mfi - */ static int -init_mfi(struct mrsas_instance *instance) +mrsas_build_init_cmd(struct mrsas_instance *instance, + struct mrsas_cmd **cmd_ptr) { struct mrsas_cmd *cmd; - struct mrsas_ctrl_info ctrl_info; struct mrsas_init_frame *init_frame; struct mrsas_init_queue_info *initq_info; + struct mrsas_drv_ver drv_ver_info; - /* we expect the FW state to be READY */ - if (mfi_state_transition_to_ready(instance)) { - con_log(CL_ANN, (CE_WARN, "mr_sas: F/W is not ready")); - goto fail_ready_state; - } - - /* get various operational parameters from status register */ - instance->max_num_sge = - (instance->func_ptr->read_fw_status_reg(instance) & - 0xFF0000) >> 0x10; - /* - * Reduce the max supported cmds by 1. This is to ensure that the - * reply_q_sz (1 more than the max cmd that driver may send) - * does not exceed max cmds that the FW can support - */ - instance->max_fw_cmds = - instance->func_ptr->read_fw_status_reg(instance) & 0xFFFF; - instance->max_fw_cmds = instance->max_fw_cmds - 1; - - instance->max_num_sge = - (instance->max_num_sge > MRSAS_MAX_SGE_CNT) ? - MRSAS_MAX_SGE_CNT : instance->max_num_sge; - - /* create a pool of commands */ - if (alloc_space_for_mfi(instance) != DDI_SUCCESS) - goto fail_alloc_fw_space; /* * Prepare a init frame. 
Note the init frame points to queue info @@ -2564,8 +3554,8 @@ init_mfi(struct mrsas_instance *instance) * this frame - since we don't need any SGL - we use SGL's space as * queue info structure */ - cmd = get_mfi_pkt(instance); - cmd->retry_count_for_ocr = 0; + cmd = *cmd_ptr; + /* Clear the frame buffer and assign back the context id */ (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); @@ -2613,23 +3603,88 @@ init_mfi(struct mrsas_instance *instance) ddi_put32(cmd->frame_dma_obj.acc_handle, &init_frame->queue_info_new_phys_addr_hi, 0); + + /* fill driver version information */ + fill_up_drv_ver(&drv_ver_info); + + /* allocate the driver version data transfer buffer */ + instance->drv_ver_dma_obj.size = sizeof (drv_ver_info.drv_ver); + instance->drv_ver_dma_obj.dma_attr = mrsas_generic_dma_attr; + instance->drv_ver_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->drv_ver_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->drv_ver_dma_obj.dma_attr.dma_attr_sgllen = 1; + instance->drv_ver_dma_obj.dma_attr.dma_attr_align = 1; + + if (mrsas_alloc_dma_obj(instance, &instance->drv_ver_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "init_mfi : Could not allocate driver version buffer.")); + return (DDI_FAILURE); + } + /* copy driver version to dma buffer */ + (void) memset(instance->drv_ver_dma_obj.buffer, 0, + sizeof (drv_ver_info.drv_ver)); + ddi_rep_put8(cmd->frame_dma_obj.acc_handle, + (uint8_t *)drv_ver_info.drv_ver, + (uint8_t *)instance->drv_ver_dma_obj.buffer, + sizeof (drv_ver_info.drv_ver), DDI_DEV_AUTOINCR); + + + /* copy driver version physical address to init frame */ + ddi_put64(cmd->frame_dma_obj.acc_handle, &init_frame->driverversion, + instance->drv_ver_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(cmd->frame_dma_obj.acc_handle, &init_frame->data_xfer_len, sizeof (struct mrsas_init_queue_info)); cmd->frame_count = 1; - /* issue the init frame in polled mode */ + *cmd_ptr = cmd; + + return (DDI_SUCCESS); +} + + +/* + * mrsas_init_adapter_ppc - Initialize MFI interface adapter. + */ +int +mrsas_init_adapter_ppc(struct mrsas_instance *instance) +{ + struct mrsas_cmd *cmd; + + /* + * allocate memory for mfi adapter(cmd pool, individual commands, mfi + * frames etc + */ + if (alloc_space_for_mfi(instance) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_NOTE, + "Error, failed to allocate memory for MFI adapter")); + return (DDI_FAILURE); + } + + /* Build INIT command */ + cmd = get_mfi_pkt(instance); + + if (mrsas_build_init_cmd(instance, &cmd) != DDI_SUCCESS) { + con_log(CL_ANN, + (CE_NOTE, "Error, failed to build INIT command")); + + goto fail_undo_alloc_mfi_space; + } + + /* + * Disable interrupt before sending init frame ( see linux driver code) + * send INIT MFI frame in polled mode + */ if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { con_log(CL_ANN, (CE_WARN, "failed to init firmware")); - return_mfi_pkt(instance, cmd); goto fail_fw_init; } - if (mrsas_common_check(instance, cmd) != DDI_SUCCESS) { - return_mfi_pkt(instance, cmd); + if (mrsas_common_check(instance, cmd) != DDI_SUCCESS) goto fail_fw_init; - } - return_mfi_pkt(instance, cmd); + /* return_mfi_pkt(instance, cmd); */ /* XXX KEBE ASKS, inherit? 
*/ if (ctio_enable && (instance->func_ptr->read_fw_status_reg(instance) & 0x04000000)) { @@ -2639,8 +3694,67 @@ init_mfi(struct mrsas_instance *instance) instance->flag_ieee = 0; } - instance->disable_online_ctrl_reset = 0; + instance->unroll.alloc_space_mfi = 1; + instance->unroll.verBuff = 1; + + return (DDI_SUCCESS); + + +fail_fw_init: + (void) mrsas_free_dma_obj(instance, instance->drv_ver_dma_obj); + +fail_undo_alloc_mfi_space: + return_mfi_pkt(instance, cmd); + free_space_for_mfi(instance); + + return (DDI_FAILURE); + +} + +/* + * mrsas_init_adapter - Initialize adapter. + */ +int +mrsas_init_adapter(struct mrsas_instance *instance) +{ + struct mrsas_ctrl_info ctrl_info; + + + /* we expect the FW state to be READY */ + if (mfi_state_transition_to_ready(instance)) { + con_log(CL_ANN, (CE_WARN, "mr_sas: F/W is not ready")); + return (DDI_FAILURE); + } + + /* get various operational parameters from status register */ + instance->max_num_sge = + (instance->func_ptr->read_fw_status_reg(instance) & + 0xFF0000) >> 0x10; + instance->max_num_sge = + (instance->max_num_sge > MRSAS_MAX_SGE_CNT) ? + MRSAS_MAX_SGE_CNT : instance->max_num_sge; + + /* + * Reduce the max supported cmds by 1. This is to ensure that the + * reply_q_sz (1 more than the max cmd that driver may send) + * does not exceed max cmds that the FW can support + */ + instance->max_fw_cmds = + instance->func_ptr->read_fw_status_reg(instance) & 0xFFFF; + instance->max_fw_cmds = instance->max_fw_cmds - 1; + + + + /* Initialize adapter */ + if (instance->func_ptr->init_adapter(instance) != DDI_SUCCESS) { + con_log(CL_ANN, + (CE_WARN, "mr_sas: could not initialize adapter")); + return (DDI_FAILURE); + } + /* gather misc FW related information */ + instance->disable_online_ctrl_reset = 0; + if (!get_ctrl_info(instance, &ctrl_info)) { instance->max_sectors_per_req = ctrl_info.max_request_size; con_log(CL_ANN1, (CE_NOTE, @@ -2651,28 +3765,21 @@ init_mfi(struct mrsas_instance *instance) PAGESIZE / 512; } - if (ctrl_info.properties.on_off_properties & DISABLE_OCR_PROP_FLAG) + if (ctrl_info.properties.on_off_properties & DISABLE_OCR_PROP_FLAG) { instance->disable_online_ctrl_reset = 1; + con_log(CL_ANN1, + (CE_NOTE, "Disable online control Flag is set\n")); + } else { + con_log(CL_ANN1, + (CE_NOTE, "Disable online control Flag is not set\n")); + } return (DDI_SUCCESS); -fail_fw_init: -fail_alloc_fw_space: - - free_space_for_mfi(instance); - -fail_ready_state: - ddi_regs_map_free(&instance->regmap_handle); - -fail_mfi_reg_setup: - return (DDI_FAILURE); } - - - static int mrsas_issue_init_mfi(struct mrsas_instance *instance) { @@ -2691,7 +3798,7 @@ mrsas_issue_init_mfi(struct mrsas_instance *instance) cmd = get_mfi_app_pkt(instance); if (!cmd) { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_WARN, "mrsas_issue_init_mfi: get_pkt failed\n")); return (DDI_FAILURE); } @@ -2753,8 +3860,15 @@ mrsas_issue_init_mfi(struct mrsas_instance *instance) return_mfi_app_pkt(instance, cmd); return (DDI_FAILURE); } + + if (mrsas_common_check(instance, cmd) != DDI_SUCCESS) { + return_mfi_pkt(instance, cmd); + return (DDI_FAILURE); + } + return_mfi_app_pkt(instance, cmd); - con_log(CL_ANN1, (CE_NOTE, "mrsas_issue_init_mfi: Done")); + con_log(CL_ANN1, (CE_CONT, "mrsas_issue_init_mfi: Done")); + return (DDI_SUCCESS); } /* @@ -2762,31 +3876,32 @@ mrsas_issue_init_mfi(struct mrsas_instance *instance) * * @reg_set : MFI register set */ -static int +int mfi_state_transition_to_ready(struct mrsas_instance *instance) { int i; uint8_t max_wait; - uint32_t fw_ctrl; 
+ uint32_t fw_ctrl = 0; uint32_t fw_state; uint32_t cur_state; uint32_t cur_abs_reg_val; uint32_t prev_abs_reg_val; + uint32_t status; cur_abs_reg_val = instance->func_ptr->read_fw_status_reg(instance); fw_state = cur_abs_reg_val & MFI_STATE_MASK; - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "mfi_state_transition_to_ready:FW state = 0x%x", fw_state)); while (fw_state != MFI_STATE_READY) { - con_log(CL_ANN, (CE_NOTE, + con_log(CL_ANN, (CE_CONT, "mfi_state_transition_to_ready:FW state%x", fw_state)); switch (fw_state) { case MFI_STATE_FAULT: - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN, (CE_NOTE, "mr_sas: FW in FAULT state!!")); return (ENODEV); @@ -2800,10 +3915,14 @@ mfi_state_transition_to_ready(struct mrsas_instance *instance) * to be set */ /* WR_IB_MSG_0(MFI_INIT_CLEAR_HANDSHAKE, instance); */ - WR_IB_DOORBELL(MFI_INIT_CLEAR_HANDSHAKE | - MFI_INIT_HOTPLUG, instance); - - max_wait = 2; + if (!instance->tbolt) { + WR_IB_DOORBELL(MFI_INIT_CLEAR_HANDSHAKE | + MFI_INIT_HOTPLUG, instance); + } else { + WR_RESERVED0_REGISTER(MFI_INIT_CLEAR_HANDSHAKE | + MFI_INIT_HOTPLUG, instance); + } + max_wait = (instance->tbolt == 1) ? 180 : 2; cur_state = MFI_STATE_WAIT_HANDSHAKE; break; case MFI_STATE_BOOT_MESSAGE_PENDING: @@ -2815,9 +3934,13 @@ mfi_state_transition_to_ready(struct mrsas_instance *instance) * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) * to be set */ - WR_IB_DOORBELL(MFI_INIT_HOTPLUG, instance); - - max_wait = 10; + if (!instance->tbolt) { + WR_IB_DOORBELL(MFI_INIT_HOTPLUG, instance); + } else { + WR_RESERVED0_REGISTER(MFI_INIT_HOTPLUG, + instance); + } + max_wait = (instance->tbolt == 1) ? 180 : 10; cur_state = MFI_STATE_BOOT_MESSAGE_PENDING; break; case MFI_STATE_OPERATIONAL: @@ -2831,26 +3954,46 @@ mfi_state_transition_to_ready(struct mrsas_instance *instance) * to be set */ /* WR_IB_DOORBELL(MFI_INIT_READY, instance); */ - WR_IB_DOORBELL(MFI_RESET_FLAGS, instance); + if (!instance->tbolt) { + WR_IB_DOORBELL(MFI_RESET_FLAGS, instance); + } else { + WR_RESERVED0_REGISTER(MFI_RESET_FLAGS, + instance); + + for (i = 0; i < (10 * 1000); i++) { + status = + RD_RESERVED0_REGISTER(instance); + if (status & 1) { + delay(1 * + drv_usectohz(MILLISEC)); + } else { + break; + } + } - max_wait = 10; + } + max_wait = (instance->tbolt == 1) ? 180 : 10; cur_state = MFI_STATE_OPERATIONAL; break; case MFI_STATE_UNDEFINED: /* this state should not last for more than 2 seconds */ con_log(CL_ANN1, (CE_NOTE, "FW state undefined")); - max_wait = 2; + max_wait = (instance->tbolt == 1) ? 180 : 2; cur_state = MFI_STATE_UNDEFINED; break; case MFI_STATE_BB_INIT: - max_wait = 2; + max_wait = (instance->tbolt == 1) ? 180 : 2; cur_state = MFI_STATE_BB_INIT; break; case MFI_STATE_FW_INIT: - max_wait = 2; + max_wait = (instance->tbolt == 1) ? 
180 : 2; cur_state = MFI_STATE_FW_INIT; break; + case MFI_STATE_FW_INIT_2: + max_wait = 180; + cur_state = MFI_STATE_FW_INIT_2; + break; case MFI_STATE_DEVICE_SCAN: max_wait = 180; cur_state = MFI_STATE_DEVICE_SCAN; @@ -2858,6 +4001,10 @@ mfi_state_transition_to_ready(struct mrsas_instance *instance) con_log(CL_NONE, (CE_NOTE, "Device scan in progress ...\n")); break; + case MFI_STATE_FLUSH_CACHE: + max_wait = 180; + cur_state = MFI_STATE_FLUSH_CACHE; + break; default: con_log(CL_ANN1, (CE_NOTE, "mr_sas: Unknown state 0x%x", fw_state)); @@ -2885,17 +4032,19 @@ mfi_state_transition_to_ready(struct mrsas_instance *instance) /* return error if fw_state hasn't changed after max_wait */ if (fw_state == cur_state) { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_WARN, "FW state hasn't changed in %d secs", max_wait)); return (ENODEV); } }; - fw_ctrl = RD_IB_DOORBELL(instance); - - con_log(CL_ANN1, (CE_NOTE, - "mfi_state_transition_to_ready:FW ctrl = 0x%x", fw_ctrl)); + if (!instance->tbolt) { + fw_ctrl = RD_IB_DOORBELL(instance); + con_log(CL_ANN1, (CE_CONT, + "mfi_state_transition_to_ready:FW ctrl = 0x%x", fw_ctrl)); + } +#if 0 /* XXX KEBE ASKS, remove and use like pre-2208? */ /* * Write 0xF to the doorbell register to do the following. * - Abort all outstanding commands (bit 0). @@ -2904,11 +4053,14 @@ mfi_state_transition_to_ready(struct mrsas_instance *instance) * - Set to release FW to continue running (i.e. BIOS handshake * (bit 3). */ - WR_IB_DOORBELL(0xF, instance); - + if (!instance->tbolt) { + WR_IB_DOORBELL(0xF, instance); + } +#endif if (mrsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { return (ENODEV); } + return (DDI_SUCCESS); } @@ -2925,7 +4077,11 @@ get_seq_num(struct mrsas_instance *instance, struct mrsas_cmd *cmd; struct mrsas_dcmd_frame *dcmd; struct mrsas_evt_log_info *eli_tmp; - cmd = get_mfi_pkt(instance); + if (instance->tbolt) { + cmd = get_raid_msg_mfi_pkt(instance); + } else { + cmd = get_mfi_pkt(instance); + } if (!cmd) { cmn_err(CE_WARN, "mr_sas: failed to get a cmd"); @@ -2933,13 +4089,13 @@ get_seq_num(struct mrsas_instance *instance, instance->fw_outstanding, uint16_t, instance->max_fw_cmds); return (ENOMEM); } - cmd->retry_count_for_ocr = 0; + /* Clear the frame buffer and assign back the context id */ (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, cmd->index); - dcmd = &cmd->frame->dcmd; + dcmd = &cmd->frame->dcmd; /* allocate the data transfer buffer */ dcmd_dma_obj.size = sizeof (struct mrsas_evt_log_info); @@ -2951,8 +4107,8 @@ get_seq_num(struct mrsas_instance *instance, if (mrsas_alloc_dma_obj(instance, &dcmd_dma_obj, (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { - con_log(CL_ANN, (CE_WARN, - "get_seq_num: could not allocate data transfer buffer.")); + cmn_err(CE_WARN, + "get_seq_num: could not allocate data transfer buffer."); return (DDI_FAILURE); } @@ -2979,6 +4135,10 @@ get_seq_num(struct mrsas_instance *instance, cmd->sync_cmd = MRSAS_TRUE; cmd->frame_count = 1; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { cmn_err(CE_WARN, "get_seq_num: " "failed to issue MRSAS_DCMD_CTRL_EVENT_GET_INFO"); @@ -2993,12 +4153,12 @@ get_seq_num(struct mrsas_instance *instance, if (mrsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) ret = DDI_FAILURE; - if (mrsas_common_check(instance, cmd) != DDI_SUCCESS) { - ret = DDI_FAILURE; + if (instance->tbolt) { + 
return_raid_msg_mfi_pkt(instance, cmd); + } else { + return_mfi_pkt(instance, cmd); } - return_mfi_pkt(instance, cmd); - return (ret); } @@ -3034,6 +4194,7 @@ start_mfi_aen(struct mrsas_instance *instance) return (-1); } + return (ret); } @@ -3045,9 +4206,11 @@ flush_cache(struct mrsas_instance *instance) { struct mrsas_cmd *cmd = NULL; struct mrsas_dcmd_frame *dcmd; - uint32_t max_cmd = instance->max_fw_cmds; - - cmd = instance->cmd_list[max_cmd]; + if (instance->tbolt) { + cmd = get_raid_msg_mfi_pkt(instance); + } else { + cmd = get_mfi_pkt(instance); + } if (!cmd) { con_log(CL_ANN1, (CE_WARN, @@ -3056,7 +4219,7 @@ flush_cache(struct mrsas_instance *instance) instance->fw_outstanding, uint16_t, instance->max_fw_cmds); return; } - cmd->retry_count_for_ocr = 0; + /* Clear the frame buffer and assign back the context id */ (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, @@ -3080,11 +4243,21 @@ flush_cache(struct mrsas_instance *instance) cmd->frame_count = 1; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { con_log(CL_ANN1, (CE_WARN, "flush_cache: failed to issue MFI_DCMD_CTRL_CACHE_FLUSH")); } - con_log(CL_ANN1, (CE_NOTE, "flush_cache done")); + con_log(CL_ANN1, (CE_CONT, "flush_cache done")); + if (instance->tbolt) { + return_raid_msg_mfi_pkt(instance, cmd); + } else { + return_mfi_pkt(instance, cmd); + } + } /* @@ -3093,7 +4266,7 @@ flush_cache(struct mrsas_instance *instance) * @cmd: Command to be completed * */ -static void +void service_mfi_aen(struct mrsas_instance *instance, struct mrsas_cmd *cmd) { uint32_t seq_num; @@ -3101,12 +4274,16 @@ service_mfi_aen(struct mrsas_instance *instance, struct mrsas_cmd *cmd) (struct mrsas_evt_detail *)instance->mfi_evt_detail_obj.buffer; int rval = 0; int tgt = 0; + uint8_t dtype; +#ifdef PDSUPPORT + mrsas_pd_address_t *pd_addr; +#endif ddi_acc_handle_t acc_handle; - acc_handle = cmd->frame_dma_obj.acc_handle; + con_log(CL_ANN, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + acc_handle = cmd->frame_dma_obj.acc_handle; cmd->cmd_status = ddi_get8(acc_handle, &cmd->frame->io.cmd_status); - if (cmd->cmd_status == ENODATA) { cmd->cmd_status = 0; } @@ -3125,7 +4302,7 @@ service_mfi_aen(struct mrsas_instance *instance, struct mrsas_cmd *cmd) * Check for any ld devices that has changed state. i.e. online * or offline. 
*/ - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "AEN: code = %x class = %x locale = %x args = %x", ddi_get32(acc_handle, &evt_detail->code), evt_detail->cl.members.class, @@ -3136,6 +4313,10 @@ service_mfi_aen(struct mrsas_instance *instance, struct mrsas_cmd *cmd) case MR_EVT_CFG_CLEARED: { for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) { if (instance->mr_ld_list[tgt].dip != NULL) { + mutex_enter(&instance->config_dev_mtx); + instance->mr_ld_list[tgt].flag = + (uint8_t)~MRDRV_TGT_VALID; + mutex_exit(&instance->config_dev_mtx); rval = mrsas_service_evt(instance, tgt, 0, MRSAS_EVT_UNCONFIG_TGT, NULL); con_log(CL_ANN1, (CE_WARN, @@ -3147,6 +4328,10 @@ service_mfi_aen(struct mrsas_instance *instance, struct mrsas_cmd *cmd) } case MR_EVT_LD_DELETED: { + tgt = ddi_get16(acc_handle, &evt_detail->args.ld.target_id); + mutex_enter(&instance->config_dev_mtx); + instance->mr_ld_list[tgt].flag = (uint8_t)~MRDRV_TGT_VALID; + mutex_exit(&instance->config_dev_mtx); rval = mrsas_service_evt(instance, ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0, MRSAS_EVT_UNCONFIG_TGT, NULL); @@ -3167,6 +4352,86 @@ service_mfi_aen(struct mrsas_instance *instance, struct mrsas_cmd *cmd) ddi_get8(acc_handle, &evt_detail->args.ld.ld_index))); break; } /* End of MR_EVT_LD_CREATED */ + +#ifdef PDSUPPORT + case MR_EVT_PD_REMOVED_EXT: { + if (instance->tbolt) { + pd_addr = &evt_detail->args.pd_addr; + dtype = pd_addr->scsi_dev_type; + con_log(CL_DLEVEL1, (CE_NOTE, + " MR_EVT_PD_REMOVED_EXT: dtype = %x," + " arg_type = %d ", dtype, evt_detail->arg_type)); + tgt = ddi_get16(acc_handle, + &evt_detail->args.pd.device_id); + mutex_enter(&instance->config_dev_mtx); + instance->mr_tbolt_pd_list[tgt].flag = + (uint8_t)~MRDRV_TGT_VALID; + mutex_exit(&instance->config_dev_mtx); + rval = mrsas_service_evt(instance, ddi_get16( + acc_handle, &evt_detail->args.pd.device_id), + 1, MRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "mr_sas: PD_REMOVED:" + "rval = %d tgt id = %d ", rval, + ddi_get16(acc_handle, + &evt_detail->args.pd.device_id))); + } + break; + } /* End of MR_EVT_PD_REMOVED_EXT */ + + case MR_EVT_PD_INSERTED_EXT: { + if (instance->tbolt) { + rval = mrsas_service_evt(instance, + ddi_get16(acc_handle, + &evt_detail->args.pd.device_id), + 1, MRSAS_EVT_CONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "mr_sas: PD_INSERTEDi_EXT:" + "rval = %d tgt id = %d ", rval, + ddi_get16(acc_handle, + &evt_detail->args.pd.device_id))); + } + break; + } /* End of MR_EVT_PD_INSERTED_EXT */ + + case MR_EVT_PD_STATE_CHANGE: { + if (instance->tbolt) { + tgt = ddi_get16(acc_handle, + &evt_detail->args.pd.device_id); + if ((evt_detail->args.pd_state.prevState == + PD_SYSTEM) && + (evt_detail->args.pd_state.newState != PD_SYSTEM)) { + mutex_enter(&instance->config_dev_mtx); + instance->mr_tbolt_pd_list[tgt].flag = + (uint8_t)~MRDRV_TGT_VALID; + mutex_exit(&instance->config_dev_mtx); + rval = mrsas_service_evt(instance, + ddi_get16(acc_handle, + &evt_detail->args.pd.device_id), + 1, MRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "mr_sas: PD_REMOVED:" + "rval = %d tgt id = %d ", rval, + ddi_get16(acc_handle, + &evt_detail->args.pd.device_id))); + break; + } + if ((evt_detail->args.pd_state.prevState + == UNCONFIGURED_GOOD) && + (evt_detail->args.pd_state.newState == PD_SYSTEM)) { + rval = mrsas_service_evt(instance, + ddi_get16(acc_handle, + &evt_detail->args.pd.device_id), + 1, MRSAS_EVT_CONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, + "mr_sas: PD_INSERTED: rval = %d " + " tgt id = %d ", rval, + 
ddi_get16(acc_handle, + &evt_detail->args.pd.device_id))); + break; + } + } + break; + } +#endif + } /* End of Main Switch */ /* get copy of seq_num and class/locale for re-registration */ @@ -3182,6 +4447,9 @@ service_mfi_aen(struct mrsas_instance *instance, struct mrsas_cmd *cmd) cmd->frame_count = 1; + cmd->retry_count_for_ocr = 0; + cmd->drv_pkt_time = 0; + /* Issue the aen registration frame */ instance->func_ptr->issue_cmd(cmd, instance); } @@ -3204,14 +4472,16 @@ complete_cmd_in_sync_mode(struct mrsas_instance *instance, cmd->sync_cmd = MRSAS_FALSE; - if (cmd->cmd_status == ENODATA) { - cmd->cmd_status = 0; - } - con_log(CL_ANN1, (CE_NOTE, "complete_cmd_in_sync_mode called %p \n", (void *)cmd)); + mutex_enter(&instance->int_cmd_mtx); + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } cv_broadcast(&instance->int_cmd_cv); + mutex_exit(&instance->int_cmd_mtx); + } /* @@ -3229,20 +4499,22 @@ mrsas_initiate_ocr_if_fw_is_faulty(struct mrsas_instance *instance) cur_abs_reg_val = instance->func_ptr->read_fw_status_reg(instance); fw_state = cur_abs_reg_val & MFI_STATE_MASK; if (fw_state == MFI_STATE_FAULT) { - if (instance->disable_online_ctrl_reset == 1) { - con_log(CL_ANN1, (CE_NOTE, - "mrsas_initiate_ocr_if_fw_is_faulty: " - "FW in Fault state, detected in ISR: " - "FW doesn't support ocr ")); - return (ADAPTER_RESET_NOT_REQUIRED); + cmn_err(CE_WARN, + "mrsas_initiate_ocr_if_fw_is_faulty: " + "FW in Fault state, detected in ISR: " + "FW doesn't support ocr "); + + return (ADAPTER_RESET_NOT_REQUIRED); } else { - con_log(CL_ANN1, (CE_NOTE, - "mrsas_initiate_ocr_if_fw_is_faulty: " - "FW in Fault state, detected in ISR: FW supports ocr ")); + con_log(CL_ANN, (CE_NOTE, + "mrsas_initiate_ocr_if_fw_is_faulty: FW in Fault " + "state, detected in ISR: FW supports ocr ")); + return (ADAPTER_RESET_REQUIRED); } } + return (ADAPTER_RESET_NOT_REQUIRED); } @@ -3264,7 +4536,7 @@ mrsas_softintr(struct mrsas_instance *instance) struct mrsas_header *hdr; struct scsi_arq_status *arqstat; - con_log(CL_ANN1, (CE_CONT, "mrsas_softintr called")); + con_log(CL_ANN1, (CE_NOTE, "mrsas_softintr() called.")); ASSERT(instance); @@ -3341,7 +4613,7 @@ mrsas_softintr(struct mrsas_instance *instance) | STATE_GOT_TARGET | STATE_SENT_CMD | STATE_XFERRED_DATA | STATE_GOT_STATUS; - con_log(CL_ANN1, (CE_CONT, + con_log(CL_ANN, (CE_CONT, "CDB[0] = %x completed for %s: size %lx context %x", pkt->pkt_cdbp[0], ((acmd->islogical) ? 
"LD" : "PD"), acmd->cmd_dmacount, hdr->context)); @@ -3394,17 +4666,15 @@ mrsas_softintr(struct mrsas_instance *instance) break; case MFI_STAT_SCSI_DONE_WITH_ERROR: - con_log(CL_ANN1, (CE_CONT, "scsi_done error")); + con_log(CL_ANN, (CE_CONT, "scsi_done error")); pkt->pkt_reason = CMD_CMPLT; ((struct scsi_status *) pkt->pkt_scbp)->sts_chk = 1; if (pkt->pkt_cdbp[0] == SCMD_TEST_UNIT_READY) { - con_log(CL_ANN, (CE_WARN, "TEST_UNIT_READY fail")); - } else { pkt->pkt_state |= STATE_ARQ_DONE; arqstat = (void *)(pkt->pkt_scbp); @@ -3421,14 +4691,13 @@ mrsas_softintr(struct mrsas_instance *instance) (uint8_t *) &(arqstat->sts_sensedata), cmd->sense, - acmd->cmd_scblen - - offsetof(struct scsi_arq_status, - sts_sensedata), DDI_DEV_AUTOINCR); - } + sizeof (struct scsi_extended_sense), + DDI_DEV_AUTOINCR); + } break; case MFI_STAT_LD_OFFLINE: case MFI_STAT_DEVICE_NOT_FOUND: - con_log(CL_ANN1, (CE_CONT, + con_log(CL_ANN, (CE_CONT, "mrsas_softintr:device not found error")); pkt->pkt_reason = CMD_DEV_GONE; pkt->pkt_statistics = STAT_DISCON; @@ -3488,19 +4757,22 @@ mrsas_softintr(struct mrsas_instance *instance) if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) { - con_log(CL_ANN1, (CE_NOTE, "mrsas_softintr: " + con_log(CL_DLEVEL1, (CE_NOTE, "mrsas_softintr: " "posting to scsa cmd %p index %x pkt %p " "time %llx", (void *)cmd, cmd->index, (void *)pkt, gethrtime())); (*pkt->pkt_comp)(pkt); } + return_mfi_pkt(instance, cmd); break; + case MFI_CMD_OP_SMP: case MFI_CMD_OP_STP: complete_cmd_in_sync_mode(instance, cmd); break; + case MFI_CMD_OP_DCMD: /* see if got an event notification */ if (ddi_get32(cmd->frame_dma_obj.acc_handle, @@ -3521,14 +4793,16 @@ mrsas_softintr(struct mrsas_instance *instance) } break; + case MFI_CMD_OP_ABORT: - con_log(CL_ANN, (CE_WARN, "MFI_CMD_OP_ABORT complete")); + con_log(CL_ANN, (CE_NOTE, "MFI_CMD_OP_ABORT complete")); /* * MFI_CMD_OP_ABORT successfully completed * in the synchronous mode */ complete_cmd_in_sync_mode(instance, cmd); break; + default: mrsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); @@ -3563,7 +4837,7 @@ mrsas_softintr(struct mrsas_instance *instance) * * Allocate the memory and other resources for an dma object. */ -static int +int mrsas_alloc_dma_obj(struct mrsas_instance *instance, dma_obj_t *obj, uchar_t endian_flags) { @@ -3610,6 +4884,11 @@ mrsas_alloc_dma_obj(struct mrsas_instance *instance, dma_obj_t *obj, return (-1); } + if (obj->dma_handle == NULL) { + /* XXX KEBE ASKS --> fm_service_impact()? */ + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_mem_alloc")); + return (-1); + } if (ddi_dma_addr_bind_handle(obj->dma_handle, NULL, obj->buffer, obj->size, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, @@ -3622,6 +4901,14 @@ mrsas_alloc_dma_obj(struct mrsas_instance *instance, dma_obj_t *obj, return (-1); } + if (obj->acc_handle == NULL) { + /* XXX KEBE ASKS --> fm_service_impact()? 
*/ + ddi_dma_mem_free(&obj->acc_handle); + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_addr_bind_handle")); + return (-1); + } if (mrsas_check_dma_handle(obj->dma_handle) != DDI_SUCCESS) { ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); @@ -3642,10 +4929,19 @@ mrsas_alloc_dma_obj(struct mrsas_instance *instance, dma_obj_t *obj, * De-allocate the memory and other resources for an dma object, which must * have been alloated by a previous call to mrsas_alloc_dma_obj() */ -static int +/* ARGSUSED */ +int mrsas_free_dma_obj(struct mrsas_instance *instance, dma_obj_t obj) { + if ((obj.dma_handle == NULL) || (obj.acc_handle == NULL)) { + return (DDI_SUCCESS); + } + + /* + * NOTE: These check-handle functions fail if *_handle == NULL, but + * this function succeeds because of the previous check. + */ if (mrsas_check_dma_handle(obj.dma_handle) != DDI_SUCCESS) { ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); return (DDI_FAILURE); @@ -3659,7 +4955,7 @@ mrsas_free_dma_obj(struct mrsas_instance *instance, dma_obj_t obj) (void) ddi_dma_unbind_handle(obj.dma_handle); ddi_dma_mem_free(&obj.acc_handle); ddi_dma_free_handle(&obj.dma_handle); - + obj.acc_handle = NULL; return (DDI_SUCCESS); } @@ -3669,7 +4965,7 @@ mrsas_free_dma_obj(struct mrsas_instance *instance, dma_obj_t obj) * * Allocate dma resources for a new scsi command */ -static int +int mrsas_dma_alloc(struct mrsas_instance *instance, struct scsi_pkt *pkt, struct buf *bp, int flags, int (*callback)()) { @@ -3705,6 +5001,13 @@ mrsas_dma_alloc(struct mrsas_instance *instance, struct scsi_pkt *pkt, tmp_dma_attr.dma_attr_sgllen = instance->max_num_sge; tmp_dma_attr.dma_attr_addr_hi = 0xffffffffffffffffull; + if (instance->tbolt) { + /* OCR-RESET FIX */ + tmp_dma_attr.dma_attr_count_max = + (U64)mrsas_tbolt_max_cap_maxxfer; /* limit to 256K */ + tmp_dma_attr.dma_attr_maxxfer = + (U64)mrsas_tbolt_max_cap_maxxfer; /* limit to 256K */ + } if ((i = ddi_dma_alloc_handle(instance->dip, &tmp_dma_attr, cb, 0, &acmd->cmd_dmahandle)) != DDI_SUCCESS) { @@ -3816,7 +5119,7 @@ no_dma_cookies: * move dma resources to next dma window * */ -static int +int mrsas_dma_move(struct mrsas_instance *instance, struct scsi_pkt *pkt, struct buf *bp) { @@ -3886,14 +5189,15 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, { uint16_t flags = 0; uint32_t i; - uint32_t context; + uint32_t context; uint32_t sge_bytes; + uint32_t tmp_data_xfer_len; ddi_acc_handle_t acc_handle; struct mrsas_cmd *cmd; struct mrsas_sge64 *mfi_sgl; struct mrsas_sge_ieee *mfi_sgl_ieee; struct scsa_cmd *acmd = PKT2CMD(pkt); - struct mrsas_pthru_frame *pthru; + struct mrsas_pthru_frame *pthru; struct mrsas_io_frame *ldio; /* find out if this is logical or physical drive command. 
*/ @@ -3908,8 +5212,6 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, return (NULL); } - cmd->retry_count_for_ocr = 0; - acc_handle = cmd->frame_dma_obj.acc_handle; /* Clear the frame buffer and assign back the context id */ @@ -3951,7 +5253,7 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, /* * case SCMD_SYNCHRONIZE_CACHE: - * flush_cache(instance); + * flush_cache(instance); * return_mfi_pkt(instance, cmd); * *cmd_done = 1; * @@ -3962,6 +5264,10 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, case SCMD_WRITE: case SCMD_READ_G1: case SCMD_WRITE_G1: + case SCMD_READ_G4: + case SCMD_WRITE_G4: + case SCMD_READ_G5: + case SCMD_WRITE_G5: if (acmd->islogical) { ldio = (struct mrsas_io_frame *)cmd->frame; @@ -4001,6 +5307,7 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, context = ddi_get32(acc_handle, &ldio->context); if (acmd->cmd_cdblen == CDB_GROUP0) { + /* 6-byte cdb */ ddi_put32(acc_handle, &ldio->lba_count, ( (uint16_t)(pkt->pkt_cdbp[4]))); @@ -4010,6 +5317,7 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, ((uint32_t)((pkt->pkt_cdbp[1]) & 0x1F) << 16))); } else if (acmd->cmd_cdblen == CDB_GROUP1) { + /* 10-byte cdb */ ddi_put32(acc_handle, &ldio->lba_count, ( ((uint16_t)(pkt->pkt_cdbp[8])) | ((uint16_t)(pkt->pkt_cdbp[7]) << 8))); @@ -4019,24 +5327,26 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); - } else if (acmd->cmd_cdblen == CDB_GROUP2) { + } else if (acmd->cmd_cdblen == CDB_GROUP5) { + /* 12-byte cdb */ ddi_put32(acc_handle, &ldio->lba_count, ( - ((uint16_t)(pkt->pkt_cdbp[9])) | - ((uint16_t)(pkt->pkt_cdbp[8]) << 8) | - ((uint16_t)(pkt->pkt_cdbp[7]) << 16) | - ((uint16_t)(pkt->pkt_cdbp[6]) << 24))); + ((uint32_t)(pkt->pkt_cdbp[9])) | + ((uint32_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[6]) << 24))); ddi_put32(acc_handle, &ldio->start_lba_lo, ( ((uint32_t)(pkt->pkt_cdbp[5])) | ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); - } else if (acmd->cmd_cdblen == CDB_GROUP3) { + } else if (acmd->cmd_cdblen == CDB_GROUP4) { + /* 16-byte cdb */ ddi_put32(acc_handle, &ldio->lba_count, ( - ((uint16_t)(pkt->pkt_cdbp[13])) | - ((uint16_t)(pkt->pkt_cdbp[12]) << 8) | - ((uint16_t)(pkt->pkt_cdbp[11]) << 16) | - ((uint16_t)(pkt->pkt_cdbp[10]) << 24))); + ((uint32_t)(pkt->pkt_cdbp[13])) | + ((uint32_t)(pkt->pkt_cdbp[12]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[11]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[10]) << 24))); ddi_put32(acc_handle, &ldio->start_lba_lo, ( ((uint32_t)(pkt->pkt_cdbp[9])) | @@ -4044,7 +5354,7 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, ((uint32_t)(pkt->pkt_cdbp[7]) << 16) | ((uint32_t)(pkt->pkt_cdbp[6]) << 24))); - ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ddi_put32(acc_handle, &ldio->start_lba_hi, ( ((uint32_t)(pkt->pkt_cdbp[5])) | ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | @@ -4090,8 +5400,12 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, ddi_put8(acc_handle, &pthru->cdb_len, acmd->cmd_cdblen); ddi_put16(acc_handle, &pthru->timeout, 0); ddi_put16(acc_handle, &pthru->flags, flags); + tmp_data_xfer_len = 0; + for (i = 0; i < acmd->cmd_cookiecnt; i++) { + tmp_data_xfer_len += acmd->cmd_dmacookies[i].dmac_size; + } 
ddi_put32(acc_handle, &pthru->data_xfer_len, - acmd->cmd_dmacount); + tmp_data_xfer_len); ddi_put8(acc_handle, &pthru->sge_count, acmd->cmd_cookiecnt); if (instance->flag_ieee) { mfi_sgl_ieee = (struct mrsas_sge_ieee *)&pthru->sgl; @@ -4142,7 +5456,16 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, return (cmd); } + #ifndef __sparc +/* + * wait_for_outstanding - Wait for all outstanding cmds + * @instance: Adapter soft state + * + * This function waits for upto MRDRV_RESET_WAIT_TIME seconds for FW to + * complete all its outstanding commands. Returns error if one or more IOs + * are pending after this time period. + */ static int wait_for_outstanding(struct mrsas_instance *instance) { @@ -4153,6 +5476,7 @@ wait_for_outstanding(struct mrsas_instance *instance) if (!instance->fw_outstanding) { break; } + drv_usecwait(MILLISEC); /* wait for 1000 usecs */; } @@ -4162,7 +5486,8 @@ wait_for_outstanding(struct mrsas_instance *instance) return (0); } -#endif /* __sparc */ +#endif /* __sparc */ + /* * issue_mfi_pthru */ @@ -4173,6 +5498,7 @@ issue_mfi_pthru(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, void *ubuf; uint32_t kphys_addr = 0; uint32_t xferlen = 0; + uint32_t new_xfer_length = 0; uint_t model; ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; dma_obj_t pthru_dma_obj; @@ -4183,24 +5509,24 @@ issue_mfi_pthru(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, kpthru = (struct mrsas_pthru_frame *)&ioctl->frame[0]; if (instance->adapterresetinprogress) { - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: Reset flag set, " + con_log(CL_ANN1, (CE_WARN, "issue_mfi_pthru: Reset flag set, " "returning mfi_pkt and setting TRAN_BUSY\n")); return (DDI_FAILURE); } model = ddi_model_convert_from(mode & FMODELS); if (model == DDI_MODEL_ILP32) { - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_pthru: DDI_MODEL_LP32")); xferlen = kpthru->sgl.sge32[0].length; ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; } else { #ifdef _ILP32 - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_pthru: DDI_MODEL_LP32")); xferlen = kpthru->sgl.sge32[0].length; ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; #else - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP64")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_pthru: DDI_MODEL_LP64")); xferlen = kpthru->sgl.sge64[0].length; ubuf = (void *)(ulong_t)kpthru->sgl.sge64[0].phys_addr; #endif @@ -4209,7 +5535,10 @@ issue_mfi_pthru(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, if (xferlen) { /* means IOCTL requires DMA */ /* allocate the data transfer buffer */ - pthru_dma_obj.size = xferlen; + /* pthru_dma_obj.size = xferlen; */ + MRSAS_GET_BOUNDARY_ALIGNED_LEN(xferlen, new_xfer_length, + PAGESIZE); + pthru_dma_obj.size = new_xfer_length; pthru_dma_obj.dma_attr = mrsas_generic_dma_attr; pthru_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; pthru_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; @@ -4243,7 +5572,7 @@ issue_mfi_pthru(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, } ddi_put8(acc_handle, &pthru->cmd, kpthru->cmd); - ddi_put8(acc_handle, &pthru->sense_len, 0); + ddi_put8(acc_handle, &pthru->sense_len, SENSE_LENGTH); ddi_put8(acc_handle, &pthru->cmd_status, 0); ddi_put8(acc_handle, &pthru->scsi_status, 0); ddi_put8(acc_handle, &pthru->target_id, kpthru->target_id); @@ -4254,8 +5583,8 @@ issue_mfi_pthru(struct mrsas_instance *instance, struct mrsas_ioctl 
*ioctl, ddi_put32(acc_handle, &pthru->data_xfer_len, kpthru->data_xfer_len); ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); - /* pthru->sense_buf_phys_addr_lo = cmd->sense_phys_addr; */ - ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, 0); + pthru->sense_buf_phys_addr_lo = cmd->sense_phys_addr; + /* ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, 0); */ ddi_rep_put8(acc_handle, (uint8_t *)kpthru->cdb, (uint8_t *)pthru->cdb, pthru->cdb_len, DDI_DEV_AUTOINCR); @@ -4267,6 +5596,10 @@ issue_mfi_pthru(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, cmd->sync_cmd = MRSAS_TRUE; cmd->frame_count = 1; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { con_log(CL_ANN, (CE_WARN, "issue_mfi_pthru: fw_ioctl failed")); @@ -4288,11 +5621,35 @@ issue_mfi_pthru(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, kpthru->cmd_status = ddi_get8(acc_handle, &pthru->cmd_status); kpthru->scsi_status = ddi_get8(acc_handle, &pthru->scsi_status); - con_log(CL_ANN, (CE_NOTE, "issue_mfi_pthru: cmd_status %x, " + con_log(CL_ANN, (CE_CONT, "issue_mfi_pthru: cmd_status %x, " "scsi_status %x", kpthru->cmd_status, kpthru->scsi_status)); DTRACE_PROBE3(issue_pthru, uint8_t, kpthru->cmd, uint8_t, kpthru->cmd_status, uint8_t, kpthru->scsi_status); + if (kpthru->sense_len) { + uint_t sense_len = SENSE_LENGTH; + void *sense_ubuf = + (void *)(ulong_t)kpthru->sense_buf_phys_addr_lo; + if (kpthru->sense_len <= SENSE_LENGTH) { + sense_len = kpthru->sense_len; + } + + for (i = 0; i < sense_len; i++) { + if (ddi_copyout( + (uint8_t *)cmd->sense+i, + (uint8_t *)sense_ubuf+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy to user space failed")); + } + con_log(CL_DLEVEL1, (CE_WARN, + "Copying Sense info sense_buff[%d] = 0x%X\n", + i, *((uint8_t *)cmd->sense + i))); + } + } + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0, + DDI_DMA_SYNC_FORDEV); + if (xferlen) { /* free kernel buffer */ if (mrsas_free_dma_obj(instance, pthru_dma_obj) != DDI_SUCCESS) @@ -4312,6 +5669,7 @@ issue_mfi_dcmd(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, void *ubuf; uint32_t kphys_addr = 0; uint32_t xferlen = 0; + uint32_t new_xfer_length = 0; uint32_t model; dma_obj_t dcmd_dma_obj; struct mrsas_dcmd_frame *kdcmd; @@ -4320,25 +5678,26 @@ issue_mfi_dcmd(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, int i; dcmd = &cmd->frame->dcmd; kdcmd = (struct mrsas_dcmd_frame *)&ioctl->frame[0]; + if (instance->adapterresetinprogress) { - con_log(CL_ANN1, (CE_NOTE, "Reset flag set, " + con_log(CL_ANN1, (CE_WARN, "Reset flag set, " "returning mfi_pkt and setting TRAN_BUSY\n")); return (DDI_FAILURE); } model = ddi_model_convert_from(mode & FMODELS); if (model == DDI_MODEL_ILP32) { - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_dcmd: DDI_MODEL_ILP32")); xferlen = kdcmd->sgl.sge32[0].length; ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; } else { #ifdef _ILP32 - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_dcmd: DDI_MODEL_ILP32")); xferlen = kdcmd->sgl.sge32[0].length; ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; #else - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_LP64")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_dcmd: DDI_MODEL_LP64")); xferlen = kdcmd->sgl.sge64[0].length; ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; #endif @@ 
-4346,7 +5705,10 @@ issue_mfi_dcmd(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, if (xferlen) { /* means IOCTL requires DMA */ /* allocate the data transfer buffer */ - dcmd_dma_obj.size = xferlen; + /* dcmd_dma_obj.size = xferlen; */ + MRSAS_GET_BOUNDARY_ALIGNED_LEN(xferlen, new_xfer_length, + PAGESIZE); + dcmd_dma_obj.size = new_xfer_length; dcmd_dma_obj.dma_attr = mrsas_generic_dma_attr; dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; @@ -4354,12 +5716,13 @@ issue_mfi_dcmd(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, dcmd_dma_obj.dma_attr.dma_attr_align = 1; /* allocate kernel buffer for DMA */ - if (mrsas_alloc_dma_obj(instance, &dcmd_dma_obj, - (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { - con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: " - "could not allocate data transfer buffer.")); - return (DDI_FAILURE); - } + if (mrsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, + (CE_WARN, "issue_mfi_dcmd: could not " + "allocate data transfer buffer.")); + return (DDI_FAILURE); + } (void) memset(dcmd_dma_obj.buffer, 0, xferlen); /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ @@ -4396,6 +5759,10 @@ issue_mfi_dcmd(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, cmd->sync_cmd = MRSAS_TRUE; cmd->frame_count = 1; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: fw_ioctl failed")); } else { @@ -4415,6 +5782,8 @@ issue_mfi_dcmd(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, } kdcmd->cmd_status = ddi_get8(acc_handle, &dcmd->cmd_status); + con_log(CL_ANN, + (CE_CONT, "issue_mfi_dcmd: cmd_status %x", kdcmd->cmd_status)); DTRACE_PROBE3(issue_dcmd, uint32_t, kdcmd->opcode, uint8_t, kdcmd->cmd, uint8_t, kdcmd->cmd_status); @@ -4438,6 +5807,8 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, void *response_ubuf; uint32_t request_xferlen = 0; uint32_t response_xferlen = 0; + uint32_t new_xfer_length1 = 0; + uint32_t new_xfer_length2 = 0; uint_t model; dma_obj_t request_dma_obj; dma_obj_t response_dma_obj; @@ -4455,44 +5826,44 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, ksmp = (struct mrsas_smp_frame *)&ioctl->frame[0]; if (instance->adapterresetinprogress) { - con_log(CL_ANN1, (CE_NOTE, "Reset flag set, " + con_log(CL_ANN1, (CE_WARN, "Reset flag set, " "returning mfi_pkt and setting TRAN_BUSY\n")); return (DDI_FAILURE); } model = ddi_model_convert_from(mode & FMODELS); if (model == DDI_MODEL_ILP32) { - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: DDI_MODEL_ILP32")); sge32 = &ksmp->sgl[0].sge32[0]; response_xferlen = sge32[0].length; request_xferlen = sge32[1].length; - con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + con_log(CL_ANN, (CE_CONT, "issue_mfi_smp: " "response_xferlen = %x, request_xferlen = %x", response_xferlen, request_xferlen)); response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: " "response_ubuf = %p, request_ubuf = %p", response_ubuf, request_ubuf)); } else { #ifdef _ILP32 - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: DDI_MODEL_ILP32")); sge32 = 
&ksmp->sgl[0].sge32[0]; response_xferlen = sge32[0].length; request_xferlen = sge32[1].length; - con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + con_log(CL_ANN, (CE_CONT, "issue_mfi_smp: " "response_xferlen = %x, request_xferlen = %x", response_xferlen, request_xferlen)); response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: " "response_ubuf = %p, request_ubuf = %p", response_ubuf, request_ubuf)); #else - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_LP64")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: DDI_MODEL_LP64")); sge64 = &ksmp->sgl[0].sge64[0]; response_xferlen = sge64[0].length; @@ -4505,7 +5876,10 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, if (request_xferlen) { /* means IOCTL requires DMA */ /* allocate the data transfer buffer */ - request_dma_obj.size = request_xferlen; + /* request_dma_obj.size = request_xferlen; */ + MRSAS_GET_BOUNDARY_ALIGNED_LEN(request_xferlen, + new_xfer_length1, PAGESIZE); + request_dma_obj.size = new_xfer_length1; request_dma_obj.dma_attr = mrsas_generic_dma_attr; request_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; request_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; @@ -4536,7 +5910,10 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, if (response_xferlen) { /* means IOCTL requires DMA */ /* allocate the data transfer buffer */ - response_dma_obj.size = response_xferlen; + /* response_dma_obj.size = response_xferlen; */ + MRSAS_GET_BOUNDARY_ALIGNED_LEN(response_xferlen, + new_xfer_length2, PAGESIZE); + response_dma_obj.size = new_xfer_length2; response_dma_obj.dma_attr = mrsas_generic_dma_attr; response_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; response_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; @@ -4580,7 +5957,7 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, model = ddi_model_convert_from(mode & FMODELS); if (model == DDI_MODEL_ILP32) { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: DDI_MODEL_ILP32")); sge32 = &smp->sgl[0].sge32[0]; @@ -4592,7 +5969,7 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, request_dma_obj.dma_cookie[0].dmac_address); } else { #ifdef _ILP32 - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: DDI_MODEL_ILP32")); sge32 = &smp->sgl[0].sge32[0]; ddi_put32(acc_handle, &sge32[0].length, response_xferlen); @@ -4602,7 +5979,7 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, ddi_put32(acc_handle, &sge32[1].phys_addr, request_dma_obj.dma_cookie[0].dmac_address); #else - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: DDI_MODEL_LP64")); sge64 = &smp->sgl[0].sge64[0]; ddi_put32(acc_handle, &sge64[0].length, response_xferlen); @@ -4613,7 +5990,7 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, request_dma_obj.dma_cookie[0].dmac_address); #endif } - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp : " + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp : " "smp->response_xferlen = %d, smp->request_xferlen = %d " "smp->data_xfer_len = %d", ddi_get32(acc_handle, &sge32[0].length), ddi_get32(acc_handle, &sge32[1].length), @@ -4622,11 +5999,15 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, cmd->sync_cmd = MRSAS_TRUE; cmd->frame_count = 1; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } 
+ if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: fw_ioctl failed")); } else { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: copy to user space")); if (request_xferlen) { @@ -4660,7 +6041,7 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, ksmp->cmd_status = ddi_get8(acc_handle, &smp->cmd_status); con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: smp->cmd_status = %d", - ddi_get8(acc_handle, &smp->cmd_status))); + ksmp->cmd_status)); DTRACE_PROBE2(issue_smp, uint8_t, ksmp->cmd, uint8_t, ksmp->cmd_status); if (request_xferlen) { @@ -4690,6 +6071,8 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, void *fis_ubuf; void *data_ubuf; uint32_t fis_xferlen = 0; + uint32_t new_xfer_length1 = 0; + uint32_t new_xfer_length2 = 0; uint32_t data_xferlen = 0; uint_t model; dma_obj_t fis_dma_obj; @@ -4703,24 +6086,22 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, kstp = (struct mrsas_stp_frame *)&ioctl->frame[0]; if (instance->adapterresetinprogress) { - con_log(CL_ANN1, (CE_NOTE, "Reset flag set, " + con_log(CL_ANN1, (CE_WARN, "Reset flag set, " "returning mfi_pkt and setting TRAN_BUSY\n")); return (DDI_FAILURE); } model = ddi_model_convert_from(mode & FMODELS); if (model == DDI_MODEL_ILP32) { - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_stp: DDI_MODEL_ILP32")); fis_xferlen = kstp->sgl.sge32[0].length; data_xferlen = kstp->sgl.sge32[1].length; fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; - } - else - { + } else { #ifdef _ILP32 - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_stp: DDI_MODEL_ILP32")); fis_xferlen = kstp->sgl.sge32[0].length; data_xferlen = kstp->sgl.sge32[1].length; @@ -4728,7 +6109,7 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; #else - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_LP64")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_stp: DDI_MODEL_LP64")); fis_xferlen = kstp->sgl.sge64[0].length; data_xferlen = kstp->sgl.sge64[1].length; @@ -4740,12 +6121,15 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, if (fis_xferlen) { - con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: " + con_log(CL_ANN, (CE_CONT, "issue_mfi_stp: " "fis_ubuf = %p fis_xferlen = %x", fis_ubuf, fis_xferlen)); /* means IOCTL requires DMA */ /* allocate the data transfer buffer */ - fis_dma_obj.size = fis_xferlen; + /* fis_dma_obj.size = fis_xferlen; */ + MRSAS_GET_BOUNDARY_ALIGNED_LEN(fis_xferlen, + new_xfer_length1, PAGESIZE); + fis_dma_obj.size = new_xfer_length1; fis_dma_obj.dma_attr = mrsas_generic_dma_attr; fis_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; fis_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; @@ -4773,19 +6157,22 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, } if (data_xferlen) { - con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: data_ubuf = %p " + con_log(CL_ANN, (CE_CONT, "issue_mfi_stp: data_ubuf = %p " "data_xferlen = %x", data_ubuf, data_xferlen)); /* means IOCTL requires DMA */ /* allocate the data transfer buffer */ - data_dma_obj.size = data_xferlen; + /* data_dma_obj.size = data_xferlen; */ + MRSAS_GET_BOUNDARY_ALIGNED_LEN(data_xferlen, 
new_xfer_length2, + PAGESIZE); + data_dma_obj.size = new_xfer_length2; data_dma_obj.dma_attr = mrsas_generic_dma_attr; data_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; data_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; data_dma_obj.dma_attr.dma_attr_sgllen = 1; data_dma_obj.dma_attr.dma_attr_align = 1; -/* allocate kernel buffer for DMA */ + /* allocate kernel buffer for DMA */ if (mrsas_alloc_dma_obj(instance, &data_dma_obj, (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " @@ -4829,6 +6216,10 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, cmd->sync_cmd = MRSAS_TRUE; cmd->frame_count = 1; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: fw_ioctl failed")); } else { @@ -4860,6 +6251,8 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, } kstp->cmd_status = ddi_get8(acc_handle, &stp->cmd_status); + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: stp->cmd_status = %d", + kstp->cmd_status)); DTRACE_PROBE2(issue_stp, uint8_t, kstp->cmd, uint8_t, kstp->cmd_status); if (fis_xferlen) { @@ -4880,7 +6273,7 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, /* * fill_up_drv_ver */ -static void +void fill_up_drv_ver(struct mrsas_drv_ver *dv) { (void) memset(dv, 0, sizeof (struct mrsas_drv_ver)); @@ -4891,6 +6284,7 @@ fill_up_drv_ver(struct mrsas_drv_ver *dv) (void) memcpy(dv->drv_ver, MRSAS_VERSION, strlen(MRSAS_VERSION)); (void) memcpy(dv->drv_rel_date, MRSAS_RELDATE, strlen(MRSAS_RELDATE)); + } /* @@ -4917,7 +6311,7 @@ handle_drv_ioctl(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, model = ddi_model_convert_from(mode & FMODELS); if (model == DDI_MODEL_ILP32) { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "handle_drv_ioctl: DDI_MODEL_ILP32")); xferlen = kdcmd->sgl.sge32[0].length; @@ -4925,23 +6319,23 @@ handle_drv_ioctl(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; } else { #ifdef _ILP32 - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "handle_drv_ioctl: DDI_MODEL_ILP32")); xferlen = kdcmd->sgl.sge32[0].length; ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; #else - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "handle_drv_ioctl: DDI_MODEL_LP64")); xferlen = kdcmd->sgl.sge64[0].length; ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; #endif } - con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + con_log(CL_ANN1, (CE_CONT, "handle_drv_ioctl: " "dataBuf=%p size=%d bytes", ubuf, xferlen)); switch (kdcmd->opcode) { case MRSAS_DRIVER_IOCTL_DRIVER_VERSION: - con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + con_log(CL_ANN1, (CE_CONT, "handle_drv_ioctl: " "MRSAS_DRIVER_IOCTL_DRIVER_VERSION")); fill_up_drv_ver(&dv); @@ -5017,8 +6411,11 @@ handle_mfi_ioctl(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, struct mrsas_header *hdr; struct mrsas_cmd *cmd; - cmd = get_mfi_pkt(instance); - + if (instance->tbolt) { + cmd = get_raid_msg_mfi_pkt(instance); + } else { + cmd = get_mfi_pkt(instance); + } if (!cmd) { con_log(CL_ANN, (CE_WARN, "mr_sas: " "failed to get a cmd packet")); @@ -5026,7 +6423,6 @@ handle_mfi_ioctl(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, instance->fw_outstanding, uint16_t, instance->max_fw_cmds); return (DDI_FAILURE); } - cmd->retry_count_for_ocr = 0; /* Clear the frame buffer and assign back 
the context id */ (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); @@ -5059,7 +6455,11 @@ handle_mfi_ioctl(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, if (mrsas_common_check(instance, cmd) != DDI_SUCCESS) rval = DDI_FAILURE; - return_mfi_pkt(instance, cmd); + if (instance->tbolt) { + return_raid_msg_mfi_pkt(instance, cmd); + } else { + return_mfi_pkt(instance, cmd); + } return (rval); } @@ -5091,6 +6491,7 @@ register_mfi_aen(struct mrsas_instance *instance, uint32_t seq_num, union mrsas_evt_class_locale curr_aen; union mrsas_evt_class_locale prev_aen; + con_log(CL_ANN, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); /* * If there an AEN pending already (aen_cmd), check if the * class_locale of that pending AEN is inclusive of the new @@ -5151,14 +6552,18 @@ register_mfi_aen(struct mrsas_instance *instance, uint32_t seq_num, curr_aen.members.locale = LE_16(curr_aen.members.locale); } - cmd = get_mfi_pkt(instance); + if (instance->tbolt) { + cmd = get_raid_msg_mfi_pkt(instance); + } else { + cmd = get_mfi_pkt(instance); + } if (!cmd) { DTRACE_PROBE2(mfi_aen_err, uint16_t, instance->fw_outstanding, uint16_t, instance->max_fw_cmds); return (ENOMEM); } - cmd->retry_count_for_ocr = 0; + /* Clear the frame buffer and assign back the context id */ (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, @@ -5207,12 +6612,15 @@ register_mfi_aen(struct mrsas_instance *instance, uint32_t seq_num, /* Issue the aen registration frame */ /* atomic_add_16 (&instance->fw_outstanding, 1); */ + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } instance->func_ptr->issue_cmd(cmd, instance); return (0); } -static void +void display_scsi_inquiry(caddr_t scsi_inq) { #define MAX_SCSI_DEVICE_CODE 14 @@ -5220,38 +6628,38 @@ display_scsi_inquiry(caddr_t scsi_inq) char inquiry_buf[256] = {0}; int len; const char *const scsi_device_types[] = { - "Direct-Access ", + "Direct-Access ", "Sequential-Access", - "Printer ", - "Processor ", - "WORM ", - "CD-ROM ", - "Scanner ", - "Optical Device ", - "Medium Changer ", - "Communications ", - "Unknown ", - "Unknown ", - "Unknown ", - "Enclosure ", + "Printer ", + "Processor ", + "WORM ", + "CD-ROM ", + "Scanner ", + "Optical Device ", + "Medium Changer ", + "Communications ", + "Unknown ", + "Unknown ", + "Unknown ", + "Enclosure ", }; len = 0; - len += snprintf(inquiry_buf + len, 265 - len, " Vendor: "); + len += snprintf(inquiry_buf + len, 265 - len, " Vendor: "); for (i = 8; i < 16; i++) { len += snprintf(inquiry_buf + len, 265 - len, "%c", scsi_inq[i]); } - len += snprintf(inquiry_buf + len, 265 - len, " Model: "); + len += snprintf(inquiry_buf + len, 265 - len, " Model: "); for (i = 16; i < 32; i++) { len += snprintf(inquiry_buf + len, 265 - len, "%c", scsi_inq[i]); } - len += snprintf(inquiry_buf + len, 265 - len, " Rev: "); + len += snprintf(inquiry_buf + len, 265 - len, " Rev: "); for (i = 32; i < 36; i++) { len += snprintf(inquiry_buf + len, 265 - len, "%c", @@ -5264,13 +6672,13 @@ display_scsi_inquiry(caddr_t scsi_inq) i = scsi_inq[0] & 0x1f; - len += snprintf(inquiry_buf + len, 265 - len, " Type: %s ", + len += snprintf(inquiry_buf + len, 265 - len, " Type: %s ", i < MAX_SCSI_DEVICE_CODE ? 
scsi_device_types[i] : - "Unknown "); + "Unknown "); len += snprintf(inquiry_buf + len, 265 - len, - " ANSI SCSI revision: %02x", scsi_inq[2] & 0x07); + " ANSI SCSI revision: %02x", scsi_inq[2] & 0x07); if ((scsi_inq[2] & 0x07) == 1 && (scsi_inq[3] & 0x0f) == 1) { len += snprintf(inquiry_buf + len, 265 - len, " CCS\n"); @@ -5278,7 +6686,7 @@ display_scsi_inquiry(caddr_t scsi_inq) len += snprintf(inquiry_buf + len, 265 - len, "\n"); } - con_log(CL_ANN1, (CE_CONT, inquiry_buf)); + con_log(CL_DLEVEL2, (CE_CONT, inquiry_buf)); } static void @@ -5294,8 +6702,9 @@ io_timeout_checker(void *arg) mlist_t process_list; if (instance->adapterresetinprogress == 1) { - con_log(CL_ANN1, (CE_NOTE, "io_timeout_checker" + con_log(CL_ANN, (CE_NOTE, "io_timeout_checker:" " reset in progress")); + instance->timeout_id = timeout(io_timeout_checker, (void *) instance, drv_usectohz(MRSAS_1_SECOND)); return; @@ -5303,10 +6712,18 @@ io_timeout_checker(void *arg) /* See if this check needs to be in the beginning or last in ISR */ if (mrsas_initiate_ocr_if_fw_is_faulty(instance) == 1) { - con_log(CL_ANN1, (CE_NOTE, - "Fw Fault state Handling in io_timeout_checker")); + cmn_err(CE_WARN, "io_timeout_checker: " + "FW Fault, calling reset adapter"); + cmn_err(CE_CONT, "io_timeout_checker: " + "fw_outstanding 0x%X max_fw_cmds 0x%X", + instance->fw_outstanding, instance->max_fw_cmds); if (instance->adapterresetinprogress == 0) { - (void) mrsas_reset_ppc(instance); + instance->adapterresetinprogress = 1; + if (instance->tbolt) + (void) mrsas_tbolt_reset_ppc(instance); + else + (void) mrsas_reset_ppc(instance); + instance->adapterresetinprogress = 0; } instance->timeout_id = timeout(io_timeout_checker, (void *) instance, drv_usectohz(MRSAS_1_SECOND)); @@ -5337,10 +6754,12 @@ io_timeout_checker(void *arg) time = --cmd->drv_pkt_time; } if (time <= 0) { - con_log(CL_ANN1, (CE_NOTE, "%llx: " - "io_timeout_checker: TIMING OUT: pkt " - ": %p, cmd %p", gethrtime(), (void *)pkt, - (void *)cmd)); + cmn_err(CE_WARN, "%llx: " + "io_timeout_checker: TIMING OUT: pkt: %p, " + "cmd %p fw_outstanding 0x%X max_fw_cmds 0x%X\n", + gethrtime(), (void *)pkt, (void *)cmd, + instance->fw_outstanding, instance->max_fw_cmds); + counter++; break; } @@ -5348,52 +6767,57 @@ io_timeout_checker(void *arg) mutex_exit(&instance->cmd_pend_mtx); if (counter) { - con_log(CL_ANN1, (CE_NOTE, - "io_timeout_checker " - "cmd->retrycount_for_ocr %d, " - "cmd index %d , cmd address %p ", - cmd->retry_count_for_ocr+1, cmd->index, (void *)cmd)); - if (instance->disable_online_ctrl_reset == 1) { - con_log(CL_ANN1, (CE_NOTE, "mrsas: " - "OCR is not supported by the Firmware " - "Failing all the queued packets \n")); + cmn_err(CE_WARN, "mr_sas %d: %s(): OCR is NOT " + "supported by Firmware, KILL adapter!!!", + instance->instance, __func__); + + if (instance->tbolt) + mrsas_tbolt_kill_adapter(instance); + else + (void) mrsas_kill_adapter(instance); - (void) mrsas_kill_adapter(instance); return; } else { - if (cmd->retry_count_for_ocr <= IO_RETRY_COUNT) { + if (cmd->retry_count_for_ocr <= IO_RETRY_COUNT) { if (instance->adapterresetinprogress == 0) { - con_log(CL_ANN1, (CE_NOTE, "mrsas: " - "OCR is supported by FW " - "triggering mrsas_reset_ppc")); - (void) mrsas_reset_ppc(instance); + if (instance->tbolt) { + (void) mrsas_tbolt_reset_ppc( + instance); + } else { + (void) mrsas_reset_ppc( + instance); + } } } else { - con_log(CL_ANN1, (CE_NOTE, - "io_timeout_checker:" - " cmdindex: %d,cmd address: %p " + cmn_err(CE_WARN, + "io_timeout_checker: " + "cmd %p cmd->index %d " 
"timed out even after 3 resets: " - "so kill adapter", cmd->index, - (void *)cmd)); - (void) mrsas_kill_adapter(instance); + "so KILL adapter", (void *)cmd, cmd->index); + + mrsas_print_cmd_details(instance, cmd, 0xDD); + + if (instance->tbolt) + mrsas_tbolt_kill_adapter(instance); + else + (void) mrsas_kill_adapter(instance); return; } } } - - - con_log(CL_ANN1, (CE_NOTE, "mrsas: " + con_log(CL_ANN, (CE_NOTE, "mrsas: " "schedule next timeout check: " "do timeout \n")); instance->timeout_id = timeout(io_timeout_checker, (void *)instance, drv_usectohz(MRSAS_1_SECOND)); } -static int + +static uint32_t read_fw_status_reg_ppc(struct mrsas_instance *instance) { - return ((int)RD_OB_SCRATCH_PAD_0(instance)); + return ((uint32_t)RD_OB_SCRATCH_PAD_0(instance)); } static void @@ -5404,7 +6828,7 @@ issue_cmd_ppc(struct mrsas_cmd *cmd, struct mrsas_instance *instance) pkt = cmd->pkt; if (pkt) { - con_log(CL_ANN1, (CE_CONT, "%llx : issue_cmd_ppc:" + con_log(CL_DLEVEL1, (CE_NOTE, "%llx : issue_cmd_ppc:" "ISSUED CMD TO FW : called : cmd:" ": %p instance : %p pkt : %p pkt_time : %x\n", gethrtime(), (void *)cmd, (void *)instance, @@ -5417,13 +6841,18 @@ issue_cmd_ppc(struct mrsas_cmd *cmd, struct mrsas_instance *instance) } } else { - con_log(CL_ANN1, (CE_CONT, "%llx : issue_cmd_ppc:" + con_log(CL_DLEVEL1, (CE_NOTE, "%llx : issue_cmd_ppc:" "ISSUED CMD TO FW : called : cmd : %p, instance: %p" "(NO PKT)\n", gethrtime(), (void *)cmd, (void *)instance)); } + + mutex_enter(&instance->reg_write_mtx); + ASSERT(mutex_owned(&instance->reg_write_mtx)); /* Issue the command to the FW */ WR_IB_QPORT((cmd->frame_phys_addr) | (((cmd->frame_count - 1) << 1) | 1), instance); + mutex_exit(&instance->reg_write_mtx); + } /* @@ -5444,10 +6873,12 @@ struct mrsas_cmd *cmd) cmd->frame_dma_obj.acc_handle, &hdr->timeout); if (cmd->drv_pkt_time < debug_timeout_g) cmd->drv_pkt_time = (uint16_t)debug_timeout_g; + con_log(CL_ANN1, (CE_NOTE, "sync_mode_ppc: " "issue and return in reset case\n")); WR_IB_QPORT((cmd->frame_phys_addr) | (((cmd->frame_count - 1) << 1) | 1), instance); + return (DDI_SUCCESS); } else { con_log(CL_ANN1, (CE_NOTE, "sync_mode_ppc: pushing the pkt\n")); @@ -5456,15 +6887,17 @@ struct mrsas_cmd *cmd) cmd->cmd_status = ENODATA; + mutex_enter(&instance->reg_write_mtx); + ASSERT(mutex_owned(&instance->reg_write_mtx)); + /* Issue the command to the FW */ WR_IB_QPORT((cmd->frame_phys_addr) | (((cmd->frame_count - 1) << 1) | 1), instance); + mutex_exit(&instance->reg_write_mtx); mutex_enter(&instance->int_cmd_mtx); - for (i = 0; i < msecs && (cmd->cmd_status == ENODATA); i++) { cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx); } - mutex_exit(&instance->int_cmd_mtx); con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: done")); @@ -5494,7 +6927,7 @@ issue_cmd_in_poll_mode_ppc(struct mrsas_instance *instance, ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status, MFI_CMD_STATUS_POLL_MODE); flags = ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags); - flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; + flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags); @@ -5511,7 +6944,7 @@ issue_cmd_in_poll_mode_ppc(struct mrsas_instance *instance, if (ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) == MFI_CMD_STATUS_POLL_MODE) { - con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_poll_mode: " + con_log(CL_ANN, (CE_NOTE, "issue_cmd_in_poll_mode: " "cmd polling timed out")); return (DDI_FAILURE); } @@ -5607,18 +7040,18 @@ intr_ack_ppc(struct 
mrsas_instance *instance) static int mrsas_kill_adapter(struct mrsas_instance *instance) { - if (instance->deadadapter == 1) - return (DDI_FAILURE); + if (instance->deadadapter == 1) + return (DDI_FAILURE); - con_log(CL_ANN1, (CE_NOTE, "mrsas_kill_adapter: " - "Writing to doorbell with MFI_STOP_ADP ")); - mutex_enter(&instance->ocr_flags_mtx); - instance->deadadapter = 1; - mutex_exit(&instance->ocr_flags_mtx); - instance->func_ptr->disable_intr(instance); - WR_IB_DOORBELL(MFI_STOP_ADP, instance); - (void) mrsas_complete_pending_cmds(instance); - return (DDI_SUCCESS); + con_log(CL_ANN1, (CE_NOTE, "mrsas_kill_adapter: " + "Writing to doorbell with MFI_STOP_ADP ")); + mutex_enter(&instance->ocr_flags_mtx); + instance->deadadapter = 1; + mutex_exit(&instance->ocr_flags_mtx); + instance->func_ptr->disable_intr(instance); + WR_IB_DOORBELL(MFI_STOP_ADP, instance); + (void) mrsas_complete_pending_cmds(instance); + return (DDI_SUCCESS); } @@ -5630,9 +7063,11 @@ mrsas_reset_ppc(struct mrsas_instance *instance) uint32_t cur_abs_reg_val; uint32_t fw_state; + con_log(CL_ANN, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + if (instance->deadadapter == 1) { - con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " - "no more resets as HBA has been marked dead ")); + cmn_err(CE_WARN, "mrsas_reset_ppc: " + "no more resets as HBA has been marked dead "); return (DDI_FAILURE); } mutex_enter(&instance->ocr_flags_mtx); @@ -5640,6 +7075,7 @@ mrsas_reset_ppc(struct mrsas_instance *instance) mutex_exit(&instance->ocr_flags_mtx); con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: adpterresetinprogress " "flag set, time %llx", gethrtime())); + instance->func_ptr->disable_intr(instance); retry_reset: WR_IB_WRITE_SEQ(0, instance); @@ -5657,8 +7093,8 @@ retry_reset: delay(100 * drv_usectohz(MILLISEC)); status = RD_OB_DRWE(instance); if (retry++ == 100) { - con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: DRWE bit " - "check retry count %d\n", retry)); + cmn_err(CE_WARN, "mrsas_reset_ppc: DRWE bit " + "check retry count %d\n", retry); return (DDI_FAILURE); } } @@ -5669,11 +7105,14 @@ retry_reset: delay(100 * drv_usectohz(MILLISEC)); status = RD_OB_DRWE(instance); if (retry++ == 100) { + cmn_err(CE_WARN, "mrsas_reset_ppc: " + "RESET FAILED. 
KILL adapter called\n."); + (void) mrsas_kill_adapter(instance); return (DDI_FAILURE); } } - con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: Adapter reset complete")); + con_log(CL_ANN, (CE_NOTE, "mrsas_reset_ppc: Adapter reset complete")); con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " "Calling mfi_state_transition_to_ready")); @@ -5700,15 +7139,18 @@ retry_reset: instance->fw_fault_count_after_ocr++; if (instance->fw_fault_count_after_ocr < MAX_FW_RESET_COUNT) { - con_log(CL_ANN1, (CE_WARN, "mrsas_reset_ppc: " - "FW is in fault after OCR count %d ", - instance->fw_fault_count_after_ocr)); + cmn_err(CE_WARN, "mrsas_reset_ppc: " + "FW is in fault after OCR count %d " + "Retry Reset", + instance->fw_fault_count_after_ocr); goto retry_reset; } else { - con_log(CL_ANN1, (CE_WARN, "mrsas_reset_ppc: " - "Max Reset Count exceeded " - "Mark HBA as bad")); + cmn_err(CE_WARN, "mrsas_reset_ppc: " + "Max Reset Count exceeded >%d" + "Mark HBA as bad, KILL adapter", + MAX_FW_RESET_COUNT); + (void) mrsas_kill_adapter(instance); return (DDI_FAILURE); } @@ -5734,37 +7176,52 @@ retry_reset: (void) mrsas_issue_init_mfi(instance); con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " "mrsas_issue_init_mfi Done")); + con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " "Calling mrsas_print_pending_cmd\n")); (void) mrsas_print_pending_cmds(instance); con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " "mrsas_print_pending_cmd done\n")); + instance->func_ptr->enable_intr(instance); instance->fw_outstanding = 0; + con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " "Calling mrsas_issue_pending_cmds")); (void) mrsas_issue_pending_cmds(instance); con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " - "Complete")); + "issue_pending_cmds done.\n")); + con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " "Calling aen registration")); + + + instance->aen_cmd->retry_count_for_ocr = 0; + instance->aen_cmd->drv_pkt_time = 0; + instance->func_ptr->issue_cmd(instance->aen_cmd, instance); con_log(CL_ANN1, (CE_NOTE, "Unsetting adpresetinprogress flag.\n")); + mutex_enter(&instance->ocr_flags_mtx); instance->adapterresetinprogress = 0; mutex_exit(&instance->ocr_flags_mtx); con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " "adpterresetinprogress flag unset")); + con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc done\n")); return (DDI_SUCCESS); } -static int -mrsas_common_check(struct mrsas_instance *instance, - struct mrsas_cmd *cmd) + +/* + * FMA functions. 
+ */ +int +mrsas_common_check(struct mrsas_instance *instance, struct mrsas_cmd *cmd) { int ret = DDI_SUCCESS; - if (mrsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + if (cmd != NULL && + mrsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != DDI_SUCCESS) { ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); if (cmd->pkt != NULL) { @@ -5776,7 +7233,7 @@ mrsas_common_check(struct mrsas_instance *instance, if (mrsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) != DDI_SUCCESS) { ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); - if (cmd->pkt != NULL) { + if (cmd != NULL && cmd->pkt != NULL) { cmd->pkt->pkt_reason = CMD_TRAN_ERR; cmd->pkt->pkt_statistics = 0; } @@ -5785,7 +7242,7 @@ mrsas_common_check(struct mrsas_instance *instance, if (mrsas_check_dma_handle(instance->mfi_evt_detail_obj.dma_handle) != DDI_SUCCESS) { ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); - if (cmd->pkt != NULL) { + if (cmd != NULL && cmd->pkt != NULL) { cmd->pkt->pkt_reason = CMD_TRAN_ERR; cmd->pkt->pkt_statistics = 0; } @@ -5796,7 +7253,7 @@ mrsas_common_check(struct mrsas_instance *instance, ddi_fm_acc_err_clear(instance->regmap_handle, DDI_FME_VER0); - if (cmd->pkt != NULL) { + if (cmd != NULL && cmd->pkt != NULL) { cmd->pkt->pkt_reason = CMD_TRAN_ERR; cmd->pkt->pkt_statistics = 0; } @@ -5940,7 +7397,7 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) int avail, actual, count; int i, flag, ret; - con_log(CL_DLEVEL1, (CE_WARN, "mrsas_add_intrs: intr_type = %x", + con_log(CL_DLEVEL1, (CE_NOTE, "mrsas_add_intrs: intr_type = %x", intr_type)); /* Get number of interrupts */ @@ -5952,7 +7409,7 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) return (DDI_FAILURE); } - con_log(CL_DLEVEL1, (CE_WARN, "mrsas_add_intrs: count = %d ", count)); + con_log(CL_DLEVEL1, (CE_NOTE, "mrsas_add_intrs: count = %d ", count)); /* Get number of available interrupts */ ret = ddi_intr_get_navail(dip, intr_type, &avail); @@ -5962,7 +7419,7 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) return (DDI_FAILURE); } - con_log(CL_DLEVEL1, (CE_WARN, "mrsas_add_intrs: avail = %d ", avail)); + con_log(CL_DLEVEL1, (CE_NOTE, "mrsas_add_intrs: avail = %d ", avail)); /* Only one interrupt routine. So limit the count to 1 */ if (count > 1) { @@ -5973,12 +7430,19 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) * Allocate an array of interrupt handlers. Currently we support * only one interrupt. The framework can be extended later. */ - instance->intr_size = count * sizeof (ddi_intr_handle_t); - instance->intr_htable = kmem_zalloc(instance->intr_size, KM_SLEEP); - ASSERT(instance->intr_htable); + instance->intr_htable_size = count * sizeof (ddi_intr_handle_t); + instance->intr_htable = kmem_zalloc(instance->intr_htable_size, + KM_SLEEP); + if (instance->intr_htable == NULL) { + con_log(CL_ANN, (CE_WARN, "mrsas_add_intrs: " + "failed to allocate memory for intr-handle table")); + instance->intr_htable_size = 0; + return (DDI_FAILURE); + } - flag = ((intr_type == DDI_INTR_TYPE_MSI) || (intr_type == - DDI_INTR_TYPE_MSIX)) ? DDI_INTR_ALLOC_STRICT:DDI_INTR_ALLOC_NORMAL; + flag = ((intr_type == DDI_INTR_TYPE_MSI) || + (intr_type == DDI_INTR_TYPE_MSIX)) ? 
+ DDI_INTR_ALLOC_STRICT : DDI_INTR_ALLOC_NORMAL; /* Allocate interrupt */ ret = ddi_intr_alloc(dip, instance->intr_htable, intr_type, 0, @@ -5987,9 +7451,9 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) if ((ret != DDI_SUCCESS) || (actual == 0)) { con_log(CL_ANN, (CE_WARN, "mrsas_add_intrs: " "avail = %d", avail)); - kmem_free(instance->intr_htable, instance->intr_size); - return (DDI_FAILURE); + goto mrsas_free_htable; } + if (actual < count) { con_log(CL_ANN, (CE_WARN, "mrsas_add_intrs: " "Requested = %d Received = %d", count, actual)); @@ -6003,12 +7467,7 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) &instance->intr_pri)) != DDI_SUCCESS) { con_log(CL_ANN, (CE_WARN, "mrsas_add_intrs: " "get priority call failed")); - - for (i = 0; i < actual; i++) { - (void) ddi_intr_free(instance->intr_htable[i]); - } - kmem_free(instance->intr_htable, instance->intr_size); - return (DDI_FAILURE); + goto mrsas_free_handles; } /* @@ -6017,12 +7476,7 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) if (instance->intr_pri >= ddi_intr_get_hilevel_pri()) { con_log(CL_ANN, (CE_WARN, "mrsas_add_intrs: " "High level interrupts not supported.")); - - for (i = 0; i < actual; i++) { - (void) ddi_intr_free(instance->intr_htable[i]); - } - kmem_free(instance->intr_htable, instance->intr_size); - return (DDI_FAILURE); + goto mrsas_free_handles; } con_log(CL_DLEVEL1, (CE_NOTE, "mrsas_add_intrs: intr_pri = 0x%x ", @@ -6037,31 +7491,18 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) if (ret != DDI_SUCCESS) { con_log(CL_ANN, (CE_WARN, "mrsas_add_intrs:" "failed %d", ret)); - - for (i = 0; i < actual; i++) { - (void) ddi_intr_free(instance->intr_htable[i]); - } - kmem_free(instance->intr_htable, instance->intr_size); - return (DDI_FAILURE); + goto mrsas_free_handles; } } - con_log(CL_DLEVEL1, (CE_WARN, " ddi_intr_add_handler done")); + con_log(CL_DLEVEL1, (CE_NOTE, " ddi_intr_add_handler done")); if ((ret = ddi_intr_get_cap(instance->intr_htable[0], &instance->intr_cap)) != DDI_SUCCESS) { con_log(CL_ANN, (CE_WARN, "ddi_intr_get_cap() failed %d", ret)); - - /* Free already allocated intr */ - for (i = 0; i < actual; i++) { - (void) ddi_intr_remove_handler( - instance->intr_htable[i]); - (void) ddi_intr_free(instance->intr_htable[i]); - } - kmem_free(instance->intr_htable, instance->intr_size); - return (DDI_FAILURE); + goto mrsas_free_handlers; } if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { @@ -6081,6 +7522,23 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) return (DDI_SUCCESS); +mrsas_free_handlers: + for (i = 0; i < actual; i++) + (void) ddi_intr_remove_handler(instance->intr_htable[i]); + +mrsas_free_handles: + for (i = 0; i < actual; i++) + (void) ddi_intr_free(instance->intr_htable[i]); + +mrsas_free_htable: + if (instance->intr_htable != NULL) + kmem_free(instance->intr_htable, instance->intr_htable_size); + + instance->intr_htable = NULL; + instance->intr_htable_size = 0; + + return (DDI_FAILURE); + } @@ -6108,7 +7566,12 @@ mrsas_rem_intrs(struct mrsas_instance *instance) (void) ddi_intr_free(instance->intr_htable[i]); } - kmem_free(instance->intr_htable, instance->intr_size); + if (instance->intr_htable != NULL) + kmem_free(instance->intr_htable, instance->intr_htable_size); + + instance->intr_htable = NULL; + instance->intr_htable_size = 0; + } static int @@ -6117,7 +7580,7 @@ mrsas_tran_bus_config(dev_info_t *parent, uint_t flags, { struct mrsas_instance *instance; int config; - int rval; + int rval = NDI_SUCCESS; 
char *ptr = NULL; int tgt, lun; @@ -6148,6 +7611,11 @@ mrsas_tran_bus_config(dev_info_t *parent, uint_t flags, if (lun == 0) { rval = mrsas_config_ld(instance, tgt, lun, childp); +#ifdef PDSUPPORT + } else if (instance->tbolt == 1 && lun != 0) { + rval = mrsas_tbolt_config_pd(instance, + tgt, lun, childp); +#endif } else { rval = NDI_FAILURE; } @@ -6185,6 +7653,15 @@ mrsas_config_all_devices(struct mrsas_instance *instance) } +#ifdef PDSUPPORT + /* Config PD devices connected to the card */ + if (instance->tbolt) { + for (tgt = 0; tgt < instance->mr_tbolt_pd_max; tgt++) { + (void) mrsas_tbolt_config_pd(instance, tgt, 1, NULL); + } + } +#endif + rval = NDI_SUCCESS; return (rval); } @@ -6241,20 +7718,30 @@ mrsas_config_ld(struct mrsas_instance *instance, uint16_t tgt, dev_info_t *child; int rval; - con_log(CL_ANN1, (CE_NOTE, "mrsas_config_ld: t = %d l = %d", + con_log(CL_DLEVEL1, (CE_NOTE, "mrsas_config_ld: t = %d l = %d", tgt, lun)); if ((child = mrsas_find_child(instance, tgt, lun)) != NULL) { if (ldip) { *ldip = child; } - con_log(CL_ANN1, (CE_NOTE, - "mrsas_config_ld: Child = %p found t = %d l = %d", - (void *)child, tgt, lun)); + if (instance->mr_ld_list[tgt].flag != MRDRV_TGT_VALID) { + rval = mrsas_service_evt(instance, tgt, 0, + MRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, + "mr_sas: DELETING STALE ENTRY rval = %d " + "tgt id = %d ", rval, tgt)); + return (NDI_FAILURE); + } return (NDI_SUCCESS); } sd = kmem_zalloc(sizeof (struct scsi_device), KM_SLEEP); + if (sd == NULL) { + con_log(CL_ANN1, (CE_WARN, "mrsas_config_ld: " + "failed to allocate mem for scsi_device")); + return (NDI_FAILURE); + } sd->sd_address.a_hba_tran = instance->tran; sd->sd_address.a_target = (uint16_t)tgt; sd->sd_address.a_lun = (uint8_t)lun; @@ -6271,12 +7758,12 @@ mrsas_config_ld(struct mrsas_instance *instance, uint16_t tgt, } kmem_free(sd, sizeof (struct scsi_device)); - con_log(CL_ANN1, (CE_NOTE, "mrsas_config_ld: return rval = %d", + con_log(CL_DLEVEL1, (CE_NOTE, "mrsas_config_ld: return rval = %d", rval)); return (rval); } -static int +int mrsas_config_scsi_device(struct mrsas_instance *instance, struct scsi_device *sd, dev_info_t **dipp) { @@ -6290,7 +7777,7 @@ mrsas_config_scsi_device(struct mrsas_instance *instance, int dtype = sd->sd_inq->inq_dtype & DTYPE_MASK; int rval; - con_log(CL_ANN1, (CE_WARN, "mr_sas: scsi_device t%dL%d", tgt, lun)); + con_log(CL_DLEVEL1, (CE_NOTE, "mr_sas: scsi_device t%dL%d", tgt, lun)); scsi_hba_nodename_compatible_get(sd->sd_inq, NULL, dtype, NULL, &nodename, &compatible, &ncompatible); @@ -6302,12 +7789,12 @@ mrsas_config_scsi_device(struct mrsas_instance *instance, } childname = (dtype == DTYPE_DIRECT) ? 
"sd" : nodename; - con_log(CL_ANN1, (CE_WARN, + con_log(CL_DLEVEL1, (CE_NOTE, "mr_sas: Childname = %2s nodename = %s", childname, nodename)); /* Create a dev node */ rval = ndi_devi_alloc(instance->dip, childname, DEVI_SID_NODEID, &ldip); - con_log(CL_ANN1, (CE_WARN, + con_log(CL_DLEVEL1, (CE_NOTE, "mr_sas_config_scsi_device: ndi_devi_alloc rval = %x", rval)); if (rval == NDI_SUCCESS) { if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "target", tgt) != @@ -6341,7 +7828,7 @@ mrsas_config_scsi_device(struct mrsas_instance *instance, ndi_prop_remove_all(ldip); (void) ndi_devi_free(ldip); } else { - con_log(CL_ANN1, (CE_WARN, "mr_sas: online Done :" + con_log(CL_ANN1, (CE_CONT, "mr_sas: online Done :" "0 t%dl%d", tgt, lun)); } @@ -6351,7 +7838,7 @@ finish: *dipp = ldip; } - con_log(CL_DLEVEL1, (CE_WARN, + con_log(CL_DLEVEL1, (CE_NOTE, "mr_sas: config_scsi_device rval = %d t%dL%d", rval, tgt, lun)); scsi_hba_nodename_compatible_free(nodename, compatible); @@ -6359,7 +7846,7 @@ finish: } /*ARGSUSED*/ -static int +int mrsas_service_evt(struct mrsas_instance *instance, int tgt, int lun, int event, uint64_t wwn) { @@ -6378,6 +7865,7 @@ mrsas_service_evt(struct mrsas_instance *instance, int tgt, int lun, int event, mrevt->tgt = tgt; mrevt->lun = lun; mrevt->event = event; + mrevt->wwn = wwn; if ((ddi_taskq_dispatch(instance->taskq, (void (*)(void *))mrsas_issue_evt_taskq, mrevt, DDI_NOSLEEP)) != @@ -6405,11 +7893,18 @@ mrsas_issue_evt_taskq(struct mrsas_eventinfo *mrevt) mrevt->tgt, mrevt->lun, mrevt->event)); if (mrevt->tgt < MRDRV_MAX_LD && mrevt->lun == 0) { + mutex_enter(&instance->config_dev_mtx); dip = instance->mr_ld_list[mrevt->tgt].dip; + mutex_exit(&instance->config_dev_mtx); +#ifdef PDSUPPORT } else { - return; + mutex_enter(&instance->config_dev_mtx); + dip = instance->mr_tbolt_pd_list[mrevt->tgt].dip; + mutex_exit(&instance->config_dev_mtx); +#endif } + ndi_devi_enter(instance->dip, &circ1); switch (mrevt->event) { case MRSAS_EVT_CONFIG_TGT: @@ -6418,6 +7913,12 @@ mrsas_issue_evt_taskq(struct mrsas_eventinfo *mrevt) if (mrevt->lun == 0) { (void) mrsas_config_ld(instance, mrevt->tgt, 0, NULL); +#ifdef PDSUPPORT + } else if (instance->tbolt) { + (void) mrsas_tbolt_config_pd(instance, + mrevt->tgt, + 1, NULL); +#endif } con_log(CL_ANN1, (CE_NOTE, "mr_sas: EVT_CONFIG_TGT called:" @@ -6461,11 +7962,12 @@ mrsas_issue_evt_taskq(struct mrsas_eventinfo *mrevt) ndi_devi_exit(instance->dip, circ1); } -static int + +int mrsas_mode_sense_build(struct scsi_pkt *pkt) { union scsi_cdb *cdbp; - uint16_t page_code; + uint16_t page_code; struct scsa_cmd *acmd; struct buf *bp; struct mode_header *modehdrp; diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.conf b/usr/src/uts/common/io/mr_sas/mr_sas.conf index 73bc8253d7..73cb981b48 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas.conf +++ b/usr/src/uts/common/io/mr_sas/mr_sas.conf @@ -1,13 +1,18 @@ # -# Copyright (c) 2008-2009, LSI Logic Corporation. +# Copyright (c) 2008-2012, LSI Logic Corporation. # All rights reserved. # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# # # mr_sas.conf for sol 10 (and later) for all supported architectures # # global definitions +flow_control="dmult" queue="qsort" tape="sctp"; + +# MSI specific flag. To enable MSI modify the flag value to "yes" +mrsas-enable-msi="yes"; + +# Fast-Path specific flag. 
To enable Fast-Path modify the flag value to "yes" +mrsas-enable-fp="yes"; + diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.h b/usr/src/uts/common/io/mr_sas/mr_sas.h index e56bb68d15..3e297baaed 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas.h +++ b/usr/src/uts/common/io/mr_sas/mr_sas.h @@ -2,9 +2,17 @@ * mr_sas.h: header for mr_sas * * Solaris MegaRAID driver for SAS2.0 controllers - * Copyright (c) 2008-2009, LSI Logic Corporation. + * Copyright (c) 2008-2012, LSI Logic Corporation. * All rights reserved. * + * Version: + * Author: + * Swaminathan K S + * Arun Chandrashekhar + * Manju R + * Rasheed + * Shakeel Bukhari + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * @@ -36,6 +44,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. */ + #ifndef _MR_SAS_H_ #define _MR_SAS_H_ @@ -45,12 +54,13 @@ extern "C" { #include <sys/scsi/scsi.h> #include "mr_sas_list.h" +#include "ld_pd_map.h" /* * MegaRAID SAS2.0 Driver meta data */ -#define MRSAS_VERSION "LSIv2.7" -#define MRSAS_RELDATE "Apr 21, 2010" +#define MRSAS_VERSION "6.503.00.00ILLUMOS" +#define MRSAS_RELDATE "July 30, 2012" #define MRSAS_TRUE 1 #define MRSAS_FALSE 0 @@ -58,16 +68,32 @@ extern "C" { #define ADAPTER_RESET_NOT_REQUIRED 0 #define ADAPTER_RESET_REQUIRED 1 +#define PDSUPPORT 1 + +#define SWAP_BYTES(w) ((((w)>>8)&0xFF) | (((w)&0xFF)<<8)) +#define BIG_ENDIAN(d) (SWAP_BYTES((d) >> 16) | (SWAP_BYTES(d) << 16)) /* * MegaRAID SAS2.0 device id conversion definitions. */ #define INST2LSIRDCTL(x) ((x) << INST_MINOR_SHIFT) +#define MRSAS_GET_BOUNDARY_ALIGNED_LEN(len, new_len, boundary_len) { \ + int rem; \ + rem = (len / boundary_len); \ + if ((rem * boundary_len) != len) { \ + new_len = len + ((rem + 1) * boundary_len - len); \ + } else { \ + new_len = len; \ + } \ +} + /* * MegaRAID SAS2.0 supported controllers */ #define PCI_DEVICE_ID_LSI_2108VDE 0x0078 #define PCI_DEVICE_ID_LSI_2108V 0x0079 +#define PCI_DEVICE_ID_LSI_TBOLT 0x005b +#define PCI_DEVICE_ID_LSI_INVADER 0x005d /* * Register Index for 2108 Controllers. 
@@ -75,6 +101,7 @@ extern "C" { #define REGISTER_SET_IO_2108 (2) #define MRSAS_MAX_SGE_CNT 0x50 +#define MRSAS_APP_RESERVED_CMDS 32 #define MRSAS_IOCTL_DRIVER 0x12341234 #define MRSAS_IOCTL_FIRMWARE 0x12345678 @@ -82,13 +109,50 @@ extern "C" { #define MRSAS_1_SECOND 1000000 +#ifdef PDSUPPORT + +#define UNCONFIGURED_GOOD 0x0 +#define PD_SYSTEM 0x40 +#define MR_EVT_PD_STATE_CHANGE 0x0072 +#define MR_EVT_PD_REMOVED_EXT 0x00f8 +#define MR_EVT_PD_INSERTED_EXT 0x00f7 +#define MR_DCMD_PD_GET_INFO 0x02020000 +#define MRSAS_TBOLT_PD_LUN 1 +#define MRSAS_TBOLT_PD_TGT_MAX 255 +#define MRSAS_TBOLT_GET_PD_MAX(s) ((s)->mr_tbolt_pd_max) + +#endif + +/* Raid Context Flags */ +#define MR_RAID_CTX_RAID_FLAGS_IO_SUB_TYPE_SHIFT 0x4 +#define MR_RAID_CTX_RAID_FLAGS_IO_SUB_TYPE_MASK 0x30 +typedef enum MR_RAID_FLAGS_IO_SUB_TYPE { + MR_RAID_FLAGS_IO_SUB_TYPE_NONE = 0, + MR_RAID_FLAGS_IO_SUB_TYPE_SYSTEM_PD = 1 +} MR_RAID_FLAGS_IO_SUB_TYPE; + /* Dynamic Enumeration Flags */ -#define MRSAS_PD_LUN 1 #define MRSAS_LD_LUN 0 -#define MRSAS_PD_TGT_MAX 255 -#define MRSAS_GET_PD_MAX(s) ((s)->mr_pd_max) #define WWN_STRLEN 17 -#define APP_RESERVE_CMDS 32 +#define LD_SYNC_BIT 1 +#define LD_SYNC_SHIFT 14 +/* ThunderBolt (TB) specific */ +#define MRSAS_THUNDERBOLT_MSG_SIZE 256 +#define MRSAS_THUNDERBOLT_MAX_COMMANDS 1024 +#define MRSAS_THUNDERBOLT_MAX_REPLY_COUNT 1024 +#define MRSAS_THUNDERBOLT_REPLY_SIZE 8 +#define MRSAS_THUNDERBOLT_MAX_CHAIN_COUNT 1 + +#define MPI2_FUNCTION_PASSTHRU_IO_REQUEST 0xF0 +#define MPI2_FUNCTION_LD_IO_REQUEST 0xF1 + +#define MR_EVT_LD_FAST_PATH_IO_STATUS_CHANGED (0xFFFF) + +#define MR_INTERNAL_MFI_FRAMES_SMID 1 +#define MR_CTRL_EVENT_WAIT_SMID 2 +#define MR_INTERNAL_DRIVER_RESET_SMID 3 + + /* * ===================================== * MegaRAID SAS2.0 MFI firmware definitions @@ -103,19 +167,18 @@ extern "C" { /* * FW posts its state in upper 4 bits of outbound_msg_0 register */ -#define MFI_STATE_SHIFT 28 -#define MFI_STATE_MASK ((uint32_t)0xF<<MFI_STATE_SHIFT) -#define MFI_STATE_UNDEFINED ((uint32_t)0x0<<MFI_STATE_SHIFT) -#define MFI_STATE_BB_INIT ((uint32_t)0x1<<MFI_STATE_SHIFT) -#define MFI_STATE_FW_INIT ((uint32_t)0x4<<MFI_STATE_SHIFT) -#define MFI_STATE_WAIT_HANDSHAKE ((uint32_t)0x6<<MFI_STATE_SHIFT) -#define MFI_STATE_FW_INIT_2 ((uint32_t)0x7<<MFI_STATE_SHIFT) -#define MFI_STATE_DEVICE_SCAN ((uint32_t)0x8<<MFI_STATE_SHIFT) -#define MFI_STATE_BOOT_MESSAGE_PENDING ((uint32_t)0x9<<MFI_STATE_SHIFT) -#define MFI_STATE_FLUSH_CACHE ((uint32_t)0xA<<MFI_STATE_SHIFT) -#define MFI_STATE_READY ((uint32_t)0xB<<MFI_STATE_SHIFT) -#define MFI_STATE_OPERATIONAL ((uint32_t)0xC<<MFI_STATE_SHIFT) -#define MFI_STATE_FAULT ((uint32_t)0xF<<MFI_STATE_SHIFT) +#define MFI_STATE_MASK 0xF0000000 +#define MFI_STATE_UNDEFINED 0x00000000 +#define MFI_STATE_BB_INIT 0x10000000 +#define MFI_STATE_FW_INIT 0x40000000 +#define MFI_STATE_WAIT_HANDSHAKE 0x60000000 +#define MFI_STATE_FW_INIT_2 0x70000000 +#define MFI_STATE_DEVICE_SCAN 0x80000000 +#define MFI_STATE_BOOT_MESSAGE_PENDING 0x90000000 +#define MFI_STATE_FLUSH_CACHE 0xA0000000 +#define MFI_STATE_READY 0xB0000000 +#define MFI_STATE_OPERATIONAL 0xC0000000 +#define MFI_STATE_FAULT 0xF0000000 #define MRMFI_FRAME_SIZE 64 @@ -148,7 +211,7 @@ extern "C" { #define MFI_FRAME_DIR_WRITE 0x0008 #define MFI_FRAME_DIR_READ 0x0010 #define MFI_FRAME_DIR_BOTH 0x0018 -#define MFI_FRAME_IEEE 0x0020 +#define MFI_FRAME_IEEE 0x0020 /* * Definition for cmd_status @@ -182,12 +245,12 @@ extern "C" { #define MR_DCMD_CTRL_EVENT_GET 0x01040300 #define MR_DCMD_CTRL_EVENT_WAIT 0x01040500 #define 
MR_DCMD_LD_GET_PROPERTIES 0x03030000 -#define MR_DCMD_PD_GET_INFO 0x02020000 /* * Solaris Specific MAX values */ #define MAX_SGL 24 + /* * MFI command completion codes */ @@ -244,7 +307,6 @@ enum MFI_STAT { MFI_STAT_TIME_NOT_SET = 0x31, MFI_STAT_WRONG_STATE = 0x32, MFI_STAT_LD_OFFLINE = 0x33, - /* UNUSED: 0x34 to 0xfe */ MFI_STAT_INVALID_STATUS = 0xFF }; @@ -270,11 +332,34 @@ enum MR_EVT_LOCALE { MR_EVT_LOCALE_ALL = 0xffff }; +enum MR_EVT_ARGS { + MR_EVT_ARGS_NONE, + MR_EVT_ARGS_CDB_SENSE, + MR_EVT_ARGS_LD, + MR_EVT_ARGS_LD_COUNT, + MR_EVT_ARGS_LD_LBA, + MR_EVT_ARGS_LD_OWNER, + MR_EVT_ARGS_LD_LBA_PD_LBA, + MR_EVT_ARGS_LD_PROG, + MR_EVT_ARGS_LD_STATE, + MR_EVT_ARGS_LD_STRIP, + MR_EVT_ARGS_PD, + MR_EVT_ARGS_PD_ERR, + MR_EVT_ARGS_PD_LBA, + MR_EVT_ARGS_PD_LBA_LD, + MR_EVT_ARGS_PD_PROG, + MR_EVT_ARGS_PD_STATE, + MR_EVT_ARGS_PCI, + MR_EVT_ARGS_RATE, + MR_EVT_ARGS_STR, + MR_EVT_ARGS_TIME, + MR_EVT_ARGS_ECC +}; + #define MR_EVT_CFG_CLEARED 0x0004 #define MR_EVT_LD_CREATED 0x008a #define MR_EVT_LD_DELETED 0x008b -#define MR_EVT_PD_REMOVED_EXT 0x00f8 -#define MR_EVT_PD_INSERTED_EXT 0x00f7 +#define MR_EVT_CFG_FP_CHANGE 0x017B enum LD_STATE { LD_OFFLINE = 0, @@ -302,6 +387,7 @@ enum MRSAS_EVT { * @param dma_handle : dma handle * @param dma_cookie : scatter-gather list * @param dma_attr : dma attributes for this buffer + * * Our DMA object. The caller must initialize the size and dma attributes * (dma_attr) fields before allocating the resources. */ @@ -321,23 +407,26 @@ struct mrsas_eventinfo { int tgt; int lun; int event; + uint64_t wwn; }; struct mrsas_ld { dev_info_t *dip; uint8_t lun_type; - uint8_t reserved[3]; + uint8_t flag; + uint8_t reserved[2]; }; -struct mrsas_pd { + +#ifdef PDSUPPORT +struct mrsas_tbolt_pd { dev_info_t *dip; uint8_t lun_type; uint8_t dev_id; - uint8_t flags; + uint8_t flag; uint8_t reserved; }; - -struct mrsas_pd_info { +struct mrsas_tbolt_pd_info { uint16_t deviceId; uint16_t seqNum; uint8_t inquiryData[96]; @@ -363,6 +452,7 @@ struct mrsas_pd_info { uint8_t reserved2[16]; } pathInfo; }; +#endif typedef struct mrsas_instance { uint32_t *producer; @@ -372,6 +462,12 @@ typedef struct mrsas_instance { dma_obj_t mfi_internal_dma_obj; uint16_t adapterresetinprogress; uint16_t deadadapter; + /* ThunderBolt (TB) specific */ + dma_obj_t mpi2_frame_pool_dma_obj; + dma_obj_t request_desc_dma_obj; + dma_obj_t reply_desc_dma_obj; + dma_obj_t ld_map_obj[2]; + uint8_t init_id; uint8_t flag_ieee; uint8_t disable_online_ctrl_reset; @@ -382,11 +478,17 @@ typedef struct mrsas_instance { uint32_t max_sectors_per_req; struct mrsas_cmd **cmd_list; + mlist_t cmd_pool_list; kmutex_t cmd_pool_mtx; + kmutex_t sync_map_mtx; mlist_t app_cmd_pool_list; kmutex_t app_cmd_pool_mtx; + mlist_t cmd_app_pool_list; + kmutex_t cmd_app_pool_mtx; + + mlist_t cmd_pend_list; kmutex_t cmd_pend_mtx; @@ -407,6 +509,9 @@ typedef struct mrsas_instance { kcondvar_t abort_cmd_cv; kmutex_t abort_cmd_mtx; + kmutex_t reg_write_mtx; + kmutex_t chip_mtx; + dev_info_t *dip; ddi_acc_handle_t pci_handle; @@ -420,6 +525,7 @@ typedef struct mrsas_instance { ddi_iblock_cookie_t soft_iblock_cookie; ddi_softintr_t soft_intr_id; uint8_t softint_running; + uint8_t tbolt_softint_running; kmutex_t completed_pool_mtx; mlist_t completed_pool_list; @@ -436,23 +542,99 @@ typedef struct mrsas_instance { char iocnode[16]; int fm_capabilities; + /* + * Driver resources unroll flags. The flag is set for resources that + * are needed to be free'd at detach() time. + */ + struct _unroll { + uint8_t softs; /* The software state was allocated. 
*/ + uint8_t regs; /* Controller registers mapped. */ + uint8_t intr; /* Interrupt handler added. */ + uint8_t reqs; /* Request structs allocated. */ + uint8_t mutexs; /* Mutex's allocated. */ + uint8_t taskq; /* Task q's created. */ + uint8_t tran; /* Tran struct allocated */ + uint8_t tranSetup; /* Tran attached to the ddi. */ + uint8_t devctl; /* Device nodes for cfgadm created. */ + uint8_t scsictl; /* Device nodes for cfgadm created. */ + uint8_t ioctl; /* Device nodes for ioctl's created. */ + uint8_t timer; /* Timer started. */ + uint8_t aenPend; /* AEN cmd pending f/w. */ + uint8_t mapUpdate_pend; /* LD MAP update cmd pending f/w. */ + uint8_t soft_isr; + uint8_t ldlist_buff; + uint8_t pdlist_buff; + uint8_t syncCmd; + uint8_t verBuff; + uint8_t alloc_space_mfi; + uint8_t alloc_space_mpi2; + } unroll; + + + /* function template pointer */ + struct mrsas_function_template *func_ptr; + - struct mrsas_func_ptr *func_ptr; /* MSI interrupts specific */ - ddi_intr_handle_t *intr_htable; + ddi_intr_handle_t *intr_htable; /* Interrupt handle array */ + size_t intr_htable_size; /* Int. handle array size */ int intr_type; int intr_cnt; - size_t intr_size; uint_t intr_pri; int intr_cap; ddi_taskq_t *taskq; struct mrsas_ld *mr_ld_list; + kmutex_t config_dev_mtx; + /* ThunderBolt (TB) specific */ + ddi_softintr_t tbolt_soft_intr_id; + +#ifdef PDSUPPORT + uint32_t mr_tbolt_pd_max; + struct mrsas_tbolt_pd *mr_tbolt_pd_list; +#endif + + uint8_t fast_path_io; + + uint16_t tbolt; + uint16_t reply_read_index; + uint16_t reply_size; /* Single Reply struct size */ + uint16_t raid_io_msg_size; /* Single message size */ + uint32_t io_request_frames_phy; + uint8_t *io_request_frames; + /* Virtual address of request desc frame pool */ + MRSAS_REQUEST_DESCRIPTOR_UNION *request_message_pool; + /* Physical address of request desc frame pool */ + uint32_t request_message_pool_phy; + /* Virtual address of reply Frame */ + MPI2_REPLY_DESCRIPTORS_UNION *reply_frame_pool; + /* Physical address of reply Frame */ + uint32_t reply_frame_pool_phy; + uint8_t *reply_pool_limit; /* Last reply frame address */ + /* Physical address of Last reply frame */ + uint32_t reply_pool_limit_phy; + uint32_t reply_q_depth; /* Reply Queue Depth */ + uint8_t max_sge_in_main_msg; + uint8_t max_sge_in_chain; + uint8_t chain_offset_io_req; + uint8_t chain_offset_mpt_msg; + MR_FW_RAID_MAP_ALL *ld_map[2]; + uint32_t ld_map_phy[2]; + uint32_t size_map_info; + uint64_t map_id; + LD_LOAD_BALANCE_INFO load_balance_info[MAX_LOGICAL_DRIVES]; + struct mrsas_cmd *map_update_cmd; + uint32_t SyncRequired; kmutex_t ocr_flags_mtx; + dma_obj_t drv_ver_dma_obj; } mrsas_t; -struct mrsas_func_ptr { - int (*read_fw_status_reg)(struct mrsas_instance *); + +/* + * Function templates for various controller specific functions + */ +struct mrsas_function_template { + uint32_t (*read_fw_status_reg)(struct mrsas_instance *); void (*issue_cmd)(struct mrsas_cmd *, struct mrsas_instance *); int (*issue_cmd_in_sync_mode)(struct mrsas_instance *, struct mrsas_cmd *); @@ -461,6 +643,8 @@ struct mrsas_func_ptr { void (*enable_intr)(struct mrsas_instance *); void (*disable_intr)(struct mrsas_instance *); int (*intr_ack)(struct mrsas_instance *); + int (*init_adapter)(struct mrsas_instance *); +/* int (*reset_adapter)(struct mrsas_instance *); */ }; /* @@ -480,13 +664,11 @@ struct mrsas_func_ptr { * console messages debug levels */ #define CL_NONE 0 /* No debug information */ -#define CL_TEST_OCR 1 -#define CL_ANN 2 /* print unconditionally, announcements */ -#define 
CL_ANN1 3 /* No o/p */ -#define CL_DLEVEL1 4 /* debug level 1, informative */ -#define CL_DLEVEL2 5 /* debug level 2, verbose */ -#define CL_DLEVEL3 6 /* debug level 3, very verbose */ - +#define CL_ANN 1 /* print unconditionally, announcements */ +#define CL_ANN1 2 /* No o/p */ +#define CL_DLEVEL1 3 /* debug level 1, informative */ +#define CL_DLEVEL2 4 /* debug level 2, verbose */ +#define CL_DLEVEL3 5 /* debug level 3, very verbose */ #ifdef __SUNPRO_C #define __func__ "" @@ -547,9 +729,9 @@ struct mrsas_func_ptr { #define HIGH_LEVEL_INTR 1 #define NORMAL_LEVEL_INTR 0 +#define IO_TIMEOUT_VAL 0 #define IO_RETRY_COUNT 3 #define MAX_FW_RESET_COUNT 3 - /* * scsa_cmd - Per-command mr private data * @param cmd_dmahandle : dma handle @@ -598,13 +780,20 @@ struct scsa_cmd { struct mrsas_cmd { + /* + * ThunderBolt(TB) We would be needing to have a placeholder + * for RAID_MSG_IO_REQUEST inside this structure. We are + * supposed to embed the mr_frame inside the RAID_MSG and post + * it down to the firmware. + */ union mrsas_frame *frame; uint32_t frame_phys_addr; uint8_t *sense; + uint8_t *sense1; uint32_t sense_phys_addr; + uint32_t sense_phys_addr1; dma_obj_t frame_dma_obj; uint8_t frame_dma_obj_status; - uint32_t index; uint8_t sync_cmd; uint8_t cmd_status; @@ -613,8 +802,16 @@ struct mrsas_cmd { uint32_t frame_count; struct scsa_cmd *cmd; struct scsi_pkt *pkt; + Mpi2RaidSCSIIORequest_t *scsi_io_request; + Mpi2SGEIOUnion_t *sgl; + uint32_t sgl_phys_addr; + uint32_t scsi_io_request_phys_addr; + MRSAS_REQUEST_DESCRIPTOR_UNION *request_desc; + uint16_t SMID; uint16_t retry_count_for_ocr; uint16_t drv_pkt_time; + uint16_t load_balance_flag; + }; #define MAX_MGMT_ADAPTERS 1024 @@ -637,8 +834,8 @@ struct mrsas_mgmt_info { int max_index; }; -#pragma pack(1) +#pragma pack(1) /* * SAS controller properties */ @@ -662,6 +859,7 @@ struct mrsas_ctrl_prop { uint8_t cluster_enable; uint8_t coercion_mode; uint8_t alarm_enable; + uint8_t reserved_1[13]; uint32_t on_off_properties; uint8_t reserved_4[28]; @@ -867,12 +1065,15 @@ struct mrsas_ctrl_info { #define MRSAS_IOCTL_CMD 0 +#define MRDRV_TGT_VALID 1 + /* * FW can accept both 32 and 64 bit SGLs. 
We want to allocate 32/64 bit * SGLs based on the size of dma_addr_t */ #define IS_DMA64 (sizeof (dma_addr_t) == 8) +#define RESERVED0_REGISTER 0x00 /* XScale */ #define IB_MSG_0_OFF 0x10 /* XScale */ #define OB_MSG_0_OFF 0x18 /* XScale */ #define IB_DOORBELL_OFF 0x20 /* XScale & ROC */ @@ -883,13 +1084,18 @@ struct mrsas_ctrl_info { #define OB_SCRATCH_PAD_0_OFF 0xB0 /* ROC */ #define OB_INTR_MASK 0xFFFFFFFF #define OB_DOORBELL_CLEAR_MASK 0xFFFFFFFF -#define WRITE_SEQ_OFF 0x000000FC -#define HOST_DIAG_OFF 0x000000F8 -#define DIAG_RESET_ADAPTER 0x00000004 -#define DIAG_WRITE_ENABLE 0x00000080 -/* - * All MFI register set macros accept mrsas_register_set* - */ +#define SYSTOIOP_INTERRUPT_MASK 0x80000000 +#define OB_SCRATCH_PAD_2_OFF 0xB4 +#define WRITE_TBOLT_SEQ_OFF 0x00000004 +#define DIAG_TBOLT_RESET_ADAPTER 0x00000004 +#define HOST_TBOLT_DIAG_OFF 0x00000008 +#define RESET_TBOLT_STATUS_OFF 0x000003C3 +#define WRITE_SEQ_OFF 0x000000FC +#define HOST_DIAG_OFF 0x000000F8 +#define DIAG_RESET_ADAPTER 0x00000004 +#define DIAG_WRITE_ENABLE 0x00000080 +#define SYSTOIOP_INTERRUPT_MASK 0x80000000 + #define WR_IB_WRITE_SEQ(v, instance) ddi_put32((instance)->regmap_handle, \ (uint32_t *)((uintptr_t)(instance)->regmap + WRITE_SEQ_OFF), (v)) @@ -899,6 +1105,13 @@ struct mrsas_ctrl_info { #define WR_IB_DRWE(v, instance) ddi_put32((instance)->regmap_handle, \ (uint32_t *)((uintptr_t)(instance)->regmap + HOST_DIAG_OFF), (v)) +#define IB_LOW_QPORT 0xC0 +#define IB_HIGH_QPORT 0xC4 +#define OB_DOORBELL_REGISTER 0x9C /* 1078 implementation */ + +/* + * All MFI register set macros accept mrsas_register_set* + */ #define WR_IB_MSG_0(v, instance) ddi_put32((instance)->regmap_handle, \ (uint32_t *)((uintptr_t)(instance)->regmap + IB_MSG_0_OFF), (v)) @@ -933,6 +1146,56 @@ struct mrsas_ctrl_info { #define RD_OB_SCRATCH_PAD_0(instance) ddi_get32((instance)->regmap_handle, \ (uint32_t *)((uintptr_t)(instance)->regmap + OB_SCRATCH_PAD_0_OFF)) +/* Thunderbolt specific registers */ +#define RD_OB_SCRATCH_PAD_2(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_SCRATCH_PAD_2_OFF)) + +#define WR_TBOLT_IB_WRITE_SEQ(v, instance) \ + ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + WRITE_TBOLT_SEQ_OFF), (v)) + +#define RD_TBOLT_HOST_DIAG(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + HOST_TBOLT_DIAG_OFF)) + +#define WR_TBOLT_HOST_DIAG(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + HOST_TBOLT_DIAG_OFF), (v)) + +#define RD_TBOLT_RESET_STAT(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + RESET_TBOLT_STATUS_OFF)) + + +#define WR_MPI2_REPLY_POST_INDEX(v, instance)\ + ddi_put32((instance)->regmap_handle,\ + (uint32_t *)\ + ((uintptr_t)(instance)->regmap + MPI2_REPLY_POST_HOST_INDEX_OFFSET),\ + (v)) + + +#define RD_MPI2_REPLY_POST_INDEX(instance)\ + ddi_get32((instance)->regmap_handle,\ + (uint32_t *)\ + ((uintptr_t)(instance)->regmap + MPI2_REPLY_POST_HOST_INDEX_OFFSET)) + +#define WR_IB_LOW_QPORT(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_LOW_QPORT), (v)) + +#define WR_IB_HIGH_QPORT(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_HIGH_QPORT), (v)) + +#define WR_OB_DOORBELL_REGISTER_CLEAR(v, instance)\ + ddi_put32((instance)->regmap_handle,\ + (uint32_t *)((uintptr_t)(instance)->regmap + 
OB_DOORBELL_REGISTER), \ + (v)) + +#define WR_RESERVED0_REGISTER(v, instance) ddi_put32((instance)->regmap_handle,\ + (uint32_t *)((uintptr_t)(instance)->regmap + RESERVED0_REGISTER), \ + (v)) + +#define RD_RESERVED0_REGISTER(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + RESERVED0_REGISTER)) + + + /* * When FW is in MFI_STATE_READY or MFI_STATE_OPERATIONAL, the state data * of Outbound Msg Reg 0 indicates max concurrent cmds supported, max SGEs @@ -948,6 +1211,9 @@ struct mrsas_ctrl_info { #define MFI_REPLY_2108_MESSAGE_INTR 0x00000001 #define MFI_REPLY_2108_MESSAGE_INTR_MASK 0x00000005 +/* Fusion interrupt mask */ +#define MFI_FUSION_ENABLE_INTERRUPT_MASK (0x00000008) + #define MFI_POLL_TIMEOUT_SECS 60 #define MFI_ENABLE_INTR(instance) ddi_put32((instance)->regmap_handle, \ @@ -973,45 +1239,45 @@ struct mrsas_ctrl_info { * on_off_property of mrsas_ctrl_prop * bit0-9, 11-31 are reserved */ -#define DISABLE_OCR_PROP_FLAG 0x00000400 /* bit 10 */ +#define DISABLE_OCR_PROP_FLAG 0x00000400 /* bit 10 */ struct mrsas_register_set { - uint32_t reserved_0[4]; + uint32_t reserved_0[4]; /* 0000h */ - uint32_t inbound_msg_0; - uint32_t inbound_msg_1; - uint32_t outbound_msg_0; - uint32_t outbound_msg_1; + uint32_t inbound_msg_0; /* 0010h */ + uint32_t inbound_msg_1; /* 0014h */ + uint32_t outbound_msg_0; /* 0018h */ + uint32_t outbound_msg_1; /* 001Ch */ - uint32_t inbound_doorbell; - uint32_t inbound_intr_status; - uint32_t inbound_intr_mask; + uint32_t inbound_doorbell; /* 0020h */ + uint32_t inbound_intr_status; /* 0024h */ + uint32_t inbound_intr_mask; /* 0028h */ - uint32_t outbound_doorbell; - uint32_t outbound_intr_status; - uint32_t outbound_intr_mask; + uint32_t outbound_doorbell; /* 002Ch */ + uint32_t outbound_intr_status; /* 0030h */ + uint32_t outbound_intr_mask; /* 0034h */ - uint32_t reserved_1[2]; + uint32_t reserved_1[2]; /* 0038h */ - uint32_t inbound_queue_port; - uint32_t outbound_queue_port; + uint32_t inbound_queue_port; /* 0040h */ + uint32_t outbound_queue_port; /* 0044h */ - uint32_t reserved_2[22]; + uint32_t reserved_2[22]; /* 0048h */ - uint32_t outbound_doorbell_clear; + uint32_t outbound_doorbell_clear; /* 00A0h */ - uint32_t reserved_3[3]; + uint32_t reserved_3[3]; /* 00A4h */ - uint32_t outbound_scratch_pad; + uint32_t outbound_scratch_pad; /* 00B0h */ - uint32_t reserved_4[3]; + uint32_t reserved_4[3]; /* 00B4h */ - uint32_t inbound_low_queue_port; + uint32_t inbound_low_queue_port; /* 00C0h */ - uint32_t inbound_high_queue_port; + uint32_t inbound_high_queue_port; /* 00C4h */ - uint32_t reserved_5; - uint32_t index_registers[820]; + uint32_t reserved_5; /* 00C8h */ + uint32_t index_registers[820]; /* 00CCh */ }; struct mrsas_sge32 { @@ -1037,24 +1303,24 @@ union mrsas_sgl { }; struct mrsas_header { - uint8_t cmd; - uint8_t sense_len; - uint8_t cmd_status; - uint8_t scsi_status; - - uint8_t target_id; - uint8_t lun; - uint8_t cdb_len; - uint8_t sge_count; - - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; - - uint16_t flags; - uint16_t timeout; - uint32_t data_xferlen; + uint8_t cmd; /* 00h */ + uint8_t sense_len; /* 01h */ + uint8_t cmd_status; /* 02h */ + uint8_t scsi_status; /* 03h */ + + uint8_t target_id; /* 04h */ + uint8_t lun; /* 05h */ + uint8_t cdb_len; /* 06h */ + uint8_t sge_count; /* 07h */ + + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ + + uint16_t flags; /* 10h */ + uint16_t timeout; /* 12h */ + uint32_t 
data_xferlen; /* 14h */ }; union mrsas_sgl_frame { @@ -1063,198 +1329,199 @@ union mrsas_sgl_frame { }; struct mrsas_init_frame { - uint8_t cmd; - uint8_t reserved_0; - uint8_t cmd_status; - - uint8_t reserved_1; - uint32_t reserved_2; - - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; - - uint16_t flags; - uint16_t reserved_3; - uint32_t data_xfer_len; - - uint32_t queue_info_new_phys_addr_lo; - uint32_t queue_info_new_phys_addr_hi; - uint32_t queue_info_old_phys_addr_lo; - uint32_t queue_info_old_phys_addr_hi; - - uint32_t reserved_4[6]; + uint8_t cmd; /* 00h */ + uint8_t reserved_0; /* 01h */ + uint8_t cmd_status; /* 02h */ + + uint8_t reserved_1; /* 03h */ + uint32_t reserved_2; /* 04h */ + + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ + + uint16_t flags; /* 10h */ + uint16_t reserved_3; /* 12h */ + uint32_t data_xfer_len; /* 14h */ + + uint32_t queue_info_new_phys_addr_lo; /* 18h */ + uint32_t queue_info_new_phys_addr_hi; /* 1Ch */ + uint32_t queue_info_old_phys_addr_lo; /* 20h */ + uint32_t queue_info_old_phys_addr_hi; /* 24h */ + uint64_t driverversion; /* 28h */ + uint32_t reserved_4[4]; /* 30h */ }; struct mrsas_init_queue_info { - uint32_t init_flags; - uint32_t reply_queue_entries; - - uint32_t reply_queue_start_phys_addr_lo; - uint32_t reply_queue_start_phys_addr_hi; - uint32_t producer_index_phys_addr_lo; - uint32_t producer_index_phys_addr_hi; - uint32_t consumer_index_phys_addr_lo; - uint32_t consumer_index_phys_addr_hi; + uint32_t init_flags; /* 00h */ + uint32_t reply_queue_entries; /* 04h */ + + uint32_t reply_queue_start_phys_addr_lo; /* 08h */ + uint32_t reply_queue_start_phys_addr_hi; /* 0Ch */ + uint32_t producer_index_phys_addr_lo; /* 10h */ + uint32_t producer_index_phys_addr_hi; /* 14h */ + uint32_t consumer_index_phys_addr_lo; /* 18h */ + uint32_t consumer_index_phys_addr_hi; /* 1Ch */ }; struct mrsas_io_frame { - uint8_t cmd; - uint8_t sense_len; - uint8_t cmd_status; - uint8_t scsi_status; + uint8_t cmd; /* 00h */ + uint8_t sense_len; /* 01h */ + uint8_t cmd_status; /* 02h */ + uint8_t scsi_status; /* 03h */ - uint8_t target_id; - uint8_t access_byte; - uint8_t reserved_0; - uint8_t sge_count; + uint8_t target_id; /* 04h */ + uint8_t access_byte; /* 05h */ + uint8_t reserved_0; /* 06h */ + uint8_t sge_count; /* 07h */ - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ - uint16_t flags; - uint16_t timeout; - uint32_t lba_count; + uint16_t flags; /* 10h */ + uint16_t timeout; /* 12h */ + uint32_t lba_count; /* 14h */ - uint32_t sense_buf_phys_addr_lo; - uint32_t sense_buf_phys_addr_hi; + uint32_t sense_buf_phys_addr_lo; /* 18h */ + uint32_t sense_buf_phys_addr_hi; /* 1Ch */ - uint32_t start_lba_lo; - uint32_t start_lba_hi; + uint32_t start_lba_lo; /* 20h */ + uint32_t start_lba_hi; /* 24h */ - union mrsas_sgl sgl; + union mrsas_sgl sgl; /* 28h */ }; struct mrsas_pthru_frame { - uint8_t cmd; - uint8_t sense_len; - uint8_t cmd_status; - uint8_t scsi_status; - - uint8_t target_id; - uint8_t lun; - uint8_t cdb_len; - uint8_t sge_count; - - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; - - uint16_t flags; - uint16_t timeout; - uint32_t data_xfer_len; - - uint32_t sense_buf_phys_addr_lo; - uint32_t sense_buf_phys_addr_hi; - - uint8_t cdb[16]; - union mrsas_sgl sgl; + uint8_t cmd; /* 00h */ + uint8_t 
sense_len; /* 01h */ + uint8_t cmd_status; /* 02h */ + uint8_t scsi_status; /* 03h */ + + uint8_t target_id; /* 04h */ + uint8_t lun; /* 05h */ + uint8_t cdb_len; /* 06h */ + uint8_t sge_count; /* 07h */ + + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ + + uint16_t flags; /* 10h */ + uint16_t timeout; /* 12h */ + uint32_t data_xfer_len; /* 14h */ + + uint32_t sense_buf_phys_addr_lo; /* 18h */ + uint32_t sense_buf_phys_addr_hi; /* 1Ch */ + + uint8_t cdb[16]; /* 20h */ + union mrsas_sgl sgl; /* 30h */ }; struct mrsas_dcmd_frame { - uint8_t cmd; - uint8_t reserved_0; - uint8_t cmd_status; - uint8_t reserved_1[4]; - uint8_t sge_count; + uint8_t cmd; /* 00h */ + uint8_t reserved_0; /* 01h */ + uint8_t cmd_status; /* 02h */ + uint8_t reserved_1[4]; /* 03h */ + uint8_t sge_count; /* 07h */ - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ - uint16_t flags; - uint16_t timeout; + uint16_t flags; /* 10h */ + uint16_t timeout; /* 12h */ - uint32_t data_xfer_len; - uint32_t opcode; + uint32_t data_xfer_len; /* 14h */ + uint32_t opcode; /* 18h */ - union { + /* uint8_t mbox[DCMD_MBOX_SZ]; */ /* 1Ch */ + union { /* 1Ch */ uint8_t b[DCMD_MBOX_SZ]; uint16_t s[6]; uint32_t w[3]; } mbox; - union mrsas_sgl sgl; + union mrsas_sgl sgl; /* 28h */ }; struct mrsas_abort_frame { - uint8_t cmd; - uint8_t reserved_0; - uint8_t cmd_status; + uint8_t cmd; /* 00h */ + uint8_t reserved_0; /* 01h */ + uint8_t cmd_status; /* 02h */ - uint8_t reserved_1; - uint32_t reserved_2; + uint8_t reserved_1; /* 03h */ + uint32_t reserved_2; /* 04h */ - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ - uint16_t flags; - uint16_t reserved_3; - uint32_t reserved_4; + uint16_t flags; /* 10h */ + uint16_t reserved_3; /* 12h */ + uint32_t reserved_4; /* 14h */ - uint32_t abort_context; - uint32_t pad_1; + uint32_t abort_context; /* 18h */ + uint32_t pad_1; /* 1Ch */ - uint32_t abort_mfi_phys_addr_lo; - uint32_t abort_mfi_phys_addr_hi; + uint32_t abort_mfi_phys_addr_lo; /* 20h */ + uint32_t abort_mfi_phys_addr_hi; /* 24h */ - uint32_t reserved_5[6]; + uint32_t reserved_5[6]; /* 28h */ }; struct mrsas_smp_frame { - uint8_t cmd; - uint8_t reserved_1; - uint8_t cmd_status; - uint8_t connection_status; + uint8_t cmd; /* 00h */ + uint8_t reserved_1; /* 01h */ + uint8_t cmd_status; /* 02h */ + uint8_t connection_status; /* 03h */ - uint8_t reserved_2[3]; - uint8_t sge_count; + uint8_t reserved_2[3]; /* 04h */ + uint8_t sge_count; /* 07h */ - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ - uint16_t flags; - uint16_t timeout; + uint16_t flags; /* 10h */ + uint16_t timeout; /* 12h */ - uint32_t data_xfer_len; + uint32_t data_xfer_len; /* 14h */ - uint64_t sas_addr; + uint64_t sas_addr; /* 20h */ - union mrsas_sgl sgl[2]; + union mrsas_sgl sgl[2]; /* 28h */ }; struct mrsas_stp_frame { - uint8_t cmd; - uint8_t reserved_1; - uint8_t cmd_status; - uint8_t connection_status; + uint8_t cmd; /* 00h */ + uint8_t reserved_1; /* 01h */ + uint8_t cmd_status; /* 02h */ + uint8_t connection_status; /* 03h */ - uint8_t target_id; - uint8_t reserved_2[2]; - uint8_t 
sge_count; + uint8_t target_id; /* 04h */ + uint8_t reserved_2[2]; /* 04h */ + uint8_t sge_count; /* 07h */ - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ - uint16_t flags; - uint16_t timeout; + uint16_t flags; /* 10h */ + uint16_t timeout; /* 12h */ - uint32_t data_xfer_len; + uint32_t data_xfer_len; /* 14h */ - uint16_t fis[10]; - uint32_t stp_flags; - union mrsas_sgl sgl; + uint16_t fis[10]; /* 28h */ + uint32_t stp_flags; /* 3C */ + union mrsas_sgl sgl; /* 40 */ }; union mrsas_frame { @@ -1681,144 +1948,111 @@ struct mrsas_aen { uint32_t seq_num; uint32_t class_locale_word; }; + #pragma pack() #ifndef DDI_VENDOR_LSI #define DDI_VENDOR_LSI "LSI" #endif /* DDI_VENDOR_LSI */ -#ifndef KMDB_MODULE -static int mrsas_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); -static int mrsas_attach(dev_info_t *, ddi_attach_cmd_t); -#ifdef __sparc -static int mrsas_reset(dev_info_t *, ddi_reset_cmd_t); -#else /* __sparc */ -static int mrsas_quiesce(dev_info_t *); -#endif /* __sparc */ -static int mrsas_detach(dev_info_t *, ddi_detach_cmd_t); -static int mrsas_open(dev_t *, int, int, cred_t *); -static int mrsas_close(dev_t, int, int, cred_t *); -static int mrsas_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); - -static int mrsas_tran_tgt_init(dev_info_t *, dev_info_t *, - scsi_hba_tran_t *, struct scsi_device *); -static struct scsi_pkt *mrsas_tran_init_pkt(struct scsi_address *, register +int mrsas_config_scsi_device(struct mrsas_instance *, + struct scsi_device *, dev_info_t **); + +#ifdef PDSUPPORT +int mrsas_tbolt_config_pd(struct mrsas_instance *, uint16_t, + uint8_t, dev_info_t **); +#endif + +dev_info_t *mrsas_find_child(struct mrsas_instance *, uint16_t, + uint8_t); +int mrsas_service_evt(struct mrsas_instance *, int, int, int, + uint64_t); +void return_raid_msg_pkt(struct mrsas_instance *, struct mrsas_cmd *); +struct mrsas_cmd *get_raid_msg_mfi_pkt(struct mrsas_instance *); +void return_raid_msg_mfi_pkt(struct mrsas_instance *, struct mrsas_cmd *); + +int alloc_space_for_mpi2(struct mrsas_instance *); +void fill_up_drv_ver(struct mrsas_drv_ver *dv); + +int mrsas_issue_init_mpi2(struct mrsas_instance *); +struct scsi_pkt *mrsas_tbolt_tran_init_pkt(struct scsi_address *, register struct scsi_pkt *, struct buf *, int, int, int, int, int (*)(), caddr_t); -static int mrsas_tran_start(struct scsi_address *, +int mrsas_tbolt_tran_start(struct scsi_address *, register struct scsi_pkt *); -static int mrsas_tran_abort(struct scsi_address *, struct scsi_pkt *); -static int mrsas_tran_reset(struct scsi_address *, int); -static int mrsas_tran_getcap(struct scsi_address *, char *, int); -static int mrsas_tran_setcap(struct scsi_address *, char *, int, int); -static void mrsas_tran_destroy_pkt(struct scsi_address *, - struct scsi_pkt *); -static void mrsas_tran_dmafree(struct scsi_address *, struct scsi_pkt *); -static void mrsas_tran_sync_pkt(struct scsi_address *, struct scsi_pkt *); -static uint_t mrsas_isr(); -static uint_t mrsas_softintr(); - -static int init_mfi(struct mrsas_instance *); -static int mrsas_free_dma_obj(struct mrsas_instance *, dma_obj_t); -static int mrsas_alloc_dma_obj(struct mrsas_instance *, dma_obj_t *, - uchar_t); -static struct mrsas_cmd *get_mfi_pkt(struct mrsas_instance *); -static void return_mfi_pkt(struct mrsas_instance *, +uint32_t tbolt_read_fw_status_reg(struct mrsas_instance *); +void tbolt_issue_cmd(struct mrsas_cmd 
*, struct mrsas_instance *); +int tbolt_issue_cmd_in_poll_mode(struct mrsas_instance *, struct mrsas_cmd *); - -static void free_space_for_mfi(struct mrsas_instance *); -static void free_additional_dma_buffer(struct mrsas_instance *); -static int alloc_additional_dma_buffer(struct mrsas_instance *); -static int read_fw_status_reg_ppc(struct mrsas_instance *); -static void issue_cmd_ppc(struct mrsas_cmd *, struct mrsas_instance *); -static int issue_cmd_in_poll_mode_ppc(struct mrsas_instance *, - struct mrsas_cmd *); -static int issue_cmd_in_sync_mode_ppc(struct mrsas_instance *, +int tbolt_issue_cmd_in_sync_mode(struct mrsas_instance *, struct mrsas_cmd *); -static void enable_intr_ppc(struct mrsas_instance *); -static void disable_intr_ppc(struct mrsas_instance *); -static int intr_ack_ppc(struct mrsas_instance *); -static int mfi_state_transition_to_ready(struct mrsas_instance *); -static void destroy_mfi_frame_pool(struct mrsas_instance *); -static int create_mfi_frame_pool(struct mrsas_instance *); -static int mrsas_dma_alloc(struct mrsas_instance *, struct scsi_pkt *, +void tbolt_enable_intr(struct mrsas_instance *); +void tbolt_disable_intr(struct mrsas_instance *); +int tbolt_intr_ack(struct mrsas_instance *); +uint_t mr_sas_tbolt_process_outstanding_cmd(struct mrsas_instance *); + uint_t tbolt_softintr(); +int mrsas_tbolt_dma(struct mrsas_instance *, uint32_t, int, int (*)()); +int mrsas_check_dma_handle(ddi_dma_handle_t handle); +int mrsas_check_acc_handle(ddi_acc_handle_t handle); +int mrsas_dma_alloc(struct mrsas_instance *, struct scsi_pkt *, struct buf *, int, int (*)()); -static int mrsas_dma_move(struct mrsas_instance *, +int mrsas_dma_move(struct mrsas_instance *, struct scsi_pkt *, struct buf *); -static void flush_cache(struct mrsas_instance *instance); -static void display_scsi_inquiry(caddr_t); -static int start_mfi_aen(struct mrsas_instance *instance); -static int handle_drv_ioctl(struct mrsas_instance *instance, - struct mrsas_ioctl *ioctl, int mode); -static int handle_mfi_ioctl(struct mrsas_instance *instance, - struct mrsas_ioctl *ioctl, int mode); -static int handle_mfi_aen(struct mrsas_instance *instance, - struct mrsas_aen *aen); -static void fill_up_drv_ver(struct mrsas_drv_ver *dv); -static struct mrsas_cmd *build_cmd(struct mrsas_instance *instance, - struct scsi_address *ap, struct scsi_pkt *pkt, - uchar_t *cmd_done); -#ifndef __sparc -static int wait_for_outstanding(struct mrsas_instance *instance); -#endif /* __sparc */ -static int register_mfi_aen(struct mrsas_instance *instance, - uint32_t seq_num, uint32_t class_locale_word); -static int issue_mfi_pthru(struct mrsas_instance *instance, struct - mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); -static int issue_mfi_dcmd(struct mrsas_instance *instance, struct - mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); -static int issue_mfi_smp(struct mrsas_instance *instance, struct - mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); -static int issue_mfi_stp(struct mrsas_instance *instance, struct - mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); -static int abort_aen_cmd(struct mrsas_instance *instance, - struct mrsas_cmd *cmd_to_abort); - -static int mrsas_common_check(struct mrsas_instance *instance, - struct mrsas_cmd *cmd); -static void mrsas_fm_init(struct mrsas_instance *instance); -static void mrsas_fm_fini(struct mrsas_instance *instance); -static int mrsas_fm_error_cb(dev_info_t *, ddi_fm_error_t *, - const void *); -static void mrsas_fm_ereport(struct mrsas_instance *instance, - 
char *detail); -static int mrsas_check_dma_handle(ddi_dma_handle_t handle); -static int mrsas_check_acc_handle(ddi_acc_handle_t handle); - -static void mrsas_rem_intrs(struct mrsas_instance *instance); -static int mrsas_add_intrs(struct mrsas_instance *instance, int intr_type); - -static void mrsas_tran_tgt_free(dev_info_t *, dev_info_t *, - scsi_hba_tran_t *, struct scsi_device *); -static int mrsas_tran_bus_config(dev_info_t *, uint_t, - ddi_bus_config_op_t, void *, dev_info_t **); -static int mrsas_parse_devname(char *, int *, int *); -static int mrsas_config_all_devices(struct mrsas_instance *); -static int mrsas_config_scsi_device(struct mrsas_instance *, - struct scsi_device *, dev_info_t **); -static int mrsas_config_ld(struct mrsas_instance *, uint16_t, - uint8_t, dev_info_t **); -static dev_info_t *mrsas_find_child(struct mrsas_instance *, uint16_t, - uint8_t); -static int mrsas_name_node(dev_info_t *, char *, int); -static void mrsas_issue_evt_taskq(struct mrsas_eventinfo *); -static int mrsas_service_evt(struct mrsas_instance *, int, int, int, - uint64_t); -static int mrsas_mode_sense_build(struct scsi_pkt *); -static void push_pending_mfi_pkt(struct mrsas_instance *, +int mrsas_alloc_dma_obj(struct mrsas_instance *, dma_obj_t *, + uchar_t); +void mr_sas_tbolt_build_mfi_cmd(struct mrsas_instance *, struct mrsas_cmd *); +int mrsas_dma_alloc_dmd(struct mrsas_instance *, dma_obj_t *); +void tbolt_complete_cmd_in_sync_mode(struct mrsas_instance *, + struct mrsas_cmd *); +int alloc_req_rep_desc(struct mrsas_instance *); +int mrsas_mode_sense_build(struct scsi_pkt *); +void push_pending_mfi_pkt(struct mrsas_instance *, struct mrsas_cmd *); -static int mrsas_issue_init_mfi(struct mrsas_instance *); -static int mrsas_issue_pending_cmds(struct mrsas_instance *); -static int mrsas_print_pending_cmds(struct mrsas_instance *); -static int mrsas_complete_pending_cmds(struct mrsas_instance *); -static int mrsas_reset_ppc(struct mrsas_instance *); -static uint32_t mrsas_initiate_ocr_if_fw_is_faulty(struct mrsas_instance *); -static int mrsas_kill_adapter(struct mrsas_instance *); -static void io_timeout_checker(void *instance); -static void complete_cmd_in_sync_mode(struct mrsas_instance *, - struct mrsas_cmd *); - -#endif /* KMDB_MODULE */ +int mrsas_issue_pending_cmds(struct mrsas_instance *); +int mrsas_print_pending_cmds(struct mrsas_instance *); +int mrsas_complete_pending_cmds(struct mrsas_instance *); + +int create_mfi_frame_pool(struct mrsas_instance *); +void destroy_mfi_frame_pool(struct mrsas_instance *); +int create_mfi_mpi_frame_pool(struct mrsas_instance *); +void destroy_mfi_mpi_frame_pool(struct mrsas_instance *); +int create_mpi2_frame_pool(struct mrsas_instance *); +void destroy_mpi2_frame_pool(struct mrsas_instance *); +int mrsas_free_dma_obj(struct mrsas_instance *, dma_obj_t); +void mrsas_tbolt_free_additional_dma_buffer(struct mrsas_instance *); +void free_req_desc_pool(struct mrsas_instance *); +void free_space_for_mpi2(struct mrsas_instance *); +void mrsas_dump_reply_desc(struct mrsas_instance *); +void tbolt_complete_cmd(struct mrsas_instance *, struct mrsas_cmd *); +void display_scsi_inquiry(caddr_t); +void service_mfi_aen(struct mrsas_instance *, struct mrsas_cmd *); +int mrsas_mode_sense_build(struct scsi_pkt *); +int mrsas_tbolt_get_ld_map_info(struct mrsas_instance *); +struct mrsas_cmd *mrsas_tbolt_build_poll_cmd(struct mrsas_instance *, + struct scsi_address *, struct scsi_pkt *, uchar_t *); +int mrsas_tbolt_reset_ppc(struct mrsas_instance *instance); +void 
mrsas_tbolt_kill_adapter(struct mrsas_instance *instance); +int abort_syncmap_cmd(struct mrsas_instance *, struct mrsas_cmd *); +void mrsas_tbolt_prepare_cdb(struct mrsas_instance *instance, U8 cdb[], + struct IO_REQUEST_INFO *, Mpi2RaidSCSIIORequest_t *, U32); + + +int mrsas_init_adapter_ppc(struct mrsas_instance *instance); +int mrsas_init_adapter_tbolt(struct mrsas_instance *instance); +int mrsas_init_adapter(struct mrsas_instance *instance); + +int mrsas_alloc_cmd_pool(struct mrsas_instance *instance); +void mrsas_free_cmd_pool(struct mrsas_instance *instance); + +void mrsas_print_cmd_details(struct mrsas_instance *, struct mrsas_cmd *, int); +struct mrsas_cmd *get_raid_msg_pkt(struct mrsas_instance *); + +int mfi_state_transition_to_ready(struct mrsas_instance *); + + +/* FMA functions. */ +int mrsas_common_check(struct mrsas_instance *, struct mrsas_cmd *); +void mrsas_fm_ereport(struct mrsas_instance *, char *); #ifdef __cplusplus diff --git a/usr/src/uts/common/io/mr_sas/mr_sas_list.c b/usr/src/uts/common/io/mr_sas/mr_sas_list.c new file mode 100644 index 0000000000..62ae374b76 --- /dev/null +++ b/usr/src/uts/common/io/mr_sas/mr_sas_list.c @@ -0,0 +1,134 @@ +/* + * mr_sas_list.h: header for mr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-20012, LSI Logic Corporation. + * All rights reserved. + */ + +/* Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ + +/* + * Extract C functions from LSI-provided mr_sas_list.h such that we can both + * be lint-clean and provide a slightly better source organizational model + * beyond preprocessor abuse. + */ + +#include "mr_sas_list.h" + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void +__list_add(struct mlist_head *new, struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/* + * mlist_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +void +mlist_add(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head, head->next); +} + +/* + * mlist_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +void +mlist_add_tail(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void +__list_del(struct mlist_head *prev, struct mlist_head *next) +{ + next->prev = prev; + prev->next = next; +} + +#if 0 +/* + * mlist_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry + * is in an undefined state. + */ + +void +mlist_del(struct mlist_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = entry->prev = 0; +} +#endif + +/* + * mlist_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. 
+ */ +void +mlist_del_init(struct mlist_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/* + * mlist_empty - tests whether a list is empty + * @head: the list to test. + */ +int +mlist_empty(struct mlist_head *head) +{ + return (head->next == head); +} + +/* + * mlist_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +void +mlist_splice(struct mlist_head *list, struct mlist_head *head) +{ + struct mlist_head *first = list->next; + + if (first != list) { + struct mlist_head *last = list->prev; + struct mlist_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; + } +} diff --git a/usr/src/uts/common/io/mr_sas/mr_sas_list.h b/usr/src/uts/common/io/mr_sas/mr_sas_list.h index 0c177712e0..9bd9947038 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas_list.h +++ b/usr/src/uts/common/io/mr_sas/mr_sas_list.h @@ -2,40 +2,8 @@ * mr_sas_list.h: header for mr_sas * * Solaris MegaRAID driver for SAS2.0 controllers - * Copyright (c) 2008-2009, LSI Logic Corporation. + * Copyright (c) 2008-2012, LSI Logic Corporation. * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the author nor the names of its contributors may be - * used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH - * DAMAGE. - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. */ #ifndef _MR_SAS_LIST_H_ @@ -70,110 +38,22 @@ typedef struct mlist_head mlist_t; (ptr)->next = (ptr); (ptr)->prev = (ptr); \ } -#ifndef KMDB_MODULE -/* - * Insert a new entry between two known consecutive entries. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! - */ -static void __list_add(struct mlist_head *new, - struct mlist_head *prev, - struct mlist_head *next) -{ - next->prev = new; - new->next = next; - new->prev = prev; - prev->next = new; -} - - -/* - * mlist_add - add a new entry - * @new: new entry to be added - * @head: list head to add it after - * - * Insert a new entry after the specified head. - * This is good for implementing stacks. 
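The mlist_* routines above implement a Linux-style circular doubly-linked list that the driver uses for its command pools. The following standalone sketch, plain userland C with invented names (node_init, node_add_tail) rather than driver code, shows the core idea: an empty list is a head whose next and prev point at itself, so insertion and the empty test need no NULL checks.

#include <stdio.h>

struct node {
	struct node *next, *prev;
};

static void
node_init(struct node *h)
{
	h->next = h;	/* empty list: head points at itself */
	h->prev = h;
}

static void
node_add_tail(struct node *ent, struct node *head)
{
	/* splice ent in just before head, i.e. at the tail */
	ent->prev = head->prev;
	ent->next = head;
	head->prev->next = ent;
	head->prev = ent;
}

static int
node_empty(struct node *head)
{
	return (head->next == head);
}

int
main(void)
{
	struct node pool, a, b;

	node_init(&pool);
	printf("empty: %d\n", node_empty(&pool));	/* 1 */
	node_add_tail(&a, &pool);
	node_add_tail(&b, &pool);
	printf("empty: %d\n", node_empty(&pool));	/* 0 */
	return (0);
}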
- */ -static void mlist_add(struct mlist_head *new, struct mlist_head *head) -{ - __list_add(new, head, head->next); -} - - -/* - * mlist_add_tail - add a new entry - * @new: new entry to be added - * @head: list head to add it before - * - * Insert a new entry before the specified head. - * This is useful for implementing queues. - */ -static void mlist_add_tail(struct mlist_head *new, struct mlist_head *head) -{ - __list_add(new, head->prev, head); -} - +void mlist_add(struct mlist_head *, struct mlist_head *); +void mlist_add_tail(struct mlist_head *, struct mlist_head *); +#if 0 +void mlist_del(struct mlist_head *); +#endif +void mlist_del_init(struct mlist_head *); +int mlist_empty(struct mlist_head *); +void mlist_splice(struct mlist_head *, struct mlist_head *); + +/* TODO: set this */ +#if 0 +#pragma inline(list_add, list_add_tail, __list_del, list_del, + list_del_init, list_empty, list_splice) +#endif -/* - * Delete a list entry by making the prev/next entries - * point to each other. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! - */ -static void __list_del(struct mlist_head *prev, - struct mlist_head *next) -{ - next->prev = prev; - prev->next = next; -} - - -/* - * mlist_del_init - deletes entry from list and reinitialize it. - * @entry: the element to delete from the list. - */ -static void mlist_del_init(struct mlist_head *entry) -{ - __list_del(entry->prev, entry->next); - INIT_LIST_HEAD(entry); -} - - -/* - * mlist_empty - tests whether a list is empty - * @head: the list to test. - */ -static int mlist_empty(struct mlist_head *head) -{ - return (head->next == head); -} - - -/* - * mlist_splice - join two lists - * @list: the new list to add. - * @head: the place to add it in the first list. - */ -static void mlist_splice(struct mlist_head *list, struct mlist_head *head) -{ - struct mlist_head *first = list->next; - - if (first != list) { - struct mlist_head *last = list->prev; - struct mlist_head *at = head->next; - - first->prev = head; - head->next = first; - - last->next = at; - at->prev = last; - } -} -#endif /* KMDB_MODULE */ /* * mlist_entry - get the struct for this entry diff --git a/usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c b/usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c new file mode 100644 index 0000000000..e4c89c4cf6 --- /dev/null +++ b/usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c @@ -0,0 +1,3793 @@ +/* + * mr_sas_tbolt.c: source for mr_sas driver for New Generation. + * i.e. Thunderbolt and Invader + * + * Solaris MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2012, LSI Logic Corporation. + * All rights reserved. + * + * Version: + * Author: + * Swaminathan K S + * Arun Chandrashekhar + * Manju R + * Rasheed + * Shakeel Bukhari + */ + + +#include <sys/types.h> +#include <sys/file.h> +#include <sys/atomic.h> +#include <sys/scsi/scsi.h> +#include <sys/byteorder.h> +#include "ld_pd_map.h" +#include "mr_sas.h" +#include "fusion.h" + +/* + * FMA header files + */ +#include <sys/ddifm.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/fm/io/ddi.h> + + +/* Pre-TB command size and TB command size. 
*/ +#define MR_COMMAND_SIZE (64*20) /* 1280 bytes */ +MR_LD_RAID *MR_LdRaidGet(U32 ld, MR_FW_RAID_MAP_ALL *map); +U16 MR_TargetIdToLdGet(U32 ldTgtId, MR_FW_RAID_MAP_ALL *map); +U16 MR_GetLDTgtId(U32 ld, MR_FW_RAID_MAP_ALL *map); +U16 get_updated_dev_handle(PLD_LOAD_BALANCE_INFO, struct IO_REQUEST_INFO *); +extern ddi_dma_attr_t mrsas_generic_dma_attr; +extern uint32_t mrsas_tbolt_max_cap_maxxfer; +extern struct ddi_device_acc_attr endian_attr; +extern int debug_level_g; +extern unsigned int enable_fp; +volatile int dump_io_wait_time = 90; +extern void +io_timeout_checker(void *arg); +extern volatile int debug_timeout_g; +extern int mrsas_issue_pending_cmds(struct mrsas_instance *); +extern int mrsas_complete_pending_cmds(struct mrsas_instance *instance); +extern void push_pending_mfi_pkt(struct mrsas_instance *, + struct mrsas_cmd *); +extern U8 MR_BuildRaidContext(struct mrsas_instance *, struct IO_REQUEST_INFO *, + MPI2_SCSI_IO_VENDOR_UNIQUE *, MR_FW_RAID_MAP_ALL *); + +/* Local static prototypes. */ +static struct mrsas_cmd *mrsas_tbolt_build_cmd(struct mrsas_instance *, + struct scsi_address *, struct scsi_pkt *, uchar_t *); +static void mrsas_tbolt_set_pd_lba(U8 cdb[], uint8_t *cdb_len_ptr, + U64 start_blk, U32 num_blocks); +static int mrsas_tbolt_check_map_info(struct mrsas_instance *); +static int mrsas_tbolt_sync_map_info(struct mrsas_instance *); +static int mrsas_tbolt_prepare_pkt(struct scsa_cmd *); +static int mrsas_tbolt_ioc_init(struct mrsas_instance *, dma_obj_t *); +#ifdef PDSUPPORT +static void mrsas_tbolt_get_pd_info(struct mrsas_instance *, + struct mrsas_tbolt_pd_info *, int); +#endif /* PDSUPPORT */ + +static int debug_tbolt_fw_faults_after_ocr_g = 0; + +/* + * destroy_mfi_mpi_frame_pool + */ +void +destroy_mfi_mpi_frame_pool(struct mrsas_instance *instance) +{ + int i; + + struct mrsas_cmd *cmd; + + /* return all mfi frames to pool */ + for (i = 0; i < MRSAS_APP_RESERVED_CMDS; i++) { + cmd = instance->cmd_list[i]; + if (cmd->frame_dma_obj_status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + cmd->frame_dma_obj); + } + cmd->frame_dma_obj_status = DMA_OBJ_FREED; + } +} + +/* + * destroy_mpi2_frame_pool + */ +void +destroy_mpi2_frame_pool(struct mrsas_instance *instance) +{ + + if (instance->mpi2_frame_pool_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->mpi2_frame_pool_dma_obj); + instance->mpi2_frame_pool_dma_obj.status |= DMA_OBJ_FREED; + } +} + + +/* + * mrsas_tbolt_free_additional_dma_buffer + */ +void +mrsas_tbolt_free_additional_dma_buffer(struct mrsas_instance *instance) +{ + int i; + + if (instance->mfi_internal_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->mfi_internal_dma_obj); + instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED; + } + if (instance->mfi_evt_detail_obj.status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->mfi_evt_detail_obj); + instance->mfi_evt_detail_obj.status = DMA_OBJ_FREED; + } + + for (i = 0; i < 2; i++) { + if (instance->ld_map_obj[i].status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->ld_map_obj[i]); + instance->ld_map_obj[i].status = DMA_OBJ_FREED; + } + } +} + + +/* + * free_req_desc_pool + */ +void +free_req_rep_desc_pool(struct mrsas_instance *instance) +{ + if (instance->request_desc_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->request_desc_dma_obj); + instance->request_desc_dma_obj.status = DMA_OBJ_FREED; + } + + if 
(instance->reply_desc_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->reply_desc_dma_obj); + instance->reply_desc_dma_obj.status = DMA_OBJ_FREED; + } + + +} + + +/* + * ThunderBolt(TB) Request Message Frame Pool + */ +int +create_mpi2_frame_pool(struct mrsas_instance *instance) +{ + int i = 0; + uint16_t max_cmd; + uint32_t sgl_sz; + uint32_t raid_msg_size; + uint32_t total_size; + uint32_t offset; + uint32_t io_req_base_phys; + uint8_t *io_req_base; + struct mrsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + sgl_sz = 1024; + raid_msg_size = MRSAS_THUNDERBOLT_MSG_SIZE; + + /* Allocating additional 256 bytes to accomodate SMID 0. */ + total_size = MRSAS_THUNDERBOLT_MSG_SIZE + (max_cmd * raid_msg_size) + + (max_cmd * sgl_sz) + (max_cmd * SENSE_LENGTH); + + con_log(CL_ANN1, (CE_NOTE, "create_mpi2_frame_pool: " + "max_cmd %x ", max_cmd)); + + con_log(CL_DLEVEL3, (CE_NOTE, "create_mpi2_frame_pool: " + "request message frame pool size %x", total_size)); + + /* + * ThunderBolt(TB) We need to create a single chunk of DMA'ble memory + * and then split the memory to 1024 commands. Each command should be + * able to contain a RAID MESSAGE FRAME which will embed a MFI_FRAME + * within it. Further refer the "alloc_req_rep_desc" function where + * we allocate request/reply descriptors queues for a clue. + */ + + instance->mpi2_frame_pool_dma_obj.size = total_size; + instance->mpi2_frame_pool_dma_obj.dma_attr = mrsas_generic_dma_attr; + instance->mpi2_frame_pool_dma_obj.dma_attr.dma_attr_addr_hi = + 0xFFFFFFFFU; + instance->mpi2_frame_pool_dma_obj.dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->mpi2_frame_pool_dma_obj.dma_attr.dma_attr_sgllen = 1; + instance->mpi2_frame_pool_dma_obj.dma_attr.dma_attr_align = 256; + + if (mrsas_alloc_dma_obj(instance, &instance->mpi2_frame_pool_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, + "mr_sas: could not alloc mpi2 frame pool"); + return (DDI_FAILURE); + } + + bzero(instance->mpi2_frame_pool_dma_obj.buffer, total_size); + instance->mpi2_frame_pool_dma_obj.status |= DMA_OBJ_ALLOCATED; + + instance->io_request_frames = + (uint8_t *)instance->mpi2_frame_pool_dma_obj.buffer; + instance->io_request_frames_phy = + (uint32_t) + instance->mpi2_frame_pool_dma_obj.dma_cookie[0].dmac_address; + + con_log(CL_DLEVEL3, (CE_NOTE, "io_request_frames 0x%p", + (void *)instance->io_request_frames)); + + con_log(CL_DLEVEL3, (CE_NOTE, "io_request_frames_phy 0x%x", + instance->io_request_frames_phy)); + + io_req_base = (uint8_t *)instance->io_request_frames + + MRSAS_THUNDERBOLT_MSG_SIZE; + io_req_base_phys = instance->io_request_frames_phy + + MRSAS_THUNDERBOLT_MSG_SIZE; + + con_log(CL_DLEVEL3, (CE_NOTE, + "io req_base_phys 0x%x", io_req_base_phys)); + + for (i = 0; i < max_cmd; i++) { + cmd = instance->cmd_list[i]; + + offset = i * MRSAS_THUNDERBOLT_MSG_SIZE; + + cmd->scsi_io_request = (Mpi2RaidSCSIIORequest_t *) + ((uint8_t *)io_req_base + offset); + cmd->scsi_io_request_phys_addr = io_req_base_phys + offset; + + cmd->sgl = (Mpi2SGEIOUnion_t *)((uint8_t *)io_req_base + + (max_cmd * raid_msg_size) + i * sgl_sz); + + cmd->sgl_phys_addr = (io_req_base_phys + + (max_cmd * raid_msg_size) + i * sgl_sz); + + cmd->sense1 = (uint8_t *)((uint8_t *)io_req_base + + (max_cmd * raid_msg_size) + (max_cmd * sgl_sz) + + (i * SENSE_LENGTH)); + + cmd->sense_phys_addr1 = (io_req_base_phys + + (max_cmd * raid_msg_size) + (max_cmd * sgl_sz) + + (i * SENSE_LENGTH)); + + + cmd->SMID = i + 1; + + con_log(CL_DLEVEL3, (CE_NOTE, 
"Frame Pool Addr [%x]0x%p", + cmd->index, (void *)cmd->scsi_io_request)); + + con_log(CL_DLEVEL3, (CE_NOTE, "Frame Pool Phys Addr [%x]0x%x", + cmd->index, cmd->scsi_io_request_phys_addr)); + + con_log(CL_DLEVEL3, (CE_NOTE, "Sense Addr [%x]0x%p", + cmd->index, (void *)cmd->sense1)); + + con_log(CL_DLEVEL3, (CE_NOTE, "Sense Addr Phys [%x]0x%x", + cmd->index, cmd->sense_phys_addr1)); + + con_log(CL_DLEVEL3, (CE_NOTE, "Sgl bufffers [%x]0x%p", + cmd->index, (void *)cmd->sgl)); + + con_log(CL_DLEVEL3, (CE_NOTE, "Sgl bufffers phys [%x]0x%x", + cmd->index, cmd->sgl_phys_addr)); + } + + return (DDI_SUCCESS); + +} + + +/* + * alloc_additional_dma_buffer for AEN + */ +int +mrsas_tbolt_alloc_additional_dma_buffer(struct mrsas_instance *instance) +{ + uint32_t internal_buf_size = PAGESIZE*2; + int i; + + /* Initialize buffer status as free */ + instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED; + instance->mfi_evt_detail_obj.status = DMA_OBJ_FREED; + instance->ld_map_obj[0].status = DMA_OBJ_FREED; + instance->ld_map_obj[1].status = DMA_OBJ_FREED; + + + instance->mfi_internal_dma_obj.size = internal_buf_size; + instance->mfi_internal_dma_obj.dma_attr = mrsas_generic_dma_attr; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_sgllen = 1; + + if (mrsas_alloc_dma_obj(instance, &instance->mfi_internal_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, + "mr_sas: could not alloc reply queue"); + return (DDI_FAILURE); + } + + bzero(instance->mfi_internal_dma_obj.buffer, internal_buf_size); + + instance->mfi_internal_dma_obj.status |= DMA_OBJ_ALLOCATED; + instance->internal_buf = + (caddr_t)(((unsigned long)instance->mfi_internal_dma_obj.buffer)); + instance->internal_buf_dmac_add = + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address; + instance->internal_buf_size = internal_buf_size; + + /* allocate evt_detail */ + instance->mfi_evt_detail_obj.size = sizeof (struct mrsas_evt_detail); + instance->mfi_evt_detail_obj.dma_attr = mrsas_generic_dma_attr; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_sgllen = 1; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_align = 8; + + if (mrsas_alloc_dma_obj(instance, &instance->mfi_evt_detail_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, "mrsas_tbolt_alloc_additional_dma_buffer: " + "could not allocate data transfer buffer."); + goto fail_tbolt_additional_buff; + } + + bzero(instance->mfi_evt_detail_obj.buffer, + sizeof (struct mrsas_evt_detail)); + + instance->mfi_evt_detail_obj.status |= DMA_OBJ_ALLOCATED; + + instance->size_map_info = sizeof (MR_FW_RAID_MAP) + + (sizeof (MR_LD_SPAN_MAP) * (MAX_LOGICAL_DRIVES - 1)); + + for (i = 0; i < 2; i++) { + /* allocate the data transfer buffer */ + instance->ld_map_obj[i].size = instance->size_map_info; + instance->ld_map_obj[i].dma_attr = mrsas_generic_dma_attr; + instance->ld_map_obj[i].dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->ld_map_obj[i].dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->ld_map_obj[i].dma_attr.dma_attr_sgllen = 1; + instance->ld_map_obj[i].dma_attr.dma_attr_align = 1; + + if (mrsas_alloc_dma_obj(instance, &instance->ld_map_obj[i], + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, + "could not allocate data transfer buffer."); + goto 
fail_tbolt_additional_buff; + } + + instance->ld_map_obj[i].status |= DMA_OBJ_ALLOCATED; + + (void) memset(instance->ld_map_obj[i].buffer, 0, + instance->size_map_info); + + instance->ld_map[i] = + (MR_FW_RAID_MAP_ALL *)instance->ld_map_obj[i].buffer; + instance->ld_map_phy[i] = (uint32_t)instance-> + ld_map_obj[i].dma_cookie[0].dmac_address; + + con_log(CL_DLEVEL3, (CE_NOTE, + "ld_map Addr Phys 0x%x", instance->ld_map_phy[i])); + + con_log(CL_DLEVEL3, (CE_NOTE, + "size_map_info 0x%x", instance->size_map_info)); + } + + return (DDI_SUCCESS); + +fail_tbolt_additional_buff: + mrsas_tbolt_free_additional_dma_buffer(instance); + + return (DDI_FAILURE); +} + +MRSAS_REQUEST_DESCRIPTOR_UNION * +mr_sas_get_request_descriptor(struct mrsas_instance *instance, uint16_t index) +{ + MRSAS_REQUEST_DESCRIPTOR_UNION *req_desc; + + if (index > instance->max_fw_cmds) { + con_log(CL_ANN1, (CE_NOTE, + "Invalid SMID 0x%x request for descriptor", index)); + con_log(CL_ANN1, (CE_NOTE, + "max_fw_cmds : 0x%x\n", instance->max_fw_cmds)); + return (NULL); + } + + req_desc = (MRSAS_REQUEST_DESCRIPTOR_UNION *) + ((char *)instance->request_message_pool + + (sizeof (MRSAS_REQUEST_DESCRIPTOR_UNION) * index)); + + con_log(CL_ANN1, (CE_NOTE, + "request descriptor : 0x%08lx\n", (unsigned long)req_desc)); + + con_log(CL_ANN1, (CE_NOTE, + "request descriptor base phy : 0x%08lx\n", + (unsigned long)instance->request_message_pool_phy)); + + return ((MRSAS_REQUEST_DESCRIPTOR_UNION *)req_desc); +} + + +/* + * Allocate Request and Reply Queue Descriptors. + */ +int +alloc_req_rep_desc(struct mrsas_instance *instance) +{ + uint32_t request_q_sz, reply_q_sz; + int i, max_reply_q_sz; + MPI2_REPLY_DESCRIPTORS_UNION *reply_desc; + + /* + * ThunderBolt(TB) There's no longer producer consumer mechanism. + * Once we have an interrupt we are supposed to scan through the list of + * reply descriptors and process them accordingly. 
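Both create_mpi2_frame_pool() above and mr_sas_get_request_descriptor() address per-command structures by offsetting into one contiguous DMA allocation rather than allocating per command. The fragment below is a simplified userland illustration of that carving, with made-up sizes (FRAME_SZ, SGL_SZ, NCMD) that only stand in for the driver's real constants.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define FRAME_SZ	256	/* per-command message frame (illustrative) */
#define SGL_SZ		1024	/* per-command chained-SGL area (illustrative) */
#define NCMD		8

int
main(void)
{
	/* one contiguous buffer stands in for the single DMA chunk */
	uint8_t *pool = calloc(NCMD, FRAME_SZ + SGL_SZ);
	uint8_t *frame_base = pool;			/* frames first ... */
	uint8_t *sgl_base = pool + NCMD * FRAME_SZ;	/* ... then SGL areas */

	for (int i = 0; i < NCMD; i++) {
		uint8_t *frame = frame_base + (size_t)i * FRAME_SZ;
		uint8_t *sgl = sgl_base + (size_t)i * SGL_SZ;

		/* SMID 0 is reserved, so command i gets SMID i + 1 */
		printf("cmd %d: frame %p sgl %p SMID %d\n", i,
		    (void *)frame, (void *)sgl, i + 1);
	}
	free(pool);
	return (0);
}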
We would be needing + * to allocate memory for 1024 reply descriptors + */ + + /* Allocate Reply Descriptors */ + con_log(CL_ANN1, (CE_NOTE, " reply q desc len = %x\n", + (uint_t)sizeof (MPI2_REPLY_DESCRIPTORS_UNION))); + + /* reply queue size should be multiple of 16 */ + max_reply_q_sz = ((instance->max_fw_cmds + 1 + 15)/16)*16; + + reply_q_sz = 8 * max_reply_q_sz; + + + con_log(CL_ANN1, (CE_NOTE, " reply q desc len = %x\n", + (uint_t)sizeof (MPI2_REPLY_DESCRIPTORS_UNION))); + + instance->reply_desc_dma_obj.size = reply_q_sz; + instance->reply_desc_dma_obj.dma_attr = mrsas_generic_dma_attr; + instance->reply_desc_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->reply_desc_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->reply_desc_dma_obj.dma_attr.dma_attr_sgllen = 1; + instance->reply_desc_dma_obj.dma_attr.dma_attr_align = 16; + + if (mrsas_alloc_dma_obj(instance, &instance->reply_desc_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, + "mr_sas: could not alloc reply queue"); + return (DDI_FAILURE); + } + + bzero(instance->reply_desc_dma_obj.buffer, reply_q_sz); + instance->reply_desc_dma_obj.status |= DMA_OBJ_ALLOCATED; + + /* virtual address of reply queue */ + instance->reply_frame_pool = (MPI2_REPLY_DESCRIPTORS_UNION *)( + instance->reply_desc_dma_obj.buffer); + + instance->reply_q_depth = max_reply_q_sz; + + con_log(CL_ANN1, (CE_NOTE, "[reply queue depth]0x%x", + instance->reply_q_depth)); + + con_log(CL_ANN1, (CE_NOTE, "[reply queue virt addr]0x%p", + (void *)instance->reply_frame_pool)); + + /* initializing reply address to 0xFFFFFFFF */ + reply_desc = instance->reply_frame_pool; + + for (i = 0; i < instance->reply_q_depth; i++) { + reply_desc->Words = (uint64_t)~0; + reply_desc++; + } + + + instance->reply_frame_pool_phy = + (uint32_t)instance->reply_desc_dma_obj.dma_cookie[0].dmac_address; + + con_log(CL_ANN1, (CE_NOTE, + "[reply queue phys addr]0x%x", instance->reply_frame_pool_phy)); + + + instance->reply_pool_limit_phy = (instance->reply_frame_pool_phy + + reply_q_sz); + + con_log(CL_ANN1, (CE_NOTE, "[reply pool limit phys addr]0x%x", + instance->reply_pool_limit_phy)); + + + con_log(CL_ANN1, (CE_NOTE, " request q desc len = %x\n", + (int)sizeof (MRSAS_REQUEST_DESCRIPTOR_UNION))); + + /* Allocate Request Descriptors */ + con_log(CL_ANN1, (CE_NOTE, " request q desc len = %x\n", + (int)sizeof (MRSAS_REQUEST_DESCRIPTOR_UNION))); + + request_q_sz = 8 * + (instance->max_fw_cmds); + + instance->request_desc_dma_obj.size = request_q_sz; + instance->request_desc_dma_obj.dma_attr = mrsas_generic_dma_attr; + instance->request_desc_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->request_desc_dma_obj.dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->request_desc_dma_obj.dma_attr.dma_attr_sgllen = 1; + instance->request_desc_dma_obj.dma_attr.dma_attr_align = 16; + + if (mrsas_alloc_dma_obj(instance, &instance->request_desc_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, + "mr_sas: could not alloc request queue desc"); + goto fail_undo_reply_queue; + } + + bzero(instance->request_desc_dma_obj.buffer, request_q_sz); + instance->request_desc_dma_obj.status |= DMA_OBJ_ALLOCATED; + + /* virtual address of request queue desc */ + instance->request_message_pool = (MRSAS_REQUEST_DESCRIPTOR_UNION *) + (instance->request_desc_dma_obj.buffer); + + instance->request_message_pool_phy = + (uint32_t)instance->request_desc_dma_obj.dma_cookie[0].dmac_address; + + return (DDI_SUCCESS); + +fail_undo_reply_queue: + if 
(instance->reply_desc_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->reply_desc_dma_obj); + instance->reply_desc_dma_obj.status = DMA_OBJ_FREED; + } + + return (DDI_FAILURE); +} + +/* + * mrsas_alloc_cmd_pool_tbolt + * + * TODO: merge tbolt-specific codee into mrsas_alloc_cmd_pool() to have single + * routine + */ +int +mrsas_alloc_cmd_pool_tbolt(struct mrsas_instance *instance) +{ + int i; + int count; + uint32_t max_cmd; + uint32_t reserve_cmd; + size_t sz; + + struct mrsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + con_log(CL_ANN1, (CE_NOTE, "mrsas_alloc_cmd_pool: " + "max_cmd %x", max_cmd)); + + + sz = sizeof (struct mrsas_cmd *) * max_cmd; + + /* + * instance->cmd_list is an array of struct mrsas_cmd pointers. + * Allocate the dynamic array first and then allocate individual + * commands. + */ + instance->cmd_list = kmem_zalloc(sz, KM_SLEEP); + if (instance->cmd_list == NULL) { + con_log(CL_NONE, (CE_WARN, + "Failed to allocate memory for cmd_list")); + return (DDI_FAILURE); + } + + /* create a frame pool and assign one frame to each cmd */ + for (count = 0; count < max_cmd; count++) { + instance->cmd_list[count] = + kmem_zalloc(sizeof (struct mrsas_cmd), KM_SLEEP); + if (instance->cmd_list[count] == NULL) { + con_log(CL_NONE, (CE_WARN, + "Failed to allocate memory for mrsas_cmd")); + goto mrsas_undo_cmds; + } + } + + /* add all the commands to command pool */ + + INIT_LIST_HEAD(&instance->cmd_pool_list); + INIT_LIST_HEAD(&instance->cmd_pend_list); + INIT_LIST_HEAD(&instance->cmd_app_pool_list); + + reserve_cmd = MRSAS_APP_RESERVED_CMDS; + + /* cmd index 0 reservered for IOC INIT */ + for (i = 1; i < reserve_cmd; i++) { + cmd = instance->cmd_list[i]; + cmd->index = i; + mlist_add_tail(&cmd->list, &instance->cmd_app_pool_list); + } + + + for (i = reserve_cmd; i < max_cmd; i++) { + cmd = instance->cmd_list[i]; + cmd->index = i; + mlist_add_tail(&cmd->list, &instance->cmd_pool_list); + } + + return (DDI_SUCCESS); + +mrsas_undo_cmds: + if (count > 0) { + /* free each cmd */ + for (i = 0; i < count; i++) { + if (instance->cmd_list[i] != NULL) { + kmem_free(instance->cmd_list[i], + sizeof (struct mrsas_cmd)); + } + instance->cmd_list[i] = NULL; + } + } + +mrsas_undo_cmd_list: + if (instance->cmd_list != NULL) + kmem_free(instance->cmd_list, sz); + instance->cmd_list = NULL; + + return (DDI_FAILURE); +} + + +/* + * free_space_for_mpi2 + */ +void +free_space_for_mpi2(struct mrsas_instance *instance) +{ + /* already freed */ + if (instance->cmd_list == NULL) { + return; + } + + /* First free the additional DMA buffer */ + mrsas_tbolt_free_additional_dma_buffer(instance); + + /* Free the request/reply descriptor pool */ + free_req_rep_desc_pool(instance); + + /* Free the MPI message pool */ + destroy_mpi2_frame_pool(instance); + + /* Free the MFI frame pool */ + destroy_mfi_frame_pool(instance); + + /* Free all the commands in the cmd_list */ + /* Free the cmd_list buffer itself */ + mrsas_free_cmd_pool(instance); +} + + +/* + * ThunderBolt(TB) memory allocations for commands/messages/frames. 
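The reply-queue sizing in alloc_req_rep_desc() above rounds the depth up to a multiple of 16 and then multiplies by 8 bytes per reply descriptor, matching the reply_q_sz computation in that function. A small worked example, using an assumed max_fw_cmds of 1007 purely for illustration:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint32_t max_fw_cmds = 1007;	/* example value, not from the driver */

	/* depth = (max commands + 1), rounded up to a multiple of 16 */
	uint32_t depth = ((max_fw_cmds + 1 + 15) / 16) * 16;
	uint32_t reply_q_sz = 8 * depth;

	printf("depth %u (multiple of 16), bytes %u\n", depth, reply_q_sz);
	return (0);
}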
+ */ +int +alloc_space_for_mpi2(struct mrsas_instance *instance) +{ + /* Allocate command pool (memory for cmd_list & individual commands) */ + if (mrsas_alloc_cmd_pool_tbolt(instance)) { + cmn_err(CE_WARN, "Error creating cmd pool"); + return (DDI_FAILURE); + } + + /* Initialize single reply size and Message size */ + instance->reply_size = MRSAS_THUNDERBOLT_REPLY_SIZE; + instance->raid_io_msg_size = MRSAS_THUNDERBOLT_MSG_SIZE; + + instance->max_sge_in_main_msg = (MRSAS_THUNDERBOLT_MSG_SIZE - + (sizeof (MPI2_RAID_SCSI_IO_REQUEST) - + sizeof (MPI2_SGE_IO_UNION)))/ sizeof (MPI2_SGE_IO_UNION); + instance->max_sge_in_chain = (MR_COMMAND_SIZE - + MRSAS_THUNDERBOLT_MSG_SIZE) / sizeof (MPI2_SGE_IO_UNION); + + /* Reduce SG count by 1 to take care of group cmds feature in FW */ + instance->max_num_sge = (instance->max_sge_in_main_msg + + instance->max_sge_in_chain - 2); + instance->chain_offset_mpt_msg = + offsetof(MPI2_RAID_SCSI_IO_REQUEST, SGL) / 16; + instance->chain_offset_io_req = (MRSAS_THUNDERBOLT_MSG_SIZE - + sizeof (MPI2_SGE_IO_UNION)) / 16; + instance->reply_read_index = 0; + + + /* Allocate Request and Reply descriptors Array */ + /* Make sure the buffer is aligned to 8 for req/rep descriptor Pool */ + if (alloc_req_rep_desc(instance)) { + cmn_err(CE_WARN, + "Error, allocating memory for descripter-pool"); + goto mpi2_undo_cmd_pool; + } + con_log(CL_ANN1, (CE_NOTE, "[request message pool phys addr]0x%x", + instance->request_message_pool_phy)); + + + /* Allocate MFI Frame pool - for MPI-MFI passthru commands */ + if (create_mfi_frame_pool(instance)) { + cmn_err(CE_WARN, + "Error, allocating memory for MFI frame-pool"); + goto mpi2_undo_descripter_pool; + } + + + /* Allocate MPI2 Message pool */ + /* + * Make sure the buffer is alligned to 256 for raid message packet + * create a io request pool and assign one frame to each cmd + */ + + if (create_mpi2_frame_pool(instance)) { + cmn_err(CE_WARN, + "Error, allocating memory for MPI2 Message-pool"); + goto mpi2_undo_mfi_frame_pool; + } + +#ifdef DEBUG + con_log(CL_ANN1, (CE_CONT, "[max_sge_in_main_msg]0x%x", + instance->max_sge_in_main_msg)); + con_log(CL_ANN1, (CE_CONT, "[max_sge_in_chain]0x%x", + instance->max_sge_in_chain)); + con_log(CL_ANN1, (CE_CONT, + "[max_sge]0x%x", instance->max_num_sge)); + con_log(CL_ANN1, (CE_CONT, "[chain_offset_mpt_msg]0x%x", + instance->chain_offset_mpt_msg)); + con_log(CL_ANN1, (CE_CONT, "[chain_offset_io_req]0x%x", + instance->chain_offset_io_req)); +#endif + + + /* Allocate additional dma buffer */ + if (mrsas_tbolt_alloc_additional_dma_buffer(instance)) { + cmn_err(CE_WARN, + "Error, allocating tbolt additional DMA buffer"); + goto mpi2_undo_message_pool; + } + + return (DDI_SUCCESS); + +mpi2_undo_message_pool: + destroy_mpi2_frame_pool(instance); + +mpi2_undo_mfi_frame_pool: + destroy_mfi_frame_pool(instance); + +mpi2_undo_descripter_pool: + free_req_rep_desc_pool(instance); + +mpi2_undo_cmd_pool: + mrsas_free_cmd_pool(instance); + + return (DDI_FAILURE); +} + + +/* + * mrsas_init_adapter_tbolt - Initialize fusion interface adapter. + */ +int +mrsas_init_adapter_tbolt(struct mrsas_instance *instance) +{ + + /* + * Reduce the max supported cmds by 1. 
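alloc_space_for_mpi2() above derives the scatter-gather budget from the message-frame size, the SGE size, and MR_COMMAND_SIZE. The arithmetic sketch below reruns that calculation with assumed values (a 256-byte message frame, 16-byte SGEs, and a 128-byte fixed request header); only MR_COMMAND_SIZE (64*20 = 1280) comes from the #define earlier in this file, the rest are illustrative guesses.

#include <stdio.h>

int
main(void)
{
	int msg_size = 256;		/* assumed MRSAS_THUNDERBOLT_MSG_SIZE */
	int sge_size = 16;		/* assumed sizeof (MPI2_SGE_IO_UNION) */
	int io_req_fixed = 128;		/* assumed non-SGL part of the request */
	int mr_command_size = 64 * 20;	/* 1280, as #defined above */

	/* SGEs that fit in the main frame after the fixed request header */
	int in_main = (msg_size - io_req_fixed) / sge_size;
	/* SGEs that fit in the chained area beyond the main frame */
	int in_chain = (mr_command_size - msg_size) / sge_size;
	/* minus 2, mirroring the "group cmds" adjustment in the driver */
	int max_sge = in_main + in_chain - 2;

	printf("main %d chain %d total %d\n", in_main, in_chain, max_sge);
	return (0);
}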
This is to ensure that the + * reply_q_sz (1 more than the max cmd that driver may send) + * does not exceed max cmds that the FW can support + */ + + if (instance->max_fw_cmds > 1008) { + instance->max_fw_cmds = 1008; + instance->max_fw_cmds = instance->max_fw_cmds-1; + } + + con_log(CL_ANN, (CE_NOTE, "mrsas_init_adapter_tbolt: " + " instance->max_fw_cmds 0x%X.", instance->max_fw_cmds)); + + + /* create a pool of commands */ + if (alloc_space_for_mpi2(instance) != DDI_SUCCESS) { + cmn_err(CE_WARN, + " alloc_space_for_mpi2() failed."); + + return (DDI_FAILURE); + } + + /* Send ioc init message */ + /* NOTE: the issue_init call does FMA checking already. */ + if (mrsas_issue_init_mpi2(instance) != DDI_SUCCESS) { + cmn_err(CE_WARN, + " mrsas_issue_init_mpi2() failed."); + + goto fail_init_fusion; + } + + instance->unroll.alloc_space_mpi2 = 1; + + con_log(CL_ANN, (CE_NOTE, + "mrsas_init_adapter_tbolt: SUCCESSFULL\n")); + + return (DDI_SUCCESS); + +fail_init_fusion: + free_space_for_mpi2(instance); + + return (DDI_FAILURE); +} + + + +/* + * init_mpi2 + */ +int +mrsas_issue_init_mpi2(struct mrsas_instance *instance) +{ + dma_obj_t init2_dma_obj; + int ret_val = DDI_SUCCESS; + + /* allocate DMA buffer for IOC INIT message */ + init2_dma_obj.size = sizeof (Mpi2IOCInitRequest_t); + init2_dma_obj.dma_attr = mrsas_generic_dma_attr; + init2_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + init2_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + init2_dma_obj.dma_attr.dma_attr_sgllen = 1; + init2_dma_obj.dma_attr.dma_attr_align = 256; + + if (mrsas_alloc_dma_obj(instance, &init2_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, "mr_sas_issue_init_mpi2 " + "could not allocate data transfer buffer."); + return (DDI_FAILURE); + } + (void) memset(init2_dma_obj.buffer, 2, + sizeof (Mpi2IOCInitRequest_t)); + + con_log(CL_ANN1, (CE_NOTE, + "mrsas_issue_init_mpi2 _phys adr: %x \n", + init2_dma_obj.dma_cookie[0].dmac_address)); + + + /* Initialize and send ioc init message */ + ret_val = mrsas_tbolt_ioc_init(instance, &init2_dma_obj); + if (ret_val == DDI_FAILURE) { + con_log(CL_ANN1, (CE_WARN, + "mrsas_issue_init_mpi2: Failed\n")); + goto fail_init_mpi2; + } + + /* free IOC init DMA buffer */ + if (mrsas_free_dma_obj(instance, init2_dma_obj) + != DDI_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, + "mrsas_issue_init_mpi2: Free Failed\n")); + return (DDI_FAILURE); + } + + /* Get/Check and sync ld_map info */ + instance->map_id = 0; + if (mrsas_tbolt_check_map_info(instance) == DDI_SUCCESS) + (void) mrsas_tbolt_sync_map_info(instance); + + + /* No mrsas_cmd to send, so send NULL. 
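mrsas_issue_init_mpi2() above starts with map_id = 0, and the command-build path later selects ld_map[(map_id & 1)], which points at a two-buffer (ping-pong) RAID map. The toy model below assumes, for illustration only, that map_id advances each time a new map is synced; the actual trigger lives elsewhere in the driver.

#include <stdio.h>

int
main(void)
{
	const char *ld_map[2] = { "map buffer A", "map buffer B" };
	unsigned int map_id = 0;

	for (int gen = 0; gen < 4; gen++) {
		/* (map_id & 1) always names the buffer holding the live map */
		printf("generation %d uses %s\n", gen, ld_map[map_id & 1]);
		map_id++;	/* a new map arrived; flip buffers */
	}
	return (0);
}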
*/ + if (mrsas_common_check(instance, NULL) != DDI_SUCCESS) + goto fail_init_mpi2; + + con_log(CL_ANN, (CE_NOTE, + "mrsas_issue_init_mpi2: SUCCESSFULL\n")); + + return (DDI_SUCCESS); + +fail_init_mpi2: + (void) mrsas_free_dma_obj(instance, init2_dma_obj); + + return (DDI_FAILURE); +} + +static int +mrsas_tbolt_ioc_init(struct mrsas_instance *instance, dma_obj_t *mpi2_dma_obj) +{ + int numbytes; + uint16_t flags; + struct mrsas_init_frame2 *mfiFrameInit2; + struct mrsas_header *frame_hdr; + Mpi2IOCInitRequest_t *init; + struct mrsas_cmd *cmd = NULL; + struct mrsas_drv_ver drv_ver_info; + MRSAS_REQUEST_DESCRIPTOR_UNION *req_desc; + + con_log(CL_ANN, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + +#ifdef DEBUG + con_log(CL_ANN1, (CE_CONT, " mfiFrameInit2 len = %x\n", + (int)sizeof (*mfiFrameInit2))); + con_log(CL_ANN1, (CE_CONT, " MPI len = %x\n", (int)sizeof (*init))); + con_log(CL_ANN1, (CE_CONT, " mfiFrameInit2 len = %x\n", + (int)sizeof (struct mrsas_init_frame2))); + con_log(CL_ANN1, (CE_CONT, " MPI len = %x\n", + (int)sizeof (Mpi2IOCInitRequest_t))); +#endif + + init = (Mpi2IOCInitRequest_t *)mpi2_dma_obj->buffer; + numbytes = sizeof (*init); + bzero(init, numbytes); + + ddi_put8(mpi2_dma_obj->acc_handle, &init->Function, + MPI2_FUNCTION_IOC_INIT); + + ddi_put8(mpi2_dma_obj->acc_handle, &init->WhoInit, + MPI2_WHOINIT_HOST_DRIVER); + + /* set MsgVersion and HeaderVersion host driver was built with */ + ddi_put16(mpi2_dma_obj->acc_handle, &init->MsgVersion, + MPI2_VERSION); + + ddi_put16(mpi2_dma_obj->acc_handle, &init->HeaderVersion, + MPI2_HEADER_VERSION); + + ddi_put16(mpi2_dma_obj->acc_handle, &init->SystemRequestFrameSize, + instance->raid_io_msg_size / 4); + + ddi_put16(mpi2_dma_obj->acc_handle, &init->ReplyFreeQueueDepth, + 0); + + ddi_put16(mpi2_dma_obj->acc_handle, + &init->ReplyDescriptorPostQueueDepth, + instance->reply_q_depth); + /* + * These addresses are set using the DMA cookie addresses from when the + * memory was allocated. Sense buffer hi address should be 0. 
+ * ddi_put32(accessp, &init->SenseBufferAddressHigh, 0); + */ + + ddi_put32(mpi2_dma_obj->acc_handle, + &init->SenseBufferAddressHigh, 0); + + ddi_put64(mpi2_dma_obj->acc_handle, + (uint64_t *)&init->SystemRequestFrameBaseAddress, + instance->io_request_frames_phy); + + ddi_put64(mpi2_dma_obj->acc_handle, + &init->ReplyDescriptorPostQueueAddress, + instance->reply_frame_pool_phy); + + ddi_put64(mpi2_dma_obj->acc_handle, + &init->ReplyFreeQueueAddress, 0); + + cmd = instance->cmd_list[0]; + if (cmd == NULL) { + return (DDI_FAILURE); + } + cmd->retry_count_for_ocr = 0; + cmd->pkt = NULL; + cmd->drv_pkt_time = 0; + + mfiFrameInit2 = (struct mrsas_init_frame2 *)cmd->scsi_io_request; + con_log(CL_ANN1, (CE_CONT, "[mfi vaddr]%p", (void *)mfiFrameInit2)); + + frame_hdr = &cmd->frame->hdr; + + ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + + flags = ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags); + + flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; + + ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags); + + con_log(CL_ANN, (CE_CONT, + "mrsas_tbolt_ioc_init: SMID:%x\n", cmd->SMID)); + + /* Init the MFI Header */ + ddi_put8(instance->mpi2_frame_pool_dma_obj.acc_handle, + &mfiFrameInit2->cmd, MFI_CMD_OP_INIT); + + con_log(CL_ANN1, (CE_CONT, "[CMD]%x", mfiFrameInit2->cmd)); + + ddi_put8(instance->mpi2_frame_pool_dma_obj.acc_handle, + &mfiFrameInit2->cmd_status, + MFI_STAT_INVALID_STATUS); + + con_log(CL_ANN1, (CE_CONT, "[Status]%x", mfiFrameInit2->cmd_status)); + + ddi_put32(instance->mpi2_frame_pool_dma_obj.acc_handle, + &mfiFrameInit2->queue_info_new_phys_addr_lo, + mpi2_dma_obj->dma_cookie[0].dmac_address); + + ddi_put32(instance->mpi2_frame_pool_dma_obj.acc_handle, + &mfiFrameInit2->data_xfer_len, + sizeof (Mpi2IOCInitRequest_t)); + + con_log(CL_ANN1, (CE_CONT, "[reply q desc addr]%x", + (int)init->ReplyDescriptorPostQueueAddress)); + + /* fill driver version information */ + fill_up_drv_ver(&drv_ver_info); + + /* allocate the driver version data transfer buffer */ + instance->drv_ver_dma_obj.size = sizeof (drv_ver_info.drv_ver); + instance->drv_ver_dma_obj.dma_attr = mrsas_generic_dma_attr; + instance->drv_ver_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->drv_ver_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->drv_ver_dma_obj.dma_attr.dma_attr_sgllen = 1; + instance->drv_ver_dma_obj.dma_attr.dma_attr_align = 1; + + if (mrsas_alloc_dma_obj(instance, &instance->drv_ver_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, + "fusion init: Could not allocate driver version buffer."); + return (DDI_FAILURE); + } + /* copy driver version to dma buffer */ + (void) memset(instance->drv_ver_dma_obj.buffer, 0, + sizeof (drv_ver_info.drv_ver)); + ddi_rep_put8(cmd->frame_dma_obj.acc_handle, + (uint8_t *)drv_ver_info.drv_ver, + (uint8_t *)instance->drv_ver_dma_obj.buffer, + sizeof (drv_ver_info.drv_ver), DDI_DEV_AUTOINCR); + + /* send driver version physical address to firmware */ + ddi_put64(cmd->frame_dma_obj.acc_handle, &mfiFrameInit2->driverversion, + instance->drv_ver_dma_obj.dma_cookie[0].dmac_address); + + con_log(CL_ANN1, (CE_CONT, "[MPIINIT2 frame Phys addr ]0x%x len = %x", + mfiFrameInit2->queue_info_new_phys_addr_lo, + (int)sizeof (Mpi2IOCInitRequest_t))); + + con_log(CL_ANN1, (CE_CONT, "[Length]%x", mfiFrameInit2->data_xfer_len)); + + con_log(CL_ANN1, (CE_CONT, "[MFI frame Phys Address]%x len = %x", + cmd->scsi_io_request_phys_addr, + (int)sizeof (struct mrsas_init_frame2))); + + 
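Every field of the IOC INIT request above is written through ddi_put8/16/32/64 against a DDI_STRUCTURE_LE_ACC handle, so the stores come out little-endian regardless of host byte order. The standalone sketch below, with a hypothetical put_le32() standing in for ddi_put32(), shows the byte-order guarantee being relied on.

#include <stdio.h>
#include <stdint.h>

/* store a 32-bit value little-endian, whatever the host endianness */
static void
put_le32(uint8_t *p, uint32_t v)
{
	p[0] = v & 0xff;
	p[1] = (v >> 8) & 0xff;
	p[2] = (v >> 16) & 0xff;
	p[3] = (v >> 24) & 0xff;
}

int
main(void)
{
	uint8_t field[4];

	put_le32(field, 0x12345678u);
	printf("%02x %02x %02x %02x\n",
	    field[0], field[1], field[2], field[3]);	/* 78 56 34 12 */
	return (0);
}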
/* disable interrupts before sending INIT2 frame */ + instance->func_ptr->disable_intr(instance); + + req_desc = (MRSAS_REQUEST_DESCRIPTOR_UNION *) + instance->request_message_pool; + req_desc->Words = cmd->scsi_io_request_phys_addr; + req_desc->MFAIo.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_MFA << MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + + cmd->request_desc = req_desc; + + /* issue the init frame */ + instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd); + + con_log(CL_ANN1, (CE_CONT, "[cmd = %d] ", frame_hdr->cmd)); + con_log(CL_ANN1, (CE_CONT, "[cmd Status= %x] ", + frame_hdr->cmd_status)); + + if (ddi_get8(instance->mpi2_frame_pool_dma_obj.acc_handle, + &mfiFrameInit2->cmd_status) == 0) { + con_log(CL_ANN, (CE_NOTE, "INIT2 Success")); + } else { + con_log(CL_ANN, (CE_WARN, "INIT2 Fail")); + mrsas_dump_reply_desc(instance); + goto fail_ioc_init; + } + + mrsas_dump_reply_desc(instance); + + instance->unroll.verBuff = 1; + + con_log(CL_ANN, (CE_NOTE, "mrsas_tbolt_ioc_init: SUCCESSFULL\n")); + + return (DDI_SUCCESS); + + +fail_ioc_init: + + (void) mrsas_free_dma_obj(instance, instance->drv_ver_dma_obj); + + return (DDI_FAILURE); +} + +int +wait_for_outstanding_poll_io(struct mrsas_instance *instance) +{ + int i; + uint32_t wait_time = dump_io_wait_time; + for (i = 0; i < wait_time; i++) { + /* + * Check For Outstanding poll Commands + * except ldsync command and aen command + */ + if (instance->fw_outstanding <= 2) { + break; + } + drv_usecwait(10*MILLISEC); + /* complete commands from reply queue */ + (void) mr_sas_tbolt_process_outstanding_cmd(instance); + } + if (instance->fw_outstanding > 2) { + return (1); + } + return (0); +} +/* + * scsi_pkt handling + * + * Visible to the external world via the transport structure. + */ + +int +mrsas_tbolt_tran_start(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + struct mrsas_instance *instance = ADDR2MR(ap); + struct scsa_cmd *acmd = PKT2CMD(pkt); + struct mrsas_cmd *cmd = NULL; + uchar_t cmd_done = 0; + + con_log(CL_DLEVEL1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + if (instance->deadadapter == 1) { + cmn_err(CE_WARN, + "mrsas_tran_start:TBOLT return TRAN_FATAL_ERROR " + "for IO, as the HBA doesnt take any more IOs"); + if (pkt) { + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + } + return (TRAN_FATAL_ERROR); + } + if (instance->adapterresetinprogress) { + con_log(CL_ANN, (CE_NOTE, "Reset flag set, " + "returning mfi_pkt and setting TRAN_BUSY\n")); + return (TRAN_BUSY); + } + (void) mrsas_tbolt_prepare_pkt(acmd); + + cmd = mrsas_tbolt_build_cmd(instance, ap, pkt, &cmd_done); + + /* + * Check if the command is already completed by the mrsas_build_cmd() + * routine. In which case the busy_flag would be clear and scb will be + * NULL and appropriate reason provided in pkt_reason field + */ + if (cmd_done) { + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_scbp[0] = STATUS_GOOD; + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + return (TRAN_ACCEPT); + } + + if (cmd == NULL) { + return (TRAN_BUSY); + } + + + if ((pkt->pkt_flags & FLAG_NOINTR) == 0) { + if (instance->fw_outstanding > instance->max_fw_cmds) { + cmn_err(CE_WARN, + "Command Queue Full... 
Returning BUSY \n"); + return_raid_msg_pkt(instance, cmd); + return (TRAN_BUSY); + } + + /* Synchronize the Cmd frame for the controller */ + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0, + DDI_DMA_SYNC_FORDEV); + + con_log(CL_ANN, (CE_CONT, "tbolt_issue_cmd: SCSI CDB[0]=0x%x " + "cmd->index:0x%x SMID 0x%x\n", pkt->pkt_cdbp[0], + cmd->index, cmd->SMID)); + + instance->func_ptr->issue_cmd(cmd, instance); + } else { + instance->func_ptr->issue_cmd(cmd, instance); + (void) wait_for_outstanding_poll_io(instance); + (void) mrsas_common_check(instance, cmd); + } + + return (TRAN_ACCEPT); +} + +/* + * prepare the pkt: + * the pkt may have been resubmitted or just reused so + * initialize some fields and do some checks. + */ +static int +mrsas_tbolt_prepare_pkt(struct scsa_cmd *acmd) +{ + struct scsi_pkt *pkt = CMD2PKT(acmd); + + + /* + * Reinitialize some fields that need it; the packet may + * have been resubmitted + */ + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_state = 0; + pkt->pkt_statistics = 0; + pkt->pkt_resid = 0; + + /* + * zero status byte. + */ + *(pkt->pkt_scbp) = 0; + + return (0); +} + + +int +mr_sas_tbolt_build_sgl(struct mrsas_instance *instance, + struct scsa_cmd *acmd, + struct mrsas_cmd *cmd, + Mpi2RaidSCSIIORequest_t *scsi_raid_io, + uint32_t *datalen) +{ + uint32_t MaxSGEs; + int sg_to_process; + uint32_t i, j; + uint32_t numElements, endElement; + Mpi25IeeeSgeChain64_t *ieeeChainElement = NULL; + Mpi25IeeeSgeChain64_t *scsi_raid_io_sgl_ieee = NULL; + ddi_acc_handle_t acc_handle = + instance->mpi2_frame_pool_dma_obj.acc_handle; + + con_log(CL_ANN1, (CE_NOTE, + "chkpnt: Building Chained SGL :%d", __LINE__)); + + /* Calulate SGE size in number of Words(32bit) */ + /* Clear the datalen before updating it. */ + *datalen = 0; + + MaxSGEs = instance->max_sge_in_main_msg; + + ddi_put16(acc_handle, &scsi_raid_io->SGLFlags, + MPI2_SGE_FLAGS_64_BIT_ADDRESSING); + + /* set data transfer flag. */ + if (acmd->cmd_flags & CFLAG_DMASEND) { + ddi_put32(acc_handle, &scsi_raid_io->Control, + MPI2_SCSIIO_CONTROL_WRITE); + } else { + ddi_put32(acc_handle, &scsi_raid_io->Control, + MPI2_SCSIIO_CONTROL_READ); + } + + + numElements = acmd->cmd_cookiecnt; + + con_log(CL_DLEVEL1, (CE_NOTE, "[SGE Count]:%x", numElements)); + + if (numElements > instance->max_num_sge) { + con_log(CL_ANN, (CE_NOTE, + "[Max SGE Count Exceeded]:%x", numElements)); + return (numElements); + } + + ddi_put8(acc_handle, &scsi_raid_io->RaidContext.numSGE, + (uint8_t)numElements); + + /* set end element in main message frame */ + endElement = (numElements <= MaxSGEs) ? 
numElements : (MaxSGEs - 1); + + /* prepare the scatter-gather list for the firmware */ + scsi_raid_io_sgl_ieee = + (Mpi25IeeeSgeChain64_t *)&scsi_raid_io->SGL.IeeeChain; + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + Mpi25IeeeSgeChain64_t *sgl_ptr_end = scsi_raid_io_sgl_ieee; + sgl_ptr_end += instance->max_sge_in_main_msg - 1; + + ddi_put8(acc_handle, &sgl_ptr_end->Flags, 0); + } + + for (i = 0; i < endElement; i++, scsi_raid_io_sgl_ieee++) { + ddi_put64(acc_handle, &scsi_raid_io_sgl_ieee->Address, + acmd->cmd_dmacookies[i].dmac_laddress); + + ddi_put32(acc_handle, &scsi_raid_io_sgl_ieee->Length, + acmd->cmd_dmacookies[i].dmac_size); + + ddi_put8(acc_handle, &scsi_raid_io_sgl_ieee->Flags, 0); + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + if (i == (numElements - 1)) { + ddi_put8(acc_handle, + &scsi_raid_io_sgl_ieee->Flags, + IEEE_SGE_FLAGS_END_OF_LIST); + } + } + + *datalen += acmd->cmd_dmacookies[i].dmac_size; + +#ifdef DEBUG + con_log(CL_DLEVEL1, (CE_NOTE, "[SGL Address]: %" PRIx64, + scsi_raid_io_sgl_ieee->Address)); + con_log(CL_DLEVEL1, (CE_NOTE, "[SGL Length]:%x", + scsi_raid_io_sgl_ieee->Length)); + con_log(CL_DLEVEL1, (CE_NOTE, "[SGL Flags]:%x", + scsi_raid_io_sgl_ieee->Flags)); +#endif + + } + + ddi_put8(acc_handle, &scsi_raid_io->ChainOffset, 0); + + /* check if chained SGL required */ + if (i < numElements) { + + con_log(CL_ANN1, (CE_NOTE, "[Chain Element index]:%x", i)); + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + uint16_t ioFlags = + ddi_get16(acc_handle, &scsi_raid_io->IoFlags); + + if ((ioFlags & + MPI25_SAS_DEVICE0_FLAGS_ENABLED_FAST_PATH) != + MPI25_SAS_DEVICE0_FLAGS_ENABLED_FAST_PATH) { + ddi_put8(acc_handle, &scsi_raid_io->ChainOffset, + (U8)instance->chain_offset_io_req); + } else { + ddi_put8(acc_handle, + &scsi_raid_io->ChainOffset, 0); + } + } else { + ddi_put8(acc_handle, &scsi_raid_io->ChainOffset, + (U8)instance->chain_offset_io_req); + } + + /* prepare physical chain element */ + ieeeChainElement = scsi_raid_io_sgl_ieee; + + ddi_put8(acc_handle, &ieeeChainElement->NextChainOffset, 0); + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + ddi_put8(acc_handle, &ieeeChainElement->Flags, + IEEE_SGE_FLAGS_CHAIN_ELEMENT); + } else { + ddi_put8(acc_handle, &ieeeChainElement->Flags, + (IEEE_SGE_FLAGS_CHAIN_ELEMENT | + MPI2_IEEE_SGE_FLAGS_IOCPLBNTA_ADDR)); + } + + ddi_put32(acc_handle, &ieeeChainElement->Length, + (sizeof (MPI2_SGE_IO_UNION) * (numElements - i))); + + ddi_put64(acc_handle, &ieeeChainElement->Address, + (U64)cmd->sgl_phys_addr); + + sg_to_process = numElements - i; + + con_log(CL_ANN1, (CE_NOTE, + "[Additional SGE Count]:%x", endElement)); + + /* point to the chained SGL buffer */ + scsi_raid_io_sgl_ieee = (Mpi25IeeeSgeChain64_t *)cmd->sgl; + + /* build rest of the SGL in chained buffer */ + for (j = 0; j < sg_to_process; j++, scsi_raid_io_sgl_ieee++) { + con_log(CL_DLEVEL3, (CE_NOTE, "[remaining SGL]:%x", i)); + + ddi_put64(acc_handle, &scsi_raid_io_sgl_ieee->Address, + acmd->cmd_dmacookies[i].dmac_laddress); + + ddi_put32(acc_handle, &scsi_raid_io_sgl_ieee->Length, + acmd->cmd_dmacookies[i].dmac_size); + + ddi_put8(acc_handle, &scsi_raid_io_sgl_ieee->Flags, 0); + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + if (i == (numElements - 1)) { + ddi_put8(acc_handle, + &scsi_raid_io_sgl_ieee->Flags, + IEEE_SGE_FLAGS_END_OF_LIST); + } + } + + *datalen += acmd->cmd_dmacookies[i].dmac_size; + +#if DEBUG + con_log(CL_DLEVEL1, (CE_NOTE, + "[SGL Address]: %" PRIx64, + 
scsi_raid_io_sgl_ieee->Address)); + con_log(CL_DLEVEL1, (CE_NOTE, + "[SGL Length]:%x", scsi_raid_io_sgl_ieee->Length)); + con_log(CL_DLEVEL1, (CE_NOTE, + "[SGL Flags]:%x", scsi_raid_io_sgl_ieee->Flags)); +#endif + + i++; + } + } + + return (0); +} /*end of BuildScatterGather */ + + +/* + * build_cmd + */ +static struct mrsas_cmd * +mrsas_tbolt_build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, + struct scsi_pkt *pkt, uchar_t *cmd_done) +{ + uint8_t fp_possible = 0; + uint32_t index; + uint32_t lba_count = 0; + uint32_t start_lba_hi = 0; + uint32_t start_lba_lo = 0; + ddi_acc_handle_t acc_handle = + instance->mpi2_frame_pool_dma_obj.acc_handle; + struct mrsas_cmd *cmd = NULL; + struct scsa_cmd *acmd = PKT2CMD(pkt); + MRSAS_REQUEST_DESCRIPTOR_UNION *ReqDescUnion; + Mpi2RaidSCSIIORequest_t *scsi_raid_io; + uint32_t datalen; + struct IO_REQUEST_INFO io_info; + MR_FW_RAID_MAP_ALL *local_map_ptr; + uint16_t pd_cmd_cdblen; + + con_log(CL_DLEVEL1, (CE_NOTE, + "chkpnt: Entered mrsas_tbolt_build_cmd:%d", __LINE__)); + + /* find out if this is logical or physical drive command. */ + acmd->islogical = MRDRV_IS_LOGICAL(ap); + acmd->device_id = MAP_DEVICE_ID(instance, ap); + + *cmd_done = 0; + + /* get the command packet */ + if (!(cmd = get_raid_msg_pkt(instance))) { + return (NULL); + } + + index = cmd->index; + ReqDescUnion = mr_sas_get_request_descriptor(instance, index); + ReqDescUnion->Words = 0; + ReqDescUnion->SCSIIO.SMID = cmd->SMID; + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_LD_IO << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + + + cmd->request_desc = ReqDescUnion; + cmd->pkt = pkt; + cmd->cmd = acmd; + + /* lets get the command directions */ + if (acmd->cmd_flags & CFLAG_DMASEND) { + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORDEV); + } + } else if (acmd->cmd_flags & ~CFLAG_DMASEND) { + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } else { + con_log(CL_ANN, (CE_NOTE, "NO DMA\n")); + } + + + /* get SCSI_IO raid message frame pointer */ + scsi_raid_io = (Mpi2RaidSCSIIORequest_t *)cmd->scsi_io_request; + + /* zero out SCSI_IO raid message frame */ + (void) memset(scsi_raid_io, 0, sizeof (Mpi2RaidSCSIIORequest_t)); + + /* Set the ldTargetId set by BuildRaidContext() */ + ddi_put16(acc_handle, &scsi_raid_io->RaidContext.ldTargetId, + acmd->device_id); + + /* Copy CDB to scsi_io_request message frame */ + ddi_rep_put8(acc_handle, + (uint8_t *)pkt->pkt_cdbp, (uint8_t *)scsi_raid_io->CDB.CDB32, + acmd->cmd_cdblen, DDI_DEV_AUTOINCR); + + /* + * Just the CDB length, rest of the Flags are zero + * This will be modified later. 
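mr_sas_tbolt_build_sgl() above places the first SGEs directly in the main message frame and, when the cookie count exceeds that space, turns the last main-frame slot into a chain element pointing at the per-command SGL buffer that holds the rest. The toy model below mirrors only that split decision; MAIN_SLOTS and the cookie count are invented numbers, not driver values.

#include <stdio.h>

#define MAIN_SLOTS	4	/* assumed max_sge_in_main_msg */

int
main(void)
{
	int ncookies = 9;	/* example DMA cookie count */
	int in_main, in_chain;

	if (ncookies <= MAIN_SLOTS) {
		in_main = ncookies;	/* everything fits in the frame */
		in_chain = 0;
	} else {
		in_main = MAIN_SLOTS - 1;	/* last slot becomes the chain */
		in_chain = ncookies - in_main;
	}
	printf("%d SGEs in frame, %d in chain buffer\n", in_main, in_chain);
	return (0);
}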
+ */ + ddi_put16(acc_handle, &scsi_raid_io->IoFlags, acmd->cmd_cdblen); + + pd_cmd_cdblen = acmd->cmd_cdblen; + + switch (pkt->pkt_cdbp[0]) { + case SCMD_READ: + case SCMD_WRITE: + case SCMD_READ_G1: + case SCMD_WRITE_G1: + case SCMD_READ_G4: + case SCMD_WRITE_G4: + case SCMD_READ_G5: + case SCMD_WRITE_G5: + + if (acmd->islogical) { + /* Initialize sense Information */ + if (cmd->sense1 == NULL) { + con_log(CL_ANN, (CE_NOTE, "tbolt_build_cmd: " + "Sense buffer ptr NULL \n")); + } + bzero(cmd->sense1, SENSE_LENGTH); + con_log(CL_DLEVEL2, (CE_NOTE, "tbolt_build_cmd " + "CDB[0] = %x\n", pkt->pkt_cdbp[0])); + + if (acmd->cmd_cdblen == CDB_GROUP0) { + /* 6-byte cdb */ + lba_count = (uint16_t)(pkt->pkt_cdbp[4]); + start_lba_lo = ((uint32_t)(pkt->pkt_cdbp[3]) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 8) | + ((uint32_t)((pkt->pkt_cdbp[1]) & 0x1F) + << 16)); + } else if (acmd->cmd_cdblen == CDB_GROUP1) { + /* 10-byte cdb */ + lba_count = + (((uint16_t)(pkt->pkt_cdbp[8])) | + ((uint16_t)(pkt->pkt_cdbp[7]) << 8)); + + start_lba_lo = + (((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24)); + + } else if (acmd->cmd_cdblen == CDB_GROUP5) { + /* 12-byte cdb */ + lba_count = ( + ((uint32_t)(pkt->pkt_cdbp[9])) | + ((uint32_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[6]) << 24)); + + start_lba_lo = + (((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24)); + + } else if (acmd->cmd_cdblen == CDB_GROUP4) { + /* 16-byte cdb */ + lba_count = ( + ((uint32_t)(pkt->pkt_cdbp[13])) | + ((uint32_t)(pkt->pkt_cdbp[12]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[11]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[10]) << 24)); + + start_lba_lo = ( + ((uint32_t)(pkt->pkt_cdbp[9])) | + ((uint32_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[6]) << 24)); + + start_lba_hi = ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24)); + } + + if (instance->tbolt && + ((lba_count * 512) > mrsas_tbolt_max_cap_maxxfer)) { + cmn_err(CE_WARN, " IO SECTOR COUNT exceeds " + "controller limit 0x%x sectors\n", + lba_count); + } + + (void) memset(&io_info, 0, + sizeof (struct IO_REQUEST_INFO)); + io_info.ldStartBlock = ((uint64_t)start_lba_hi << 32) | + start_lba_lo; + io_info.numBlocks = lba_count; + io_info.ldTgtId = acmd->device_id; + + if (acmd->cmd_flags & CFLAG_DMASEND) + io_info.isRead = 0; + else + io_info.isRead = 1; + + + /* Acquire SYNC MAP UPDATE lock */ + mutex_enter(&instance->sync_map_mtx); + + local_map_ptr = + instance->ld_map[(instance->map_id & 1)]; + + if ((MR_TargetIdToLdGet( + acmd->device_id, local_map_ptr) >= + MAX_LOGICAL_DRIVES) || !instance->fast_path_io) { + cmn_err(CE_NOTE, "Fast Path NOT Possible, " + "targetId >= MAX_LOGICAL_DRIVES || " + "!instance->fast_path_io\n"); + fp_possible = 0; + /* Set Regionlock flags to BYPASS */ + /* io_request->RaidContext.regLockFlags = 0; */ + ddi_put8(acc_handle, + &scsi_raid_io->RaidContext.regLockFlags, 0); + } else { + if (MR_BuildRaidContext(instance, &io_info, + &scsi_raid_io->RaidContext, local_map_ptr)) + fp_possible = io_info.fpOkForIo; + } + + if (!enable_fp) + fp_possible = 0; + + con_log(CL_ANN1, (CE_NOTE, "enable_fp %d " + "instance->fast_path_io %d fp_possible %d \n", + enable_fp, 
instance->fast_path_io, fp_possible)); + + if (fp_possible) { + + /* Check for DIF enabled LD */ + if (MR_CheckDIF(acmd->device_id, local_map_ptr)) { + /* Prepare 32 Byte CDB for DIF capable Disk */ + mrsas_tbolt_prepare_cdb(instance, + scsi_raid_io->CDB.CDB32, + &io_info, scsi_raid_io, start_lba_lo); + } else { + mrsas_tbolt_set_pd_lba(scsi_raid_io->CDB.CDB32, + (uint8_t *)&pd_cmd_cdblen, + io_info.pdBlock, io_info.numBlocks); + ddi_put16(acc_handle, + &scsi_raid_io->IoFlags, pd_cmd_cdblen); + } + + ddi_put8(acc_handle, &scsi_raid_io->Function, + MPI2_FUNCTION_SCSI_IO_REQUEST); + + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + uint8_t regLockFlags = ddi_get8(acc_handle, + &scsi_raid_io->RaidContext.regLockFlags); + uint16_t IoFlags = ddi_get16(acc_handle, + &scsi_raid_io->IoFlags); + + if (regLockFlags == REGION_TYPE_UNUSED) + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_NO_LOCK << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + + IoFlags |= + MPI25_SAS_DEVICE0_FLAGS_ENABLED_FAST_PATH; + regLockFlags |= + (MR_RL_FLAGS_GRANT_DESTINATION_CUDA | + MR_RL_FLAGS_SEQ_NUM_ENABLE); + + ddi_put8(acc_handle, + &scsi_raid_io->ChainOffset, 0); + ddi_put8(acc_handle, + &scsi_raid_io->RaidContext.nsegType, + ((0x01 << MPI2_NSEG_FLAGS_SHIFT) | + MPI2_TYPE_CUDA)); + ddi_put8(acc_handle, + &scsi_raid_io->RaidContext.regLockFlags, + regLockFlags); + ddi_put16(acc_handle, + &scsi_raid_io->IoFlags, IoFlags); + } + + if ((instance->load_balance_info[ + acmd->device_id].loadBalanceFlag) && + (io_info.isRead)) { + io_info.devHandle = + get_updated_dev_handle(&instance-> + load_balance_info[acmd->device_id], + &io_info); + cmd->load_balance_flag |= + MEGASAS_LOAD_BALANCE_FLAG; + } else { + cmd->load_balance_flag &= + ~MEGASAS_LOAD_BALANCE_FLAG; + } + + ReqDescUnion->SCSIIO.DevHandle = io_info.devHandle; + ddi_put16(acc_handle, &scsi_raid_io->DevHandle, + io_info.devHandle); + + } else { + ddi_put8(acc_handle, &scsi_raid_io->Function, + MPI2_FUNCTION_LD_IO_REQUEST); + + ddi_put16(acc_handle, + &scsi_raid_io->DevHandle, acmd->device_id); + + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_LD_IO << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + + ddi_put16(acc_handle, + &scsi_raid_io->RaidContext.timeoutValue, + local_map_ptr->raidMap.fpPdIoTimeoutSec); + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + uint8_t regLockFlags = ddi_get8(acc_handle, + &scsi_raid_io->RaidContext.regLockFlags); + + if (regLockFlags == REGION_TYPE_UNUSED) { + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_NO_LOCK << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + } + + regLockFlags |= + (MR_RL_FLAGS_GRANT_DESTINATION_CPU0 | + MR_RL_FLAGS_SEQ_NUM_ENABLE); + + ddi_put8(acc_handle, + &scsi_raid_io->RaidContext.nsegType, + ((0x01 << MPI2_NSEG_FLAGS_SHIFT) | + MPI2_TYPE_CUDA)); + ddi_put8(acc_handle, + &scsi_raid_io->RaidContext.regLockFlags, + regLockFlags); + } + } /* Not FP */ + + /* Release SYNC MAP UPDATE lock */ + mutex_exit(&instance->sync_map_mtx); + + + /* + * Set sense buffer physical address/length in scsi_io_request. 
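The READ/WRITE arm of mrsas_tbolt_build_cmd() above assembles the start LBA and block count from big-endian CDB bytes (bytes 2-5 and 7-8 for a 10-byte CDB). A worked example with an invented READ(10) CDB, not taken from the driver:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* READ(10) for LBA 0x12345678, 16 blocks (illustrative values) */
	uint8_t cdb[10] = { 0x28, 0, 0x12, 0x34, 0x56, 0x78, 0, 0, 0x10, 0 };
	uint32_t lba, nblocks;

	/* CDB fields are big-endian: byte 2 is the most significant */
	lba = ((uint32_t)cdb[2] << 24) | ((uint32_t)cdb[3] << 16) |
	    ((uint32_t)cdb[4] << 8) | (uint32_t)cdb[5];
	nblocks = ((uint32_t)cdb[7] << 8) | (uint32_t)cdb[8];

	printf("lba 0x%x, blocks %u\n", lba, nblocks);	/* 0x12345678, 16 */
	return (0);
}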
+ */ + ddi_put32(acc_handle, &scsi_raid_io->SenseBufferLowAddress, + cmd->sense_phys_addr1); + ddi_put8(acc_handle, &scsi_raid_io->SenseBufferLength, + SENSE_LENGTH); + + /* Construct SGL */ + ddi_put8(acc_handle, &scsi_raid_io->SGLOffset0, + offsetof(MPI2_RAID_SCSI_IO_REQUEST, SGL) / 4); + + (void) mr_sas_tbolt_build_sgl(instance, acmd, cmd, + scsi_raid_io, &datalen); + + ddi_put32(acc_handle, &scsi_raid_io->DataLength, datalen); + + break; +#ifndef PDSUPPORT /* if PDSUPPORT, skip break and fall through */ + } else { + break; +#endif + } + /* fall through For all non-rd/wr cmds */ + default: + switch (pkt->pkt_cdbp[0]) { + case 0x35: { /* SCMD_SYNCHRONIZE_CACHE */ + return_raid_msg_pkt(instance, cmd); + *cmd_done = 1; + return (NULL); + } + + case SCMD_MODE_SENSE: + case SCMD_MODE_SENSE_G1: { + union scsi_cdb *cdbp; + uint16_t page_code; + + cdbp = (void *)pkt->pkt_cdbp; + page_code = (uint16_t)cdbp->cdb_un.sg.scsi[0]; + switch (page_code) { + case 0x3: + case 0x4: + (void) mrsas_mode_sense_build(pkt); + return_raid_msg_pkt(instance, cmd); + *cmd_done = 1; + return (NULL); + } + break; + } + + default: { + /* + * Here we need to handle PASSTHRU for + * Logical Devices. Like Inquiry etc. + */ + + if (!(acmd->islogical)) { + + /* Acquire SYNC MAP UPDATE lock */ + mutex_enter(&instance->sync_map_mtx); + + local_map_ptr = + instance->ld_map[(instance->map_id & 1)]; + + ddi_put8(acc_handle, &scsi_raid_io->Function, + MPI2_FUNCTION_SCSI_IO_REQUEST); + + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + + ddi_put16(acc_handle, &scsi_raid_io->DevHandle, + local_map_ptr->raidMap. + devHndlInfo[acmd->device_id].curDevHdl); + + + /* Set regLockFlasgs to REGION_TYPE_BYPASS */ + ddi_put8(acc_handle, + &scsi_raid_io->RaidContext.regLockFlags, 0); + ddi_put64(acc_handle, + &scsi_raid_io->RaidContext.regLockRowLBA, + 0); + ddi_put32(acc_handle, + &scsi_raid_io->RaidContext.regLockLength, + 0); + ddi_put8(acc_handle, + &scsi_raid_io->RaidContext.RAIDFlags, + MR_RAID_FLAGS_IO_SUB_TYPE_SYSTEM_PD << + MR_RAID_CTX_RAID_FLAGS_IO_SUB_TYPE_SHIFT); + ddi_put16(acc_handle, + &scsi_raid_io->RaidContext.timeoutValue, + local_map_ptr->raidMap.fpPdIoTimeoutSec); + ddi_put16(acc_handle, + &scsi_raid_io->RaidContext.ldTargetId, + acmd->device_id); + ddi_put8(acc_handle, + &scsi_raid_io->LUN[1], acmd->lun); + + /* Release SYNC MAP UPDATE lock */ + mutex_exit(&instance->sync_map_mtx); + + } else { + ddi_put8(acc_handle, &scsi_raid_io->Function, + MPI2_FUNCTION_LD_IO_REQUEST); + ddi_put8(acc_handle, + &scsi_raid_io->LUN[1], acmd->lun); + ddi_put16(acc_handle, + &scsi_raid_io->DevHandle, acmd->device_id); + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + } + + /* + * Set sense buffer physical address/length in + * scsi_io_request. 
+ */ + ddi_put32(acc_handle, + &scsi_raid_io->SenseBufferLowAddress, + cmd->sense_phys_addr1); + ddi_put8(acc_handle, + &scsi_raid_io->SenseBufferLength, SENSE_LENGTH); + + /* Construct SGL */ + ddi_put8(acc_handle, &scsi_raid_io->SGLOffset0, + offsetof(MPI2_RAID_SCSI_IO_REQUEST, SGL) / 4); + + (void) mr_sas_tbolt_build_sgl(instance, acmd, cmd, + scsi_raid_io, &datalen); + + ddi_put32(acc_handle, + &scsi_raid_io->DataLength, datalen); + + + con_log(CL_ANN, (CE_CONT, + "tbolt_build_cmd CDB[0] =%x, TargetID =%x\n", + pkt->pkt_cdbp[0], acmd->device_id)); + con_log(CL_DLEVEL1, (CE_CONT, + "data length = %x\n", + scsi_raid_io->DataLength)); + con_log(CL_DLEVEL1, (CE_CONT, + "cdb length = %x\n", + acmd->cmd_cdblen)); + } + break; + } + + } + + return (cmd); +} + +/* + * mrsas_tbolt_tran_init_pkt - allocate & initialize a scsi_pkt structure + * @ap: + * @pkt: + * @bp: + * @cmdlen: + * @statuslen: + * @tgtlen: + * @flags: + * @callback: + * + * The tran_init_pkt() entry point allocates and initializes a scsi_pkt + * structure and DMA resources for a target driver request. The + * tran_init_pkt() entry point is called when the target driver calls the + * SCSA function scsi_init_pkt(). Each call of the tran_init_pkt() entry point + * is a request to perform one or more of three possible services: + * - allocation and initialization of a scsi_pkt structure + * - allocation of DMA resources for data transfer + * - reallocation of DMA resources for the next portion of the data transfer + */ +struct scsi_pkt * +mrsas_tbolt_tran_init_pkt(struct scsi_address *ap, + register struct scsi_pkt *pkt, + struct buf *bp, int cmdlen, int statuslen, int tgtlen, + int flags, int (*callback)(), caddr_t arg) +{ + struct scsa_cmd *acmd; + struct mrsas_instance *instance; + struct scsi_pkt *new_pkt; + + instance = ADDR2MR(ap); + + /* step #1 : pkt allocation */ + if (pkt == NULL) { + pkt = scsi_hba_pkt_alloc(instance->dip, ap, cmdlen, statuslen, + tgtlen, sizeof (struct scsa_cmd), callback, arg); + if (pkt == NULL) { + return (NULL); + } + + acmd = PKT2CMD(pkt); + + /* + * Initialize the new pkt - we redundantly initialize + * all the fields for illustrative purposes. 
+ */ + acmd->cmd_pkt = pkt; + acmd->cmd_flags = 0; + acmd->cmd_scblen = statuslen; + acmd->cmd_cdblen = cmdlen; + acmd->cmd_dmahandle = NULL; + acmd->cmd_ncookies = 0; + acmd->cmd_cookie = 0; + acmd->cmd_cookiecnt = 0; + acmd->cmd_nwin = 0; + + pkt->pkt_address = *ap; + pkt->pkt_comp = (void (*)())NULL; + pkt->pkt_flags = 0; + pkt->pkt_time = 0; + pkt->pkt_resid = 0; + pkt->pkt_state = 0; + pkt->pkt_statistics = 0; + pkt->pkt_reason = 0; + new_pkt = pkt; + } else { + acmd = PKT2CMD(pkt); + new_pkt = NULL; + } + + /* step #2 : dma allocation/move */ + if (bp && bp->b_bcount != 0) { + if (acmd->cmd_dmahandle == NULL) { + if (mrsas_dma_alloc(instance, pkt, bp, flags, + callback) == DDI_FAILURE) { + if (new_pkt) { + scsi_hba_pkt_free(ap, new_pkt); + } + return ((struct scsi_pkt *)NULL); + } + } else { + if (mrsas_dma_move(instance, pkt, bp) == DDI_FAILURE) { + return ((struct scsi_pkt *)NULL); + } + } + } + return (pkt); +} + + +uint32_t +tbolt_read_fw_status_reg(struct mrsas_instance *instance) +{ + return ((uint32_t)RD_OB_SCRATCH_PAD_0(instance)); +} + +void +tbolt_issue_cmd(struct mrsas_cmd *cmd, struct mrsas_instance *instance) +{ + MRSAS_REQUEST_DESCRIPTOR_UNION *req_desc = cmd->request_desc; + atomic_add_16(&instance->fw_outstanding, 1); + + struct scsi_pkt *pkt; + + con_log(CL_ANN1, + (CE_NOTE, "tbolt_issue_cmd: cmd->[SMID]=0x%X", cmd->SMID)); + + con_log(CL_DLEVEL1, (CE_CONT, + " [req desc Words] %" PRIx64 " \n", req_desc->Words)); + con_log(CL_DLEVEL1, (CE_CONT, + " [req desc low part] %x \n", + (uint_t)(req_desc->Words & 0xffffffffff))); + con_log(CL_DLEVEL1, (CE_CONT, + " [req desc high part] %x \n", (uint_t)(req_desc->Words >> 32))); + pkt = cmd->pkt; + + if (pkt) { + con_log(CL_ANN1, (CE_CONT, "%llx :TBOLT issue_cmd_ppc:" + "ISSUED CMD TO FW : called : cmd:" + ": %p instance : %p pkt : %p pkt_time : %x\n", + gethrtime(), (void *)cmd, (void *)instance, + (void *)pkt, cmd->drv_pkt_time)); + if (instance->adapterresetinprogress) { + cmd->drv_pkt_time = (uint16_t)debug_timeout_g; + con_log(CL_ANN, (CE_NOTE, + "TBOLT Reset the scsi_pkt timer")); + } else { + push_pending_mfi_pkt(instance, cmd); + } + + } else { + con_log(CL_ANN1, (CE_CONT, "%llx :TBOLT issue_cmd_ppc:" + "ISSUED CMD TO FW : called : cmd : %p, instance: %p" + "(NO PKT)\n", gethrtime(), (void *)cmd, (void *)instance)); + } + + /* Issue the command to the FW */ + mutex_enter(&instance->reg_write_mtx); + WR_IB_LOW_QPORT((uint32_t)(req_desc->Words), instance); + WR_IB_HIGH_QPORT((uint32_t)(req_desc->Words >> 32), instance); + mutex_exit(&instance->reg_write_mtx); +} + +/* + * issue_cmd_in_sync_mode + */ +int +tbolt_issue_cmd_in_sync_mode(struct mrsas_instance *instance, + struct mrsas_cmd *cmd) +{ + int i; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * MILLISEC; + MRSAS_REQUEST_DESCRIPTOR_UNION *req_desc = cmd->request_desc; + + struct mrsas_header *hdr; + hdr = (struct mrsas_header *)&cmd->frame->hdr; + + con_log(CL_ANN, + (CE_NOTE, "tbolt_issue_cmd_in_sync_mode: cmd->[SMID]=0x%X", + cmd->SMID)); + + + if (instance->adapterresetinprogress) { + cmd->drv_pkt_time = ddi_get16 + (cmd->frame_dma_obj.acc_handle, &hdr->timeout); + if (cmd->drv_pkt_time < debug_timeout_g) + cmd->drv_pkt_time = (uint16_t)debug_timeout_g; + con_log(CL_ANN, (CE_NOTE, "tbolt_issue_cmd_in_sync_mode:" + "RESET-IN-PROGRESS, issue cmd & return.\n")); + + mutex_enter(&instance->reg_write_mtx); + WR_IB_LOW_QPORT((uint32_t)(req_desc->Words), instance); + WR_IB_HIGH_QPORT((uint32_t)(req_desc->Words >> 32), instance); + mutex_exit(&instance->reg_write_mtx); + + 
return (DDI_SUCCESS); + } else { + con_log(CL_ANN1, (CE_NOTE, + "tbolt_issue_cmd_in_sync_mode: pushing the pkt\n")); + push_pending_mfi_pkt(instance, cmd); + } + + con_log(CL_DLEVEL2, (CE_NOTE, + "HighQport offset :%p", + (void *)((uintptr_t)(instance)->regmap + IB_HIGH_QPORT))); + con_log(CL_DLEVEL2, (CE_NOTE, + "LowQport offset :%p", + (void *)((uintptr_t)(instance)->regmap + IB_LOW_QPORT))); + + cmd->sync_cmd = MRSAS_TRUE; + cmd->cmd_status = ENODATA; + + + mutex_enter(&instance->reg_write_mtx); + WR_IB_LOW_QPORT((uint32_t)(req_desc->Words), instance); + WR_IB_HIGH_QPORT((uint32_t)(req_desc->Words >> 32), instance); + mutex_exit(&instance->reg_write_mtx); + + con_log(CL_ANN1, (CE_NOTE, + " req desc high part %x \n", (uint_t)(req_desc->Words >> 32))); + con_log(CL_ANN1, (CE_NOTE, " req desc low part %x \n", + (uint_t)(req_desc->Words & 0xffffffff))); + + mutex_enter(&instance->int_cmd_mtx); + for (i = 0; i < msecs && (cmd->cmd_status == ENODATA); i++) { + cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx); + } + mutex_exit(&instance->int_cmd_mtx); + + + if (i < (msecs -1)) { + return (DDI_SUCCESS); + } else { + return (DDI_FAILURE); + } +} + +/* + * issue_cmd_in_poll_mode + */ +int +tbolt_issue_cmd_in_poll_mode(struct mrsas_instance *instance, + struct mrsas_cmd *cmd) +{ + int i; + uint16_t flags; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * MILLISEC; + struct mrsas_header *frame_hdr; + + con_log(CL_ANN, + (CE_NOTE, "tbolt_issue_cmd_in_poll_mode: cmd->[SMID]=0x%X", + cmd->SMID)); + + MRSAS_REQUEST_DESCRIPTOR_UNION *req_desc = cmd->request_desc; + + frame_hdr = (struct mrsas_header *)&cmd->frame->hdr; + ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + flags = ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags); + flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; + ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags); + + con_log(CL_ANN1, (CE_NOTE, " req desc low part %x \n", + (uint_t)(req_desc->Words & 0xffffffff))); + con_log(CL_ANN1, (CE_NOTE, + " req desc high part %x \n", (uint_t)(req_desc->Words >> 32))); + + /* issue the frame using inbound queue port */ + mutex_enter(&instance->reg_write_mtx); + WR_IB_LOW_QPORT((uint32_t)(req_desc->Words), instance); + WR_IB_HIGH_QPORT((uint32_t)(req_desc->Words >> 32), instance); + mutex_exit(&instance->reg_write_mtx); + + for (i = 0; i < msecs && ( + ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE); i++) { + /* wait for cmd_status to change from 0xFF */ + drv_usecwait(MILLISEC); /* wait for 1000 usecs */ + } + + if (ddi_get8(cmd->frame_dma_obj.acc_handle, + &frame_hdr->cmd_status) == MFI_CMD_STATUS_POLL_MODE) { + con_log(CL_ANN1, (CE_NOTE, + " cmd failed %" PRIx64 " \n", (req_desc->Words))); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +void +tbolt_enable_intr(struct mrsas_instance *instance) +{ + /* TODO: For Thunderbolt/Invader also clear intr on enable */ + /* writel(~0, ®s->outbound_intr_status); */ + /* readl(®s->outbound_intr_status); */ + + WR_OB_INTR_MASK(~(MFI_FUSION_ENABLE_INTERRUPT_MASK), instance); + + /* dummy read to force PCI flush */ + (void) RD_OB_INTR_MASK(instance); + +} + +void +tbolt_disable_intr(struct mrsas_instance *instance) +{ + uint32_t mask = 0xFFFFFFFF; + + WR_OB_INTR_MASK(mask, instance); + + /* Dummy readl to force pci flush */ + + (void) RD_OB_INTR_MASK(instance); +} + + +int +tbolt_intr_ack(struct mrsas_instance *instance) +{ + uint32_t status; + + /* check if it is our interrupt */ + 
status = RD_OB_INTR_STATUS(instance); + con_log(CL_ANN1, (CE_NOTE, + "chkpnt: Entered tbolt_intr_ack status = %d \n", status)); + + if (!(status & MFI_FUSION_ENABLE_INTERRUPT_MASK)) { + return (DDI_INTR_UNCLAIMED); + } + + if (mrsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + if ((status & 1) || (status & MFI_FUSION_ENABLE_INTERRUPT_MASK)) { + /* clear the interrupt by writing back the same value */ + WR_OB_INTR_STATUS(status, instance); + /* dummy READ */ + (void) RD_OB_INTR_STATUS(instance); + } + return (DDI_INTR_CLAIMED); +} + +/* + * get_raid_msg_pkt : Get a command from the free pool + * After successful allocation, the caller of this routine + * must clear the frame buffer (memset to zero) before + * using the packet further. + * + * ***** Note ***** + * After clearing the frame buffer the context id of the + * frame buffer SHOULD be restored back. + */ + +struct mrsas_cmd * +get_raid_msg_pkt(struct mrsas_instance *instance) +{ + mlist_t *head = &instance->cmd_pool_list; + struct mrsas_cmd *cmd = NULL; + + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + + if (!mlist_empty(head)) { + cmd = mlist_entry(head->next, struct mrsas_cmd, list); + mlist_del_init(head->next); + } + if (cmd != NULL) { + cmd->pkt = NULL; + cmd->retry_count_for_ocr = 0; + cmd->drv_pkt_time = 0; + } + mutex_exit(&instance->cmd_pool_mtx); + + if (cmd != NULL) + bzero(cmd->scsi_io_request, + sizeof (Mpi2RaidSCSIIORequest_t)); + return (cmd); +} + +struct mrsas_cmd * +get_raid_msg_mfi_pkt(struct mrsas_instance *instance) +{ + mlist_t *head = &instance->cmd_app_pool_list; + struct mrsas_cmd *cmd = NULL; + + mutex_enter(&instance->cmd_app_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_app_pool_mtx)); + + if (!mlist_empty(head)) { + cmd = mlist_entry(head->next, struct mrsas_cmd, list); + mlist_del_init(head->next); + } + if (cmd != NULL) { + cmd->retry_count_for_ocr = 0; + cmd->drv_pkt_time = 0; + cmd->pkt = NULL; + cmd->request_desc = NULL; + + } + + mutex_exit(&instance->cmd_app_pool_mtx); + + if (cmd != NULL) { + bzero(cmd->scsi_io_request, + sizeof (Mpi2RaidSCSIIORequest_t)); + } + + return (cmd); +} + +/* + * return_raid_msg_pkt : Return a cmd to free command pool + */ +void +return_raid_msg_pkt(struct mrsas_instance *instance, struct mrsas_cmd *cmd) +{ + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + + mlist_add_tail(&cmd->list, &instance->cmd_pool_list); + + mutex_exit(&instance->cmd_pool_mtx); +} + +void +return_raid_msg_mfi_pkt(struct mrsas_instance *instance, struct mrsas_cmd *cmd) +{ + mutex_enter(&instance->cmd_app_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_app_pool_mtx)); + + mlist_add_tail(&cmd->list, &instance->cmd_app_pool_list); + + mutex_exit(&instance->cmd_app_pool_mtx); +} + + +void +mr_sas_tbolt_build_mfi_cmd(struct mrsas_instance *instance, + struct mrsas_cmd *cmd) +{ + Mpi2RaidSCSIIORequest_t *scsi_raid_io; + Mpi25IeeeSgeChain64_t *scsi_raid_io_sgl_ieee; + MRSAS_REQUEST_DESCRIPTOR_UNION *ReqDescUnion; + uint32_t index; + ddi_acc_handle_t acc_handle = + instance->mpi2_frame_pool_dma_obj.acc_handle; + + if (!instance->tbolt) { + con_log(CL_ANN, (CE_NOTE, "Not MFA enabled.\n")); + return; + } + + index = cmd->index; + + ReqDescUnion = mr_sas_get_request_descriptor(instance, index); + + if (!ReqDescUnion) { + con_log(CL_ANN1, (CE_NOTE, "[NULL REQDESC]")); + return; + } + + con_log(CL_ANN1, (CE_NOTE, 
"[SMID]%x", cmd->SMID)); + + ReqDescUnion->Words = 0; + + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + + ReqDescUnion->SCSIIO.SMID = cmd->SMID; + + cmd->request_desc = ReqDescUnion; + + /* get raid message frame pointer */ + scsi_raid_io = (Mpi2RaidSCSIIORequest_t *)cmd->scsi_io_request; + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + Mpi25IeeeSgeChain64_t *sgl_ptr_end = (Mpi25IeeeSgeChain64_t *) + &scsi_raid_io->SGL.IeeeChain; + sgl_ptr_end += instance->max_sge_in_main_msg - 1; + ddi_put8(acc_handle, &sgl_ptr_end->Flags, 0); + } + + ddi_put8(acc_handle, &scsi_raid_io->Function, + MPI2_FUNCTION_PASSTHRU_IO_REQUEST); + + ddi_put8(acc_handle, &scsi_raid_io->SGLOffset0, + offsetof(MPI2_RAID_SCSI_IO_REQUEST, SGL) / 4); + + ddi_put8(acc_handle, &scsi_raid_io->ChainOffset, + (U8)offsetof(MPI2_RAID_SCSI_IO_REQUEST, SGL) / 16); + + ddi_put32(acc_handle, &scsi_raid_io->SenseBufferLowAddress, + cmd->sense_phys_addr1); + + + scsi_raid_io_sgl_ieee = + (Mpi25IeeeSgeChain64_t *)&scsi_raid_io->SGL.IeeeChain; + + ddi_put64(acc_handle, &scsi_raid_io_sgl_ieee->Address, + (U64)cmd->frame_phys_addr); + + ddi_put8(acc_handle, + &scsi_raid_io_sgl_ieee->Flags, (IEEE_SGE_FLAGS_CHAIN_ELEMENT | + MPI2_IEEE_SGE_FLAGS_IOCPLBNTA_ADDR)); + /* LSI put hardcoded 1024 instead of MEGASAS_MAX_SZ_CHAIN_FRAME. */ + ddi_put32(acc_handle, &scsi_raid_io_sgl_ieee->Length, 1024); + + con_log(CL_ANN1, (CE_NOTE, + "[MFI CMD PHY ADDRESS]:%" PRIx64, + scsi_raid_io_sgl_ieee->Address)); + con_log(CL_ANN1, (CE_NOTE, + "[SGL Length]:%x", scsi_raid_io_sgl_ieee->Length)); + con_log(CL_ANN1, (CE_NOTE, "[SGL Flags]:%x", + scsi_raid_io_sgl_ieee->Flags)); +} + + +void +tbolt_complete_cmd(struct mrsas_instance *instance, + struct mrsas_cmd *cmd) +{ + uint8_t status; + uint8_t extStatus; + uint8_t arm; + struct scsa_cmd *acmd; + struct scsi_pkt *pkt; + struct scsi_arq_status *arqstat; + Mpi2RaidSCSIIORequest_t *scsi_raid_io; + LD_LOAD_BALANCE_INFO *lbinfo; + ddi_acc_handle_t acc_handle = + instance->mpi2_frame_pool_dma_obj.acc_handle; + + scsi_raid_io = (Mpi2RaidSCSIIORequest_t *)cmd->scsi_io_request; + + status = ddi_get8(acc_handle, &scsi_raid_io->RaidContext.status); + extStatus = ddi_get8(acc_handle, &scsi_raid_io->RaidContext.extStatus); + + con_log(CL_DLEVEL3, (CE_NOTE, "status %x", status)); + con_log(CL_DLEVEL3, (CE_NOTE, "extStatus %x", extStatus)); + + if (status != MFI_STAT_OK) { + con_log(CL_ANN, (CE_WARN, + "IO Cmd Failed SMID %x", cmd->SMID)); + } else { + con_log(CL_ANN, (CE_NOTE, + "IO Cmd Success SMID %x", cmd->SMID)); + } + + /* regular commands */ + + switch (ddi_get8(acc_handle, &scsi_raid_io->Function)) { + + case MPI2_FUNCTION_SCSI_IO_REQUEST : /* Fast Path IO. */ + acmd = (struct scsa_cmd *)cmd->cmd; + lbinfo = &instance->load_balance_info[acmd->device_id]; + + if (cmd->load_balance_flag & MEGASAS_LOAD_BALANCE_FLAG) { + arm = lbinfo->raid1DevHandle[0] == + scsi_raid_io->DevHandle ? 0 : 1; + + lbinfo->scsi_pending_cmds[arm]--; + cmd->load_balance_flag &= ~MEGASAS_LOAD_BALANCE_FLAG; + } + con_log(CL_DLEVEL3, (CE_NOTE, + "FastPath IO Completion Success ")); + /* FALLTHRU */ + + case MPI2_FUNCTION_LD_IO_REQUEST : { /* Regular Path IO. 
*/ + acmd = (struct scsa_cmd *)cmd->cmd; + pkt = (struct scsi_pkt *)CMD2PKT(acmd); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state = STATE_GOT_BUS | STATE_GOT_TARGET | + STATE_SENT_CMD | STATE_XFERRED_DATA | STATE_GOT_STATUS; + + con_log(CL_ANN, (CE_CONT, " CDB[0] = %x completed for %s: " + "size %lx SMID %x cmd_status %x", pkt->pkt_cdbp[0], + ((acmd->islogical) ? "LD" : "PD"), + acmd->cmd_dmacount, cmd->SMID, status)); + + if (pkt->pkt_cdbp[0] == SCMD_INQUIRY) { + struct scsi_inquiry *inq; + + if (acmd->cmd_dmacount != 0) { + bp_mapin(acmd->cmd_buf); + inq = (struct scsi_inquiry *) + acmd->cmd_buf->b_un.b_addr; + + /* don't expose physical drives to OS */ + if (acmd->islogical && + (status == MFI_STAT_OK)) { + display_scsi_inquiry((caddr_t)inq); +#ifdef PDSUPPORT + } else if ((status == MFI_STAT_OK) && + inq->inq_dtype == DTYPE_DIRECT) { + display_scsi_inquiry((caddr_t)inq); +#endif + } else { + /* for physical disk */ + status = MFI_STAT_DEVICE_NOT_FOUND; + } + } + } + + switch (status) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_CC_IN_PROGRESS: + case MFI_STAT_LD_RECON_IN_PROGRESS: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_INIT_IN_PROGRESS: + pkt->pkt_reason = CMD_TRAN_ERR; + break; + case MFI_STAT_SCSI_IO_FAILED: + cmn_err(CE_WARN, "tbolt_complete_cmd: scsi_io failed"); + pkt->pkt_reason = CMD_TRAN_ERR; + break; + case MFI_STAT_SCSI_DONE_WITH_ERROR: + con_log(CL_ANN, (CE_WARN, + "tbolt_complete_cmd: scsi_done with error")); + + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *)pkt->pkt_scbp)->sts_chk = 1; + + if (pkt->pkt_cdbp[0] == SCMD_TEST_UNIT_READY) { + con_log(CL_ANN, + (CE_WARN, "TEST_UNIT_READY fail")); + } else { + pkt->pkt_state |= STATE_ARQ_DONE; + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= + STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + con_log(CL_ANN1, + (CE_NOTE, "Copying Sense data %x", + cmd->SMID)); + + ddi_rep_get8(acc_handle, + (uint8_t *)&(arqstat->sts_sensedata), + cmd->sense1, + sizeof (struct scsi_extended_sense), + DDI_DEV_AUTOINCR); + + } + break; + case MFI_STAT_LD_OFFLINE: + cmn_err(CE_WARN, + "tbolt_complete_cmd: ld offline " + "CDB[0]=0x%x targetId=0x%x devhandle=0x%x\n", + /* UNDO: */ + ddi_get8(acc_handle, &scsi_raid_io->CDB.CDB32[0]), + + ddi_get16(acc_handle, + &scsi_raid_io->RaidContext.ldTargetId), + + ddi_get16(acc_handle, &scsi_raid_io->DevHandle)); + + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + case MFI_STAT_DEVICE_NOT_FOUND: + con_log(CL_ANN, (CE_CONT, + "tbolt_complete_cmd: device not found error")); + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + + case MFI_STAT_LD_LBA_OUT_OF_RANGE: + pkt->pkt_state |= STATE_ARQ_DONE; + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *)pkt->pkt_scbp)->sts_chk = 1; + + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = STATUS_GOOD; + + 
arqstat->sts_sensedata.es_valid = 1; + arqstat->sts_sensedata.es_key = KEY_ILLEGAL_REQUEST; + arqstat->sts_sensedata.es_class = CLASS_EXTENDED_SENSE; + + /* + * LOGICAL BLOCK ADDRESS OUT OF RANGE: + * ASC: 0x21h; ASCQ: 0x00h; + */ + arqstat->sts_sensedata.es_add_code = 0x21; + arqstat->sts_sensedata.es_qual_code = 0x00; + break; + case MFI_STAT_INVALID_CMD: + case MFI_STAT_INVALID_DCMD: + case MFI_STAT_INVALID_PARAMETER: + case MFI_STAT_INVALID_SEQUENCE_NUMBER: + default: + cmn_err(CE_WARN, "tbolt_complete_cmd: Unknown status!"); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + } + + atomic_add_16(&instance->fw_outstanding, (-1)); + + (void) mrsas_common_check(instance, cmd); + if (acmd->cmd_dmahandle) { + if (mrsas_check_dma_handle(acmd->cmd_dmahandle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, + DDI_SERVICE_UNAFFECTED); + pkt->pkt_reason = CMD_TRAN_ERR; + pkt->pkt_statistics = 0; + } + } + + /* Call the callback routine */ + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) + (*pkt->pkt_comp)(pkt); + + con_log(CL_ANN1, (CE_NOTE, "Free smid %x", cmd->SMID)); + + ddi_put8(acc_handle, &scsi_raid_io->RaidContext.status, 0); + + ddi_put8(acc_handle, &scsi_raid_io->RaidContext.extStatus, 0); + + return_raid_msg_pkt(instance, cmd); + break; + } + case MPI2_FUNCTION_PASSTHRU_IO_REQUEST: /* MFA command. */ + + if (cmd->frame->dcmd.opcode == MR_DCMD_LD_MAP_GET_INFO && + cmd->frame->dcmd.mbox.b[1] == 1) { + + mutex_enter(&instance->sync_map_mtx); + + con_log(CL_ANN, (CE_NOTE, + "LDMAP sync command SMID RECEIVED 0x%X", + cmd->SMID)); + if (cmd->frame->hdr.cmd_status != 0) { + cmn_err(CE_WARN, + "map sync failed, status = 0x%x.\n", + cmd->frame->hdr.cmd_status); + } else { + instance->map_id++; + cmn_err(CE_NOTE, + "map sync received, switched map_id to %" + PRIu64 " \n", instance->map_id); + } + + if (MR_ValidateMapInfo(instance->ld_map[ + (instance->map_id & 1)], + instance->load_balance_info)) { + instance->fast_path_io = 1; + } else { + instance->fast_path_io = 0; + } + + con_log(CL_ANN, (CE_NOTE, + "instance->fast_path_io %d \n", + instance->fast_path_io)); + + instance->unroll.syncCmd = 0; + + if (instance->map_update_cmd == cmd) { + return_raid_msg_pkt(instance, cmd); + atomic_add_16(&instance->fw_outstanding, (-1)); + (void) mrsas_tbolt_sync_map_info(instance); + } + + cmn_err(CE_NOTE, "LDMAP sync completed.\n"); + mutex_exit(&instance->sync_map_mtx); + break; + } + + if (cmd->frame->dcmd.opcode == MR_DCMD_CTRL_EVENT_WAIT) { + con_log(CL_ANN1, (CE_CONT, + "AEN command SMID RECEIVED 0x%X", + cmd->SMID)); + if ((instance->aen_cmd == cmd) && + (instance->aen_cmd->abort_aen)) { + con_log(CL_ANN, (CE_WARN, "mrsas_softintr: " + "aborted_aen returned")); + } else { + atomic_add_16(&instance->fw_outstanding, (-1)); + service_mfi_aen(instance, cmd); + } + } + + if (cmd->sync_cmd == MRSAS_TRUE) { + con_log(CL_ANN1, (CE_CONT, + "Sync-mode Command Response SMID RECEIVED 0x%X", + cmd->SMID)); + + tbolt_complete_cmd_in_sync_mode(instance, cmd); + } else { + con_log(CL_ANN, (CE_CONT, + "tbolt_complete_cmd: Wrong SMID RECEIVED 0x%X", + cmd->SMID)); + } + break; + default: + mrsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + /* free message */ + con_log(CL_ANN, + (CE_NOTE, "tbolt_complete_cmd: Unknown Type!!!!!!!!")); + break; + } +} + +uint_t +mr_sas_tbolt_process_outstanding_cmd(struct mrsas_instance *instance) +{ + uint8_t replyType; + Mpi2SCSIIOSuccessReplyDescriptor_t *replyDesc; + Mpi2ReplyDescriptorsUnion_t *desc; 
+ uint16_t smid; + union desc_value d_val; + struct mrsas_cmd *cmd; + + struct mrsas_header *hdr; + struct scsi_pkt *pkt; + + (void) ddi_dma_sync(instance->reply_desc_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORDEV); + + (void) ddi_dma_sync(instance->reply_desc_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + desc = instance->reply_frame_pool; + desc += instance->reply_read_index; + + replyDesc = (MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR *)desc; + replyType = replyDesc->ReplyFlags & + MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK; + + if (replyType == MPI2_RPY_DESCRIPT_FLAGS_UNUSED) + return (DDI_INTR_UNCLAIMED); + + if (mrsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + mrsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + con_log(CL_ANN1, + (CE_WARN, "mr_sas_tbolt_process_outstanding_cmd(): " + "FMA check, returning DDI_INTR_UNCLAIMED")); + return (DDI_INTR_CLAIMED); + } + + con_log(CL_ANN1, (CE_NOTE, "Reply Desc = %p Words = %" PRIx64 " \n", + (void *)desc, desc->Words)); + + d_val.word = desc->Words; + + + /* Read Reply descriptor */ + while ((d_val.u1.low != 0xffffffff) && + (d_val.u1.high != 0xffffffff)) { + + (void) ddi_dma_sync(instance->reply_desc_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + smid = replyDesc->SMID; + + if (!smid || smid > instance->max_fw_cmds + 1) { + con_log(CL_ANN1, (CE_NOTE, + "Reply Desc at Break = %p Words = %" PRIx64 " \n", + (void *)desc, desc->Words)); + break; + } + + cmd = instance->cmd_list[smid - 1]; + if (!cmd) { + con_log(CL_ANN1, (CE_NOTE, "mr_sas_tbolt_process_" + "outstanding_cmd: Invalid command " + " or Poll commad Received in completion path\n")); + } else { + mutex_enter(&instance->cmd_pend_mtx); + if (cmd->sync_cmd == MRSAS_TRUE) { + hdr = (struct mrsas_header *)&cmd->frame->hdr; + if (hdr) { + con_log(CL_ANN1, (CE_NOTE, "mr_sas_" + "tbolt_process_outstanding_cmd:" + " mlist_del_init(&cmd->list).\n")); + mlist_del_init(&cmd->list); + } + } else { + pkt = cmd->pkt; + if (pkt) { + con_log(CL_ANN1, (CE_NOTE, "mr_sas_" + "tbolt_process_outstanding_cmd:" + "mlist_del_init(&cmd->list).\n")); + mlist_del_init(&cmd->list); + } + } + + mutex_exit(&instance->cmd_pend_mtx); + + tbolt_complete_cmd(instance, cmd); + } + /* set it back to all 0xfffffffff. */ + desc->Words = (uint64_t)~0; + + instance->reply_read_index++; + + if (instance->reply_read_index >= (instance->reply_q_depth)) { + con_log(CL_ANN1, (CE_NOTE, "wrap around")); + instance->reply_read_index = 0; + } + + /* Get the next reply descriptor */ + if (!instance->reply_read_index) + desc = instance->reply_frame_pool; + else + desc++; + + replyDesc = (MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR *)desc; + + d_val.word = desc->Words; + + con_log(CL_ANN1, (CE_NOTE, + "Next Reply Desc = %p Words = %" PRIx64 "\n", + (void *)desc, desc->Words)); + + replyType = replyDesc->ReplyFlags & + MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK; + + if (replyType == MPI2_RPY_DESCRIPT_FLAGS_UNUSED) + break; + + } /* End of while loop. 
*/ + + /* update replyIndex to FW */ + WR_MPI2_REPLY_POST_INDEX(instance->reply_read_index, instance); + + + (void) ddi_dma_sync(instance->reply_desc_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORDEV); + + (void) ddi_dma_sync(instance->reply_desc_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + return (DDI_INTR_CLAIMED); +} + + + + +/* + * complete_cmd_in_sync_mode - Completes an internal command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + * The issue_cmd_in_sync_mode() function waits for a command to complete + * after it issues a command. This function wakes up that waiting routine by + * calling wake_up() on the wait queue. + */ +void +tbolt_complete_cmd_in_sync_mode(struct mrsas_instance *instance, + struct mrsas_cmd *cmd) +{ + + cmd->cmd_status = ddi_get8(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.cmd_status); + + cmd->sync_cmd = MRSAS_FALSE; + + mutex_enter(&instance->int_cmd_mtx); + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + cv_broadcast(&instance->int_cmd_cv); + mutex_exit(&instance->int_cmd_mtx); + +} + +/* + * mrsas_tbolt_get_ld_map_info - Returns ld_map structure + * instance: Adapter soft state + * + * Issues an internal command (DCMD) to get the FW's controller PD + * list structure. This information is mainly used to find out SYSTEM + * supported by the FW. + */ +int +mrsas_tbolt_get_ld_map_info(struct mrsas_instance *instance) +{ + int ret = 0; + struct mrsas_cmd *cmd = NULL; + struct mrsas_dcmd_frame *dcmd; + MR_FW_RAID_MAP_ALL *ci; + uint32_t ci_h = 0; + U32 size_map_info; + + cmd = get_raid_msg_pkt(instance); + + if (cmd == NULL) { + cmn_err(CE_WARN, + "Failed to get a cmd from free-pool in get_ld_map_info()"); + return (DDI_FAILURE); + } + + dcmd = &cmd->frame->dcmd; + + size_map_info = sizeof (MR_FW_RAID_MAP) + + (sizeof (MR_LD_SPAN_MAP) * + (MAX_LOGICAL_DRIVES - 1)); + + con_log(CL_ANN, (CE_NOTE, + "size_map_info : 0x%x", size_map_info)); + + ci = instance->ld_map[(instance->map_id & 1)]; + ci_h = instance->ld_map_phy[(instance->map_id & 1)]; + + if (!ci) { + cmn_err(CE_WARN, "Failed to alloc mem for ld_map_info"); + return_raid_msg_pkt(instance, cmd); + return (-1); + } + + (void) memset(ci, 0, sizeof (*ci)); + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + dcmd->cmd = MFI_CMD_OP_DCMD; + dcmd->cmd_status = 0xFF; + dcmd->sge_count = 1; + dcmd->flags = MFI_FRAME_DIR_READ; + dcmd->timeout = 0; + dcmd->pad_0 = 0; + dcmd->data_xfer_len = size_map_info; + dcmd->opcode = MR_DCMD_LD_MAP_GET_INFO; + dcmd->sgl.sge32[0].phys_addr = ci_h; + dcmd->sgl.sge32[0].length = size_map_info; + + + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + + if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + ret = 0; + con_log(CL_ANN1, (CE_NOTE, + "Get LD Map Info success\n")); + } else { + cmn_err(CE_WARN, + "Get LD Map Info failed\n"); + ret = -1; + } + + return_raid_msg_pkt(instance, cmd); + + return (ret); +} + +void +mrsas_dump_reply_desc(struct mrsas_instance *instance) +{ + uint32_t i; + MPI2_REPLY_DESCRIPTORS_UNION *reply_desc; + union desc_value d_val; + + reply_desc = instance->reply_frame_pool; + + for (i = 0; i < instance->reply_q_depth; i++, reply_desc++) { + d_val.word = reply_desc->Words; + con_log(CL_DLEVEL3, (CE_NOTE, + "i=%d, %x:%x", + i, d_val.u1.high, d_val.u1.low)); + } +} + +/* + * mrsas_tbolt_command_create - Create command for fast path. + * @io_info: MegaRAID IO request packet pointer. + * @ref_tag: Reference tag for RD/WRPROTECT + * + * Create the command for fast path. 
+ */ +void +mrsas_tbolt_prepare_cdb(struct mrsas_instance *instance, U8 cdb[], + struct IO_REQUEST_INFO *io_info, Mpi2RaidSCSIIORequest_t *scsi_io_request, + U32 ref_tag) +{ + uint16_t EEDPFlags; + uint32_t Control; + ddi_acc_handle_t acc_handle = + instance->mpi2_frame_pool_dma_obj.acc_handle; + + /* Prepare 32-byte CDB if DIF is supported on this device */ + con_log(CL_ANN, (CE_NOTE, "Prepare DIF CDB\n")); + + (void) memset(cdb, 0, 32); + + cdb[0] = MRSAS_SCSI_VARIABLE_LENGTH_CMD; + + + cdb[7] = MRSAS_SCSI_ADDL_CDB_LEN; + + if (io_info->isRead) + cdb[9] = MRSAS_SCSI_SERVICE_ACTION_READ32; + else + cdb[9] = MRSAS_SCSI_SERVICE_ACTION_WRITE32; + + /* Verify within linux driver, set to MEGASAS_RD_WR_PROTECT_CHECK_ALL */ + cdb[10] = MRSAS_RD_WR_PROTECT; + + /* LOGICAL BLOCK ADDRESS */ + cdb[12] = (U8)(((io_info->pdBlock) >> 56) & 0xff); + cdb[13] = (U8)(((io_info->pdBlock) >> 48) & 0xff); + cdb[14] = (U8)(((io_info->pdBlock) >> 40) & 0xff); + cdb[15] = (U8)(((io_info->pdBlock) >> 32) & 0xff); + cdb[16] = (U8)(((io_info->pdBlock) >> 24) & 0xff); + cdb[17] = (U8)(((io_info->pdBlock) >> 16) & 0xff); + cdb[18] = (U8)(((io_info->pdBlock) >> 8) & 0xff); + cdb[19] = (U8)((io_info->pdBlock) & 0xff); + + /* Logical block reference tag */ + ddi_put32(acc_handle, &scsi_io_request->CDB.EEDP32.PrimaryReferenceTag, + BIG_ENDIAN(ref_tag)); + + ddi_put16(acc_handle, + &scsi_io_request->CDB.EEDP32.PrimaryApplicationTagMask, 0xffff); + + ddi_put32(acc_handle, &scsi_io_request->DataLength, + ((io_info->numBlocks)*512)); + /* Specify 32-byte cdb */ + ddi_put16(acc_handle, &scsi_io_request->IoFlags, 32); + + /* Transfer length */ + cdb[28] = (U8)(((io_info->numBlocks) >> 24) & 0xff); + cdb[29] = (U8)(((io_info->numBlocks) >> 16) & 0xff); + cdb[30] = (U8)(((io_info->numBlocks) >> 8) & 0xff); + cdb[31] = (U8)((io_info->numBlocks) & 0xff); + + /* set SCSI IO EEDPFlags */ + EEDPFlags = ddi_get16(acc_handle, &scsi_io_request->EEDPFlags); + Control = ddi_get32(acc_handle, &scsi_io_request->Control); + + /* set SCSI IO EEDPFlags bits */ + if (io_info->isRead) { + /* + * For READ commands, the EEDPFlags shall be set to specify to + * Increment the Primary Reference Tag, to Check the Reference + * Tag, and to Check and Remove the Protection Information + * fields. + */ + EEDPFlags = MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG | + MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG | + MPI2_SCSIIO_EEDPFLAGS_CHECK_REMOVE_OP | + MPI2_SCSIIO_EEDPFLAGS_CHECK_APPTAG | + MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD; + } else { + /* + * For WRITE commands, the EEDPFlags shall be set to specify to + * Increment the Primary Reference Tag, and to Insert + * Protection Information fields. 
+ */ + EEDPFlags = MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG | + MPI2_SCSIIO_EEDPFLAGS_INSERT_OP; + } + Control |= (0x4 << 26); + + ddi_put16(acc_handle, &scsi_io_request->EEDPFlags, EEDPFlags); + ddi_put32(acc_handle, &scsi_io_request->Control, Control); + ddi_put32(acc_handle, + &scsi_io_request->EEDPBlockSize, MRSAS_EEDPBLOCKSIZE); +} + + +/* + * mrsas_tbolt_set_pd_lba - Sets PD LBA + * @cdb: CDB + * @cdb_len: cdb length + * @start_blk: Start block of IO + * + * Used to set the PD LBA in CDB for FP IOs + */ +static void +mrsas_tbolt_set_pd_lba(U8 cdb[], uint8_t *cdb_len_ptr, U64 start_blk, + U32 num_blocks) +{ + U8 cdb_len = *cdb_len_ptr; + U8 flagvals = 0, opcode = 0, groupnum = 0, control = 0; + + /* Some drives don't support 16/12 byte CDB's, convert to 10 */ + if (((cdb_len == 12) || (cdb_len == 16)) && + (start_blk <= 0xffffffff)) { + if (cdb_len == 16) { + con_log(CL_ANN, + (CE_NOTE, "Converting READ/WRITE(16) to READ10\n")); + opcode = cdb[0] == READ_16 ? READ_10 : WRITE_10; + flagvals = cdb[1]; + groupnum = cdb[14]; + control = cdb[15]; + } else { + con_log(CL_ANN, + (CE_NOTE, "Converting READ/WRITE(12) to READ10\n")); + opcode = cdb[0] == READ_12 ? READ_10 : WRITE_10; + flagvals = cdb[1]; + groupnum = cdb[10]; + control = cdb[11]; + } + + (void) memset(cdb, 0, sizeof (cdb)); + + cdb[0] = opcode; + cdb[1] = flagvals; + cdb[6] = groupnum; + cdb[9] = control; + /* Set transfer length */ + cdb[8] = (U8)(num_blocks & 0xff); + cdb[7] = (U8)((num_blocks >> 8) & 0xff); + cdb_len = 10; + } else if ((cdb_len < 16) && (start_blk > 0xffffffff)) { + /* Convert to 16 byte CDB for large LBA's */ + con_log(CL_ANN, + (CE_NOTE, "Converting 6/10/12 CDB to 16 byte CDB\n")); + switch (cdb_len) { + case 6: + opcode = cdb[0] == READ_6 ? READ_16 : WRITE_16; + control = cdb[5]; + break; + case 10: + opcode = cdb[0] == READ_10 ? READ_16 : WRITE_16; + flagvals = cdb[1]; + groupnum = cdb[6]; + control = cdb[9]; + break; + case 12: + opcode = cdb[0] == READ_12 ? READ_16 : WRITE_16; + flagvals = cdb[1]; + groupnum = cdb[10]; + control = cdb[11]; + break; + } + + (void) memset(cdb, 0, sizeof (cdb)); + + cdb[0] = opcode; + cdb[1] = flagvals; + cdb[14] = groupnum; + cdb[15] = control; + + /* Transfer length */ + cdb[13] = (U8)(num_blocks & 0xff); + cdb[12] = (U8)((num_blocks >> 8) & 0xff); + cdb[11] = (U8)((num_blocks >> 16) & 0xff); + cdb[10] = (U8)((num_blocks >> 24) & 0xff); + + /* Specify 16-byte cdb */ + cdb_len = 16; + } else if ((cdb_len == 6) && (start_blk > 0x1fffff)) { + /* convert to 10 byte CDB */ + opcode = cdb[0] == READ_6 ? 
READ_10 : WRITE_10; + control = cdb[5]; + + (void) memset(cdb, 0, sizeof (cdb)); + cdb[0] = opcode; + cdb[9] = control; + + /* Set transfer length */ + cdb[8] = (U8)(num_blocks & 0xff); + cdb[7] = (U8)((num_blocks >> 8) & 0xff); + + /* Specify 10-byte cdb */ + cdb_len = 10; + } + + + /* Fall through Normal case, just load LBA here */ + switch (cdb_len) { + case 6: + { + U8 val = cdb[1] & 0xE0; + cdb[3] = (U8)(start_blk & 0xff); + cdb[2] = (U8)((start_blk >> 8) & 0xff); + cdb[1] = val | ((U8)(start_blk >> 16) & 0x1f); + break; + } + case 10: + cdb[5] = (U8)(start_blk & 0xff); + cdb[4] = (U8)((start_blk >> 8) & 0xff); + cdb[3] = (U8)((start_blk >> 16) & 0xff); + cdb[2] = (U8)((start_blk >> 24) & 0xff); + break; + case 12: + cdb[5] = (U8)(start_blk & 0xff); + cdb[4] = (U8)((start_blk >> 8) & 0xff); + cdb[3] = (U8)((start_blk >> 16) & 0xff); + cdb[2] = (U8)((start_blk >> 24) & 0xff); + break; + + case 16: + cdb[9] = (U8)(start_blk & 0xff); + cdb[8] = (U8)((start_blk >> 8) & 0xff); + cdb[7] = (U8)((start_blk >> 16) & 0xff); + cdb[6] = (U8)((start_blk >> 24) & 0xff); + cdb[5] = (U8)((start_blk >> 32) & 0xff); + cdb[4] = (U8)((start_blk >> 40) & 0xff); + cdb[3] = (U8)((start_blk >> 48) & 0xff); + cdb[2] = (U8)((start_blk >> 56) & 0xff); + break; + } + + *cdb_len_ptr = cdb_len; +} + + +static int +mrsas_tbolt_check_map_info(struct mrsas_instance *instance) +{ + MR_FW_RAID_MAP_ALL *ld_map; + + if (!mrsas_tbolt_get_ld_map_info(instance)) { + + ld_map = instance->ld_map[(instance->map_id & 1)]; + + con_log(CL_ANN1, (CE_NOTE, "ldCount=%d, map size=%d\n", + ld_map->raidMap.ldCount, ld_map->raidMap.totalSize)); + + if (MR_ValidateMapInfo(instance->ld_map[ + (instance->map_id & 1)], instance->load_balance_info)) { + con_log(CL_ANN, + (CE_CONT, "MR_ValidateMapInfo success")); + + instance->fast_path_io = 1; + con_log(CL_ANN, + (CE_NOTE, "instance->fast_path_io %d \n", + instance->fast_path_io)); + + return (DDI_SUCCESS); + } + + } + + instance->fast_path_io = 0; + cmn_err(CE_WARN, "MR_ValidateMapInfo failed"); + con_log(CL_ANN, (CE_NOTE, + "instance->fast_path_io %d \n", instance->fast_path_io)); + + return (DDI_FAILURE); +} + +/* + * Marks HBA as bad. This will be called either when an + * IO packet times out even after 3 FW resets + * or FW is found to be fault even after 3 continuous resets. 
+ */ + +void +mrsas_tbolt_kill_adapter(struct mrsas_instance *instance) +{ + cmn_err(CE_WARN, "TBOLT Kill adapter called\n"); + + if (instance->deadadapter == 1) + return; + + con_log(CL_ANN1, (CE_NOTE, "tbolt_kill_adapter: " + "Writing to doorbell with MFI_STOP_ADP ")); + mutex_enter(&instance->ocr_flags_mtx); + instance->deadadapter = 1; + mutex_exit(&instance->ocr_flags_mtx); + instance->func_ptr->disable_intr(instance); + WR_RESERVED0_REGISTER(MFI_STOP_ADP, instance); + /* Flush */ + (void) RD_RESERVED0_REGISTER(instance); + + (void) mrsas_print_pending_cmds(instance); + (void) mrsas_complete_pending_cmds(instance); +} + +void +mrsas_reset_reply_desc(struct mrsas_instance *instance) +{ + int i; + MPI2_REPLY_DESCRIPTORS_UNION *reply_desc; + instance->reply_read_index = 0; + + /* initializing reply address to 0xFFFFFFFF */ + reply_desc = instance->reply_frame_pool; + + for (i = 0; i < instance->reply_q_depth; i++) { + reply_desc->Words = (uint64_t)~0; + reply_desc++; + } +} + +int +mrsas_tbolt_reset_ppc(struct mrsas_instance *instance) +{ + uint32_t status = 0x00; + uint32_t retry = 0; + uint32_t cur_abs_reg_val; + uint32_t fw_state; + uint32_t abs_state; + uint32_t i; + + con_log(CL_ANN, (CE_NOTE, + "mrsas_tbolt_reset_ppc entered\n ")); + + if (instance->deadadapter == 1) { + cmn_err(CE_WARN, "mrsas_tbolt_reset_ppc: " + "no more resets as HBA has been marked dead "); + return (DDI_FAILURE); + } + + mutex_enter(&instance->ocr_flags_mtx); + instance->adapterresetinprogress = 1; + con_log(CL_ANN, (CE_NOTE, "mrsas_tbolt_reset_ppc:" + "adpterresetinprogress flag set, time %llx", gethrtime())); + mutex_exit(&instance->ocr_flags_mtx); + + instance->func_ptr->disable_intr(instance); + + /* Add delay inorder to complete the ioctl & io cmds in-flight */ + for (i = 0; i < 3000; i++) { + drv_usecwait(MILLISEC); /* wait for 1000 usecs */ + } + + instance->reply_read_index = 0; + +retry_reset: + con_log(CL_ANN, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + ":Resetting TBOLT ")); + + WR_TBOLT_IB_WRITE_SEQ(0xF, instance); + WR_TBOLT_IB_WRITE_SEQ(4, instance); + WR_TBOLT_IB_WRITE_SEQ(0xb, instance); + WR_TBOLT_IB_WRITE_SEQ(2, instance); + WR_TBOLT_IB_WRITE_SEQ(7, instance); + WR_TBOLT_IB_WRITE_SEQ(0xd, instance); + con_log(CL_ANN1, (CE_NOTE, + "mrsas_tbolt_reset_ppc: magic number written " + "to write sequence register\n")); + delay(100 * drv_usectohz(MILLISEC)); + status = RD_TBOLT_HOST_DIAG(instance); + con_log(CL_ANN1, (CE_NOTE, + "mrsas_tbolt_reset_ppc: READ HOSTDIAG SUCCESS " + "to write sequence register\n")); + + while (status & DIAG_TBOLT_RESET_ADAPTER) { + delay(100 * drv_usectohz(MILLISEC)); + status = RD_TBOLT_HOST_DIAG(instance); + if (retry++ == 100) { + cmn_err(CE_WARN, + "mrsas_tbolt_reset_ppc:" + "resetadapter bit is set already " + "check retry count %d\n", retry); + return (DDI_FAILURE); + } + } + + WR_TBOLT_HOST_DIAG(status | DIAG_TBOLT_RESET_ADAPTER, instance); + delay(100 * drv_usectohz(MILLISEC)); + + ddi_rep_get8((instance)->regmap_handle, (uint8_t *)&status, + (uint8_t *)((uintptr_t)(instance)->regmap + + RESET_TBOLT_STATUS_OFF), 4, DDI_DEV_AUTOINCR); + + while ((status & DIAG_TBOLT_RESET_ADAPTER)) { + delay(100 * drv_usectohz(MILLISEC)); + ddi_rep_get8((instance)->regmap_handle, (uint8_t *)&status, + (uint8_t *)((uintptr_t)(instance)->regmap + + RESET_TBOLT_STATUS_OFF), 4, DDI_DEV_AUTOINCR); + if (retry++ == 100) { + /* Dont call kill adapter here */ + /* RESET BIT ADAPTER is cleared by firmare */ + /* mrsas_tbolt_kill_adapter(instance); */ + cmn_err(CE_WARN, + "mr_sas %d: %s(): RESET 
FAILED; return failure!!!", + instance->instance, __func__); + return (DDI_FAILURE); + } + } + + con_log(CL_ANN, + (CE_NOTE, "mrsas_tbolt_reset_ppc: Adapter reset complete")); + con_log(CL_ANN, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "Calling mfi_state_transition_to_ready")); + + abs_state = instance->func_ptr->read_fw_status_reg(instance); + retry = 0; + while ((abs_state <= MFI_STATE_FW_INIT) && (retry++ < 1000)) { + delay(100 * drv_usectohz(MILLISEC)); + abs_state = instance->func_ptr->read_fw_status_reg(instance); + } + if (abs_state <= MFI_STATE_FW_INIT) { + cmn_err(CE_WARN, + "mrsas_tbolt_reset_ppc: firmware state < MFI_STATE_FW_INIT" + "state = 0x%x, RETRY RESET.\n", abs_state); + goto retry_reset; + } + + /* Mark HBA as bad, if FW is fault after 3 continuous resets */ + if (mfi_state_transition_to_ready(instance) || + debug_tbolt_fw_faults_after_ocr_g == 1) { + cur_abs_reg_val = + instance->func_ptr->read_fw_status_reg(instance); + fw_state = cur_abs_reg_val & MFI_STATE_MASK; + + con_log(CL_ANN1, (CE_NOTE, + "mrsas_tbolt_reset_ppc :before fake: FW is not ready " + "FW state = 0x%x", fw_state)); + if (debug_tbolt_fw_faults_after_ocr_g == 1) + fw_state = MFI_STATE_FAULT; + + con_log(CL_ANN, + (CE_NOTE, "mrsas_tbolt_reset_ppc : FW is not ready " + "FW state = 0x%x", fw_state)); + + if (fw_state == MFI_STATE_FAULT) { + /* increment the count */ + instance->fw_fault_count_after_ocr++; + if (instance->fw_fault_count_after_ocr + < MAX_FW_RESET_COUNT) { + cmn_err(CE_WARN, "mrsas_tbolt_reset_ppc: " + "FW is in fault after OCR count %d " + "Retry Reset", + instance->fw_fault_count_after_ocr); + goto retry_reset; + + } else { + cmn_err(CE_WARN, "mrsas %d: %s:" + "Max Reset Count exceeded >%d" + "Mark HBA as bad, KILL adapter", + instance->instance, __func__, + MAX_FW_RESET_COUNT); + + mrsas_tbolt_kill_adapter(instance); + return (DDI_FAILURE); + } + } + } + + /* reset the counter as FW is up after OCR */ + instance->fw_fault_count_after_ocr = 0; + + mrsas_reset_reply_desc(instance); + + + con_log(CL_ANN1, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "Calling mrsas_issue_init_mpi2")); + abs_state = mrsas_issue_init_mpi2(instance); + if (abs_state == (uint32_t)DDI_FAILURE) { + cmn_err(CE_WARN, "mrsas_tbolt_reset_ppc: " + "INIT failed Retrying Reset"); + goto retry_reset; + } + con_log(CL_ANN1, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "mrsas_issue_init_mpi2 Done")); + + con_log(CL_ANN, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "Calling mrsas_print_pending_cmd\n")); + (void) mrsas_print_pending_cmds(instance); + con_log(CL_ANN, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "mrsas_print_pending_cmd done\n")); + + instance->func_ptr->enable_intr(instance); + instance->fw_outstanding = 0; + + con_log(CL_ANN1, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "Calling mrsas_issue_pending_cmds")); + (void) mrsas_issue_pending_cmds(instance); + con_log(CL_ANN1, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "issue_pending_cmds done.\n")); + + con_log(CL_ANN1, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "Calling aen registration")); + + instance->aen_cmd->retry_count_for_ocr = 0; + instance->aen_cmd->drv_pkt_time = 0; + + instance->func_ptr->issue_cmd(instance->aen_cmd, instance); + + con_log(CL_ANN1, (CE_NOTE, "Unsetting adpresetinprogress flag.\n")); + mutex_enter(&instance->ocr_flags_mtx); + instance->adapterresetinprogress = 0; + mutex_exit(&instance->ocr_flags_mtx); + con_log(CL_ANN1, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "adpterresetinprogress flag unset")); + + con_log(CL_ANN, (CE_NOTE, "mrsas_tbolt_reset_ppc done\n")); + return (DDI_SUCCESS); + 
+} + + +/* + * mrsas_sync_map_info - Returns FW's ld_map structure + * @instance: Adapter soft state + * + * Issues an internal command (DCMD) to get the FW's controller PD + * list structure. This information is mainly used to find out SYSTEM + * supported by the FW. + */ + +static int +mrsas_tbolt_sync_map_info(struct mrsas_instance *instance) +{ + int ret = 0, i; + struct mrsas_cmd *cmd = NULL; + struct mrsas_dcmd_frame *dcmd; + uint32_t size_sync_info, num_lds; + LD_TARGET_SYNC *ci = NULL; + MR_FW_RAID_MAP_ALL *map; + MR_LD_RAID *raid; + LD_TARGET_SYNC *ld_sync; + uint32_t ci_h = 0; + uint32_t size_map_info; + + cmd = get_raid_msg_pkt(instance); + + if (cmd == NULL) { + cmn_err(CE_WARN, "Failed to get a cmd from free-pool in " + "mrsas_tbolt_sync_map_info(). "); + return (DDI_FAILURE); + } + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + bzero(cmd->scsi_io_request, sizeof (Mpi2RaidSCSIIORequest_t)); + + + map = instance->ld_map[instance->map_id & 1]; + + num_lds = map->raidMap.ldCount; + + dcmd = &cmd->frame->dcmd; + + size_sync_info = sizeof (LD_TARGET_SYNC) * num_lds; + + con_log(CL_ANN, (CE_NOTE, "size_sync_info =0x%x ; ld count = 0x%x \n ", + size_sync_info, num_lds)); + + ci = (LD_TARGET_SYNC *)instance->ld_map[(instance->map_id - 1) & 1]; + + (void) memset(ci, 0, sizeof (MR_FW_RAID_MAP_ALL)); + ci_h = instance->ld_map_phy[(instance->map_id - 1) & 1]; + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ld_sync = (LD_TARGET_SYNC *)ci; + + for (i = 0; i < num_lds; i++, ld_sync++) { + raid = MR_LdRaidGet(i, map); + + con_log(CL_ANN1, + (CE_NOTE, "i : 0x%x, Seq Num : 0x%x, Sync Reqd : 0x%x\n", + i, raid->seqNum, raid->flags.ldSyncRequired)); + + ld_sync->ldTargetId = MR_GetLDTgtId(i, map); + + con_log(CL_ANN1, (CE_NOTE, "i : 0x%x, tgt : 0x%x \n", + i, ld_sync->ldTargetId)); + + ld_sync->seqNum = raid->seqNum; + } + + + size_map_info = sizeof (MR_FW_RAID_MAP) + + (sizeof (MR_LD_SPAN_MAP) * (MAX_LOGICAL_DRIVES - 1)); + + dcmd->cmd = MFI_CMD_OP_DCMD; + dcmd->cmd_status = 0xFF; + dcmd->sge_count = 1; + dcmd->flags = MFI_FRAME_DIR_WRITE; + dcmd->timeout = 0; + dcmd->pad_0 = 0; + dcmd->data_xfer_len = size_map_info; + ASSERT(num_lds <= 255); + dcmd->mbox.b[0] = (U8)num_lds; + dcmd->mbox.b[1] = 1; /* Pend */ + dcmd->opcode = MR_DCMD_LD_MAP_GET_INFO; + dcmd->sgl.sge32[0].phys_addr = ci_h; + dcmd->sgl.sge32[0].length = size_map_info; + + + instance->map_update_cmd = cmd; + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + + instance->func_ptr->issue_cmd(cmd, instance); + + instance->unroll.syncCmd = 1; + con_log(CL_ANN1, (CE_NOTE, "sync cmd issued. 
[SMID]:%x", cmd->SMID)); + + return (ret); +} + +/* + * abort_syncmap_cmd + */ +int +abort_syncmap_cmd(struct mrsas_instance *instance, + struct mrsas_cmd *cmd_to_abort) +{ + int ret = 0; + + struct mrsas_cmd *cmd; + struct mrsas_abort_frame *abort_fr; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt: abort_ldsync:%d", __LINE__)); + + cmd = get_raid_msg_mfi_pkt(instance); + + if (!cmd) { + cmn_err(CE_WARN, + "Failed to get a cmd from free-pool abort_syncmap_cmd()."); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + abort_fr = &cmd->frame->abort; + + /* prepare and issue the abort frame */ + ddi_put8(cmd->frame_dma_obj.acc_handle, + &abort_fr->cmd, MFI_CMD_OP_ABORT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &abort_fr->cmd_status, + MFI_CMD_STATUS_SYNC_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &abort_fr->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &abort_fr->abort_context, + cmd_to_abort->index); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_lo, cmd_to_abort->frame_phys_addr); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_hi, 0); + + cmd->frame_count = 1; + + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN1, (CE_WARN, + "abort_ldsync_cmd: issue_cmd_in_poll_mode failed")); + ret = -1; + } else { + ret = 0; + } + + return_raid_msg_mfi_pkt(instance, cmd); + + atomic_add_16(&instance->fw_outstanding, (-1)); + + return (ret); +} + + +#ifdef PDSUPPORT +int +mrsas_tbolt_config_pd(struct mrsas_instance *instance, uint16_t tgt, + uint8_t lun, dev_info_t **ldip) +{ + struct scsi_device *sd; + dev_info_t *child; + int rval, dtype; + struct mrsas_tbolt_pd_info *pds = NULL; + + con_log(CL_ANN1, (CE_NOTE, "mrsas_tbolt_config_pd: t = %d l = %d", + tgt, lun)); + + if ((child = mrsas_find_child(instance, tgt, lun)) != NULL) { + if (ldip) { + *ldip = child; + } + if (instance->mr_tbolt_pd_list[tgt].flag != MRDRV_TGT_VALID) { + rval = mrsas_service_evt(instance, tgt, 1, + MRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, + "mr_sas:DELETING STALE ENTRY rval = %d " + "tgt id = %d ", rval, tgt)); + return (NDI_FAILURE); + } + return (NDI_SUCCESS); + } + + pds = (struct mrsas_tbolt_pd_info *) + kmem_zalloc(sizeof (struct mrsas_tbolt_pd_info), KM_SLEEP); + mrsas_tbolt_get_pd_info(instance, pds, tgt); + dtype = pds->scsiDevType; + + /* Check for Disk */ + if ((dtype == DTYPE_DIRECT)) { + if ((dtype == DTYPE_DIRECT) && + (LE_16(pds->fwState) != PD_SYSTEM)) { + kmem_free(pds, sizeof (struct mrsas_tbolt_pd_info)); + return (NDI_FAILURE); + } + sd = kmem_zalloc(sizeof (struct scsi_device), KM_SLEEP); + sd->sd_address.a_hba_tran = instance->tran; + sd->sd_address.a_target = (uint16_t)tgt; + sd->sd_address.a_lun = (uint8_t)lun; + + if (scsi_hba_probe(sd, NULL) == SCSIPROBE_EXISTS) { + rval = mrsas_config_scsi_device(instance, sd, ldip); + con_log(CL_DLEVEL1, (CE_NOTE, + "Phys. device found: tgt %d dtype %d: %s", + tgt, dtype, sd->sd_inq->inq_vid)); + } else { + rval = NDI_FAILURE; + con_log(CL_DLEVEL1, (CE_NOTE, "Phys. device Not found " + "scsi_hba_probe Failed: tgt %d dtype %d: %s", + tgt, dtype, sd->sd_inq->inq_vid)); + } + + /* sd_unprobe is blank now. 
Free buffer manually */ + if (sd->sd_inq) { + kmem_free(sd->sd_inq, SUN_INQSIZE); + sd->sd_inq = (struct scsi_inquiry *)NULL; + } + kmem_free(sd, sizeof (struct scsi_device)); + rval = NDI_SUCCESS; + } else { + con_log(CL_ANN1, (CE_NOTE, + "Device not supported: tgt %d lun %d dtype %d", + tgt, lun, dtype)); + rval = NDI_FAILURE; + } + + kmem_free(pds, sizeof (struct mrsas_tbolt_pd_info)); + con_log(CL_ANN1, (CE_NOTE, "mrsas_config_pd: return rval = %d", + rval)); + return (rval); +} + +static void +mrsas_tbolt_get_pd_info(struct mrsas_instance *instance, + struct mrsas_tbolt_pd_info *pds, int tgt) +{ + struct mrsas_cmd *cmd; + struct mrsas_dcmd_frame *dcmd; + dma_obj_t dcmd_dma_obj; + + cmd = get_raid_msg_pkt(instance); + + if (!cmd) { + con_log(CL_ANN1, + (CE_WARN, "Failed to get a cmd for get pd info")); + return; + } + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + + dcmd = &cmd->frame->dcmd; + dcmd_dma_obj.size = sizeof (struct mrsas_tbolt_pd_info); + dcmd_dma_obj.dma_attr = mrsas_generic_dma_attr; + dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xffffffff; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xffffffff; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + (void) mrsas_alloc_dma_obj(instance, &dcmd_dma_obj, + DDI_STRUCTURE_LE_ACC); + (void) memset(dcmd_dma_obj.buffer, 0, + sizeof (struct mrsas_tbolt_pd_info)); + (void) memset(dcmd->mbox.b, 0, 12); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct mrsas_tbolt_pd_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + MR_DCMD_PD_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[0], tgt); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct mrsas_tbolt_pd_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + dcmd_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = MRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } + + instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd); + + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, (uint8_t *)pds, + (uint8_t *)dcmd_dma_obj.buffer, sizeof (struct mrsas_tbolt_pd_info), + DDI_DEV_AUTOINCR); + (void) mrsas_free_dma_obj(instance, dcmd_dma_obj); + return_raid_msg_pkt(instance, cmd); +} +#endif diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c index 501bca39c8..a1edcc664c 100644 --- a/usr/src/uts/common/io/scsi/targets/sd.c +++ b/usr/src/uts/common/io/scsi/targets/sd.c @@ -26,6 +26,7 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011 Bayard G. Bell. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. */ /* * Copyright 2011 cyril.galibern@opensvc.com @@ -3502,9 +3503,13 @@ sd_set_mmc_caps(sd_ssc_t *ssc) * according to the successful response to the page * 0x2A mode sense request. 
*/ - scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, - "sd_set_mmc_caps: Mode Sense returned " - "invalid block descriptor length\n"); + /* + * The following warning occurs due to the KVM CD-ROM + * mishandling the multi-media commands. Ignore it. + * scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, + * "sd_set_mmc_caps: Mode Sense returned " + * "invalid block descriptor length\n"); + */ kmem_free(buf, BUFLEN_MODE_CDROM_CAP); return; } @@ -4441,18 +4446,77 @@ sd_sdconf_id_match(struct sd_lun *un, char *id, int idlen) { struct scsi_inquiry *sd_inq; int rval = SD_SUCCESS; + char *p; + int chk_vidlen = 0, chk_pidlen = 0; + int has_tail = 0; + static const int VSZ = sizeof (sd_inq->inq_vid); + static const int PSZ = sizeof (sd_inq->inq_pid); ASSERT(un != NULL); sd_inq = un->un_sd->sd_inq; ASSERT(id != NULL); /* - * We use the inq_vid as a pointer to a buffer containing the - * vid and pid and use the entire vid/pid length of the table - * entry for the comparison. This works because the inq_pid - * data member follows inq_vid in the scsi_inquiry structure. + * We would like to use the inq_vid as a pointer to a buffer + * containing the vid and pid and use the entire vid/pid length of + * the table entry for the comparison. However, this does not work + * because, while the inq_pid data member follows inq_vid in the + * scsi_inquiry structure, we do not control the contents of this + * buffer, and some broken devices violate SPC 4.3.1 and return + * fields with null bytes in them. + */ + chk_vidlen = MIN(VSZ, idlen); + p = id + chk_vidlen - 1; + while (*p == ' ' && chk_vidlen > 0) { + --p; + --chk_vidlen; + } + + /* + * If it's all spaces, check the whole thing. */ - if (strncasecmp(sd_inq->inq_vid, id, idlen) != 0) { + if (chk_vidlen == 0) + chk_vidlen = MIN(VSZ, idlen); + + if (idlen > VSZ) { + chk_pidlen = idlen - VSZ; + p = id + idlen - 1; + while (*p == ' ' && chk_pidlen > 0) { + --p; + --chk_pidlen; + } + if (chk_pidlen == 0) + chk_pidlen = MIN(PSZ, idlen - VSZ); + } + + /* + * There's one more thing we need to do here. If the user specified + * an ID with trailing spaces, we need to make sure the inquiry + * vid/pid has only spaces or NULs after the check length; otherwise, it + * can't match. + */ + if (idlen > chk_vidlen && chk_vidlen < VSZ) { + for (p = sd_inq->inq_vid + chk_vidlen; + p < sd_inq->inq_vid + VSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + if (idlen > chk_pidlen + VSZ && chk_pidlen < PSZ) { + for (p = sd_inq->inq_pid + chk_pidlen; + p < sd_inq->inq_pid + PSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + + if (has_tail || strncasecmp(sd_inq->inq_vid, id, chk_vidlen) != 0 || + (idlen > VSZ && + strncasecmp(sd_inq->inq_pid, id + VSZ, chk_pidlen) != 0)) { /* * The user id string is compared to the inquiry vid/pid * using a case insensitive comparison and ignoring @@ -22318,6 +22382,7 @@ sdioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cred_p, int *rval_p) case DKIOCINFO: case DKIOCGMEDIAINFO: case DKIOCGMEDIAINFOEXT: + case DKIOCSOLIDSTATE: case MHIOCENFAILFAST: case MHIOCSTATUS: case MHIOCTKOWN: @@ -22510,6 +22575,16 @@ skip_ready_valid: } break; + case DKIOCSOLIDSTATE: + SD_TRACE(SD_LOG_IOCTL, un, "DKIOCSOLIDSTATE\n"); + i = un->un_f_is_solid_state ? 1 : 0; + if (ddi_copyout(&i, (void *)arg, sizeof (int), flag) != 0) { + err = EFAULT; + } else { + err = 0; + } + break; + case DKIOCHOTPLUGGABLE: SD_TRACE(SD_LOG_IOCTL, un, "DKIOCHOTPLUGGABLE\n"); i = un->un_f_is_hotpluggable ? 
1 : 0; diff --git a/usr/src/uts/common/io/sdcard/impl/sda_mem.c b/usr/src/uts/common/io/sdcard/impl/sda_mem.c index 752a3b8a32..1b485cac24 100644 --- a/usr/src/uts/common/io/sdcard/impl/sda_mem.c +++ b/usr/src/uts/common/io/sdcard/impl/sda_mem.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. */ /* @@ -207,6 +208,7 @@ sda_mem_bd_mediainfo(void *arg, bd_media_t *media) media->m_nblks = slot->s_nblks; media->m_blksize = slot->s_blksz; media->m_readonly = slot->s_flags & SLOTF_WRITABLE ? B_FALSE : B_TRUE; + media->m_solidstate = B_TRUE; sda_slot_exit(slot); return (0); } diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c index c70ff2b22b..065d7f2cbc 100644 --- a/usr/src/uts/common/io/vnic/vnic_dev.c +++ b/usr/src/uts/common/io/vnic/vnic_dev.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -51,6 +52,7 @@ #include <sys/vlan.h> #include <sys/vnic.h> #include <sys/vnic_impl.h> +#include <sys/mac_impl.h> #include <sys/mac_flow_impl.h> #include <inet/ip_impl.h> @@ -81,6 +83,7 @@ static int vnic_m_stat(void *, uint_t, uint64_t *); static void vnic_m_ioctl(void *, queue_t *, mblk_t *); static int vnic_m_setprop(void *, const char *, mac_prop_id_t, uint_t, const void *); +static int vnic_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); static void vnic_m_propinfo(void *, const char *, mac_prop_id_t, mac_prop_info_handle_t); static mblk_t *vnic_m_tx(void *, mblk_t *); @@ -100,7 +103,7 @@ static mod_hash_t *vnic_hash; #define VNIC_HASH_KEY(vnic_id) ((mod_hash_key_t)(uintptr_t)vnic_id) #define VNIC_M_CALLBACK_FLAGS \ - (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO) + (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO) static mac_callbacks_t vnic_m_callbacks = { VNIC_M_CALLBACK_FLAGS, @@ -117,7 +120,7 @@ static mac_callbacks_t vnic_m_callbacks = { NULL, NULL, vnic_m_setprop, - NULL, + vnic_m_getprop, vnic_m_propinfo }; @@ -849,17 +852,19 @@ static int vnic_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, uint_t pr_valsize, const void *pr_val) { - int err = ENOTSUP; + int err = 0; vnic_t *vn = m_driver; - /* allow setting MTU only on an etherstub */ - if (vn->vn_link_id != DATALINK_INVALID_LINKID) - return (err); - switch (pr_num) { case MAC_PROP_MTU: { uint32_t mtu; + /* allow setting MTU only on an etherstub */ + if (vn->vn_link_id != DATALINK_INVALID_LINKID) { + err = ENOTSUP; + break; + } + if (pr_valsize < sizeof (mtu)) { err = EINVAL; break; @@ -872,12 +877,46 @@ vnic_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, err = mac_maxsdu_update(vn->vn_mh, mtu); break; } + case MAC_PROP_VN_PROMISC_FILTERED: { + boolean_t filtered; + + if (pr_valsize < sizeof (filtered)) { + err = EINVAL; + break; + } + + bcopy(pr_val, &filtered, sizeof (filtered)); + mac_set_promisc_filtered(vn->vn_mch, filtered); + } default: + err = ENOTSUP; break; } return (err); } +static int +vnic_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +{ + vnic_t *vn = arg; + int ret = 0; + boolean_t out; + + switch (pr_num) { + case MAC_PROP_VN_PROMISC_FILTERED: + out = mac_get_promisc_filtered(vn->vn_mch); + ASSERT(pr_valsize >= sizeof (boolean_t)); + bcopy(&out, 
pr_val, sizeof (boolean_t)); + break; + default: + ret = EINVAL; + break; + } + + return (ret); +} + /* ARGSUSED */ static void vnic_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, mac_prop_info_handle_t prh) diff --git a/usr/src/uts/common/os/bio.c b/usr/src/uts/common/os/bio.c index 0db01f80d7..c3d04e5508 100644 --- a/usr/src/uts/common/os/bio.c +++ b/usr/src/uts/common/os/bio.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1320,6 +1321,9 @@ pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) cpup = CPU; /* get pointer AFTER preemption is disabled */ CPU_STATS_ADDQ(cpup, vm, pgin, 1); CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len)); + + atomic_add_64(&curzone->zone_pgpgin, btopr(len)); + if ((flags & B_ASYNC) == 0) { klwp_t *lwp = ttolwp(curthread); if (lwp != NULL) @@ -1336,13 +1340,19 @@ pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) if (pp != NULL && pp->p_vnode != NULL) { if (IS_SWAPFSVP(pp->p_vnode)) { CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len)); + atomic_add_64(&curzone->zone_anonpgin, + btopr(len)); } else { if (pp->p_vnode->v_flag & VVMEXEC) { CPU_STATS_ADDQ(cpup, vm, execpgin, btopr(len)); + atomic_add_64(&curzone->zone_execpgin, + btopr(len)); } else { CPU_STATS_ADDQ(cpup, vm, fspgin, btopr(len)); + atomic_add_64(&curzone->zone_fspgin, + btopr(len)); } } } diff --git a/usr/src/uts/common/os/clock.c b/usr/src/uts/common/os/clock.c index 451c9db48c..3f4dd63c82 100644 --- a/usr/src/uts/common/os/clock.c +++ b/usr/src/uts/common/os/clock.c @@ -23,6 +23,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #include <sys/param.h> @@ -66,6 +67,7 @@ #include <sys/ddi_timer.h> #include <sys/random.h> #include <sys/modctl.h> +#include <sys/zone.h> /* * for NTP support @@ -1158,6 +1160,10 @@ loadavg_update() } while ((cpupart = cpupart->cp_next) != cp_list_head); + /* + * Third pass totals up per-zone statistics. + */ + zone_loadavg_update(); } /* diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c index e097f355ec..7870617a26 100644 --- a/usr/src/uts/common/os/clock_highres.c +++ b/usr/src/uts/common/os/clock_highres.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2012, Joyent Inc. All rights reserved. + */ #include <sys/timer.h> #include <sys/systm.h> @@ -112,6 +114,25 @@ clock_highres_timer_settime(itimer_t *it, int flags, cyctime.cyt_when = ts2hrt(&when->it_value); cyctime.cyt_interval = ts2hrt(&when->it_interval); + if (cyctime.cyt_when != 0 && cyctime.cyt_interval == 0 && + it->it_itime.it_interval.tv_sec == 0 && + it->it_itime.it_interval.tv_nsec == 0 && + (cyc = *cycp) != CYCLIC_NONE) { + /* + * If our existing timer is a one-shot and our new timer is a + * one-shot, we'll save ourselves a world of grief and just + * reprogram the cyclic. 
+ */ + it->it_itime = *when; + + if (!(flags & TIMER_ABSTIME)) + cyctime.cyt_when += gethrtime(); + + hrt2ts(cyctime.cyt_when, &it->it_itime.it_value); + (void) cyclic_reprogram(cyc, cyctime.cyt_when); + return (0); + } + mutex_enter(&cpu_lock); if ((cyc = *cycp) != CYCLIC_NONE) { cyclic_remove(cyc); @@ -162,17 +183,14 @@ clock_highres_timer_settime(itimer_t *it, int flags, if (cyctime.cyt_interval == 0) { /* - * If this is a one-shot, then we set the interval to assure - * that the cyclic will next fire INT64_MAX nanoseconds after - * boot (which corresponds to over 292 years -- yes, Buck Rogers - * may have his 292-year-uptime-Solaris box malfunction). If - * this timer is never touched, this cyclic will simply - * consume space in the cyclic subsystem. As soon as + * If this is a one-shot, then we set the interval to be + * infinite. If this timer is never touched, this cyclic will + * simply consume space in the cyclic subsystem. As soon as * timer_settime() or timer_delete() is called, the cyclic is * removed (so it's not possible to run the machine out * of resources by creating one-shots). */ - cyctime.cyt_interval = INT64_MAX - cyctime.cyt_when; + cyctime.cyt_interval = CY_INFINITY; } it->it_itime = *when; @@ -185,8 +203,6 @@ clock_highres_timer_settime(itimer_t *it, int flags, if (cyctime.cyt_when != 0) *cycp = cyc = cyclic_add(&hdlr, &cyctime); - else - *cycp = cyc = CYCLIC_NONE; /* * Now that we have the cyclic created, we need to bind it to our diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c index a292f4e14f..ebaa6bfe41 100644 --- a/usr/src/uts/common/os/contract.c +++ b/usr/src/uts/common/os/contract.c @@ -497,7 +497,7 @@ contract_abandon(contract_t *ct, proc_t *p, int explicit) contract_t *parent = &p->p_ct_process->conp_contract; int inherit = 0; - ASSERT(p == curproc); + VERIFY(p == curproc); mutex_enter(&ct->ct_lock); @@ -547,7 +547,7 @@ contract_abandon(contract_t *ct, proc_t *p, int explicit) if (inherit) { ct->ct_state = CTS_INHERITED; - ASSERT(ct->ct_regent == parent); + VERIFY(ct->ct_regent == parent); contract_process_take(parent, ct); /* @@ -2063,8 +2063,8 @@ cte_copy(ct_equeue_t *q, ct_equeue_t *newq) { ct_kevent_t *e, *first = NULL; - ASSERT(q->ctq_listno == CTEL_CONTRACT); - ASSERT(newq->ctq_listno == CTEL_PBUNDLE); + VERIFY(q->ctq_listno == CTEL_CONTRACT); + VERIFY(newq->ctq_listno == CTEL_PBUNDLE); mutex_enter(&q->ctq_lock); mutex_enter(&newq->ctq_lock); @@ -2077,8 +2077,16 @@ cte_copy(ct_equeue_t *q, ct_equeue_t *newq) if ((e->cte_flags & (CTE_INFO | CTE_ACK)) == 0) { if (first == NULL) first = e; - list_insert_tail(&newq->ctq_events, e); - cte_hold(e); + /* + * It is possible for adoption to race with an owner's + * cte_publish_all(); we must only enqueue events that + * have not already been enqueued. + */ + if (!list_link_active((list_node_t *) + ((uintptr_t)e + newq->ctq_events.list_offset))) { + list_insert_tail(&newq->ctq_events, e); + cte_hold(e); + } } } @@ -2117,7 +2125,7 @@ cte_trim(ct_equeue_t *q, contract_t *ct) int flags, stopper; int start = 1; - ASSERT(MUTEX_HELD(&q->ctq_lock)); + VERIFY(MUTEX_HELD(&q->ctq_lock)); for (e = list_head(&q->ctq_events); e != NULL; e = next) { next = list_next(&q->ctq_events, e); @@ -2227,13 +2235,24 @@ cte_queue_drain(ct_equeue_t *q, int ack) * cte_publish_all.
*/ static void -cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp) +cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp, boolean_t mayexist) { ASSERT(MUTEX_HELD(&q->ctq_lock)); q->ctq_atime = *tsp; /* + * If this event may already exist on this queue, check to see if it + * is already there and return if so. + */ + if (mayexist && list_link_active((list_node_t *)((uintptr_t)e + + q->ctq_events.list_offset))) { + mutex_exit(&q->ctq_lock); + cte_rele(e); + return; + } + + /* * Don't publish if the event is informative and there aren't * any listeners, or if the queue has been shut down. */ @@ -2247,6 +2266,8 @@ cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp) /* * Enqueue event */ + VERIFY(!list_link_active((list_node_t *) + ((uintptr_t)e + q->ctq_events.list_offset))); list_insert_tail(&q->ctq_events, e); /* @@ -2318,14 +2339,14 @@ cte_publish_all(contract_t *ct, ct_kevent_t *e, nvlist_t *data, nvlist_t *gdata) ct->ct_evcnt++; } mutex_exit(&ct->ct_lock); - cte_publish(&ct->ct_events, e, &ts); + cte_publish(&ct->ct_events, e, &ts, B_FALSE); /* * CTEL_BUNDLE - Next deliver to the contract type's bundle * queue. */ mutex_enter(&ct->ct_type->ct_type_events.ctq_lock); - cte_publish(&ct->ct_type->ct_type_events, e, &ts); + cte_publish(&ct->ct_type->ct_type_events, e, &ts, B_FALSE); /* * CTEL_PBUNDLE - Finally, if the contract has an owner, @@ -2342,7 +2363,14 @@ cte_publish_all(contract_t *ct, ct_kevent_t *e, nvlist_t *data, nvlist_t *gdata) q = ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index]; mutex_enter(&q->ctq_lock); mutex_exit(&ct->ct_lock); - cte_publish(q, e, &ts); + + /* + * It is possible for this code to race with adoption; we + * publish the event indicating that the event may already + * be enqueued because adoption beat us to it (in which case + * cte_publish() does nothing). + */ + cte_publish(q, e, &ts, B_TRUE); } else { mutex_exit(&ct->ct_lock); cte_rele(e); diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index 9e04f631a9..3b3935a772 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -534,6 +535,10 @@ expand_string(const char *pat, char *fp, int size, cred_t *cr) case 'z': s = p->p_zone->zone_name; break; + case 'Z': + /* This is zonepath + "/root/", except for GZ */ + s = p->p_zone->zone_rootpath; + break; case '%': (void) strcpy((s = buf), "%"); break; @@ -548,6 +553,9 @@ expand_string(const char *pat, char *fp, int size, cred_t *cr) if ((size -= len) <= 0) return (ENAMETOOLONG); (void) strcpy(fp, s); + /* strip trailing "/root/" from non-GZ zonepath string */ + if (c == 'Z' && len > 6) + len -= 6; fp += len; } diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 1ec63249ab..20e57efaad 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -724,6 +724,14 @@ crgetzoneid(const cred_t *cr) cr->cr_zone->zone_id); } +zoneid_t +crgetzonedid(const cred_t *cr) +{ + return (cr->cr_zone == NULL ? + (cr->cr_uid == -1 ?
(zoneid_t)-1 : GLOBAL_ZONEID) : + cr->cr_zone->zone_did); +} + projid_t crgetprojid(const cred_t *cr) { diff --git a/usr/src/uts/common/os/cyclic.c b/usr/src/uts/common/os/cyclic.c index 1bb6baf445..93a318d260 100644 --- a/usr/src/uts/common/os/cyclic.c +++ b/usr/src/uts/common/os/cyclic.c @@ -24,6 +24,10 @@ */ /* + * Copyright (c) 2012, Joyent Inc. All rights reserved. + */ + +/* * The Cyclic Subsystem * -------------------- * @@ -1139,7 +1143,7 @@ top: CYC_TRACE(cpu, level, "softint-top", cyclics, pc); while (consndx != pc->cypc_prodndx) { - int pend, npend, opend; + uint32_t pend, npend, opend; int consmasked = consndx & sizemask; cyclic_t *cyclic = &cyclics[buf[consmasked]]; cyc_func_t handler = cyclic->cy_handler; diff --git a/usr/src/uts/common/os/dtrace_subr.c b/usr/src/uts/common/os/dtrace_subr.c index f2a9ac1b7d..d2ce3361c1 100644 --- a/usr/src/uts/common/os/dtrace_subr.c +++ b/usr/src/uts/common/os/dtrace_subr.c @@ -44,6 +44,7 @@ void (*dtrace_helpers_fork)(proc_t *, proc_t *); void (*dtrace_cpustart_init)(void); void (*dtrace_cpustart_fini)(void); void (*dtrace_cpc_fire)(uint64_t); +void (*dtrace_closef)(void); void (*dtrace_debugger_init)(void); void (*dtrace_debugger_fini)(void); diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index b97a09454b..7c5b8323e3 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -388,10 +389,16 @@ proc_exit(int why, int what) if (p->p_pid == z->zone_proc_initpid) { if (z->zone_boot_err == 0 && zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && - zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN && - z->zone_restart_init == B_TRUE && - restart_init(what, why) == 0) - return (0); + zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) { + if (z->zone_restart_init == B_TRUE) { + if (restart_init(what, why) == 0) + return (0); + } else { + (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, + CRED()); + } + } + /* * Since we didn't or couldn't restart init, we clear * the zone's init state and proceed with exit diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index a014d25c0f..3b47e05ef2 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -54,6 +55,7 @@ #include <sys/poll.h> #include <sys/rctl.h> #include <sys/port_impl.h> +#include <sys/dtrace.h> #include <c2/audit.h> #include <sys/nbmlock.h> @@ -952,6 +954,18 @@ closef(file_t *fp) ASSERT(fp->f_count == 0); mutex_exit(&fp->f_tlock); + /* + * If DTrace has getf() subroutines active, it will set dtrace_closef + * to point to code that implements a barrier with respect to probe + * context. This must be called before the file_t is freed (and the + * vnode that it refers to is released) -- but it must be after the + * file_t has been removed from the uf_entry_t. That is, there must + * be no way for a racing getf() in probe context to yield the fp that + * we're operating upon. 
+ */ + if (dtrace_closef != NULL) + (*dtrace_closef)(); + VN_RELE(vp); /* * deallocate resources to audit_data diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c index 83b817e866..a5f5a6f3c2 100644 --- a/usr/src/uts/common/os/kstat_fr.c +++ b/usr/src/uts/common/os/kstat_fr.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* @@ -160,6 +161,7 @@ struct { kstat_named_t avenrun_5min; kstat_named_t avenrun_15min; kstat_named_t boot_time; + kstat_named_t nsec_per_tick; } system_misc_kstat = { { "ncpus", KSTAT_DATA_UINT32 }, { "lbolt", KSTAT_DATA_UINT32 }, @@ -171,6 +173,7 @@ struct { { "avenrun_5min", KSTAT_DATA_UINT32 }, { "avenrun_15min", KSTAT_DATA_UINT32 }, { "boot_time", KSTAT_DATA_UINT32 }, + { "nsec_per_tick", KSTAT_DATA_UINT32 }, }; struct { @@ -803,7 +806,6 @@ system_misc_kstat_update(kstat_t *ksp, int rw) { int myncpus = ncpus; int *loadavgp = &avenrun[0]; - int loadavg[LOADAVG_NSTATS]; time_t zone_boot_time; clock_t zone_lbolt; hrtime_t zone_hrtime; @@ -820,17 +822,11 @@ system_misc_kstat_update(kstat_t *ksp, int rw) */ mutex_enter(&cpu_lock); if (pool_pset_enabled()) { - psetid_t mypsid = zone_pset_get(curproc->p_zone); - int error; - myncpus = zone_ncpus_get(curproc->p_zone); ASSERT(myncpus > 0); - error = cpupart_get_loadavg(mypsid, &loadavg[0], - LOADAVG_NSTATS); - ASSERT(error == 0); - loadavgp = &loadavg[0]; } mutex_exit(&cpu_lock); + loadavgp = &curproc->p_zone->zone_avenrun[0]; } if (INGLOBALZONE(curproc)) { @@ -838,9 +834,7 @@ system_misc_kstat_update(kstat_t *ksp, int rw) zone_lbolt = ddi_get_lbolt(); zone_nproc = nproc; } else { - struct timeval tvp; - hrt2tv(curproc->p_zone->zone_zsched->p_mstart, &tvp); - zone_boot_time = tvp.tv_sec; + zone_boot_time = curproc->p_zone->zone_boot_time; zone_hrtime = gethrtime(); zone_lbolt = (clock_t)(NSEC_TO_TICK(zone_hrtime) - @@ -861,6 +855,8 @@ system_misc_kstat_update(kstat_t *ksp, int rw) system_misc_kstat.avenrun_15min.value.ui32 = (uint32_t)loadavgp[2]; system_misc_kstat.boot_time.value.ui32 = (uint32_t) zone_boot_time; + system_misc_kstat.nsec_per_tick.value.ui32 = (uint32_t) + nsec_per_tick; return (0); } diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index f5cebbf82e..63a89a2ce8 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -248,8 +248,7 @@ log_init(void) */ printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " - "All rights reserved.\n"); + printf("Copyright (c) 2010-2012, Joyent Inc. All rights reserved.\n"); #ifdef DEBUG printf("DEBUG enabled\n"); #endif diff --git a/usr/src/uts/common/os/msacct.c b/usr/src/uts/common/os/msacct.c index df975eb7ee..30e50cce72 100644 --- a/usr/src/uts/common/os/msacct.c +++ b/usr/src/uts/common/os/msacct.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. 
*/ #include <sys/types.h> @@ -33,6 +34,7 @@ #include <sys/debug.h> #include <sys/msacct.h> #include <sys/time.h> +#include <sys/zone.h> /* * Mega-theory block comment: @@ -390,6 +392,7 @@ void syscall_mstate(int fromms, int toms) { kthread_t *t = curthread; + zone_t *z = ttozone(t); struct mstate *ms; hrtime_t *mstimep; hrtime_t curtime; @@ -413,6 +416,10 @@ syscall_mstate(int fromms, int toms) newtime = curtime - ms->ms_state_start; } *mstimep += newtime; + if (fromms == LMS_USER) + atomic_add_64(&z->zone_utime, newtime); + else if (fromms == LMS_SYSTEM) + atomic_add_64(&z->zone_stime, newtime); t->t_mstate = toms; ms->ms_state_start = curtime; ms->ms_prev = fromms; @@ -560,27 +567,18 @@ cpu_update_pct(kthread_t *t, hrtime_t newtime) */ do { - if (T_ONPROC(t) && t->t_waitrq == 0) { - hrlb = t->t_hrtime; + pctcpu = t->t_pctcpu; + hrlb = t->t_hrtime; + delta = newtime - hrlb; + if (delta < 0) { + newtime = gethrtime_unscaled(); delta = newtime - hrlb; - if (delta < 0) { - newtime = gethrtime_unscaled(); - delta = newtime - hrlb; - } - t->t_hrtime = newtime; - scalehrtime(&delta); - pctcpu = t->t_pctcpu; + } + t->t_hrtime = newtime; + scalehrtime(&delta); + if (T_ONPROC(t) && t->t_waitrq == 0) { npctcpu = cpu_grow(pctcpu, delta); } else { - hrlb = t->t_hrtime; - delta = newtime - hrlb; - if (delta < 0) { - newtime = gethrtime_unscaled(); - delta = newtime - hrlb; - } - t->t_hrtime = newtime; - scalehrtime(&delta); - pctcpu = t->t_pctcpu; npctcpu = cpu_decay(pctcpu, delta); } } while (cas32(&t->t_pctcpu, pctcpu, npctcpu) != pctcpu); @@ -602,7 +600,10 @@ new_mstate(kthread_t *t, int new_state) hrtime_t curtime; hrtime_t newtime; hrtime_t oldtime; + hrtime_t ztime; + hrtime_t origstart; klwp_t *lwp; + zone_t *z; ASSERT(new_state != LMS_WAIT_CPU); ASSERT((unsigned)new_state < NMSTATES); @@ -625,6 +626,7 @@ new_mstate(kthread_t *t, int new_state) ms = &lwp->lwp_mstate; state = t->t_mstate; + origstart = ms->ms_state_start; do { switch (state) { case LMS_TFAULT: @@ -637,7 +639,7 @@ new_mstate(kthread_t *t, int new_state) mstimep = &ms->ms_acct[state]; break; } - newtime = curtime - ms->ms_state_start; + ztime = newtime = curtime - ms->ms_state_start; if (newtime < 0) { curtime = gethrtime_unscaled(); oldtime = *mstimep - 1; /* force CAS to fail */ @@ -648,6 +650,20 @@ new_mstate(kthread_t *t, int new_state) t->t_mstate = new_state; ms->ms_state_start = curtime; } while (cas64((uint64_t *)mstimep, oldtime, newtime) != oldtime); + + /* + * When the system boots the initial startup thread will have a + * ms_state_start of 0 which would add a huge system time to the global + * zone. We want to skip aggregating that initial bit of work. + */ + if (origstart != 0) { + z = ttozone(t); + if (state == LMS_USER) + atomic_add_64(&z->zone_utime, ztime); + else if (state == LMS_SYSTEM) + atomic_add_64(&z->zone_stime, ztime); + } + /* * Remember the previous running microstate. */ @@ -686,6 +702,8 @@ restore_mstate(kthread_t *t) hrtime_t waitrq; hrtime_t newtime; hrtime_t oldtime; + hrtime_t waittime; + zone_t *z; /* * Don't call restore mstate of threads without lwps. (Kernel threads) @@ -756,11 +774,15 @@ restore_mstate(kthread_t *t) oldtime = *mstimep; newtime += oldtime; } while (cas64((uint64_t *)mstimep, oldtime, newtime) != oldtime); + /* * Update the WAIT_CPU timer and per-cpu waitrq total. 
*/ - ms->ms_acct[LMS_WAIT_CPU] += (curtime - waitrq); - CPU->cpu_waitrq += (curtime - waitrq); + z = ttozone(t); + waittime = curtime - waitrq; + ms->ms_acct[LMS_WAIT_CPU] += waittime; + atomic_add_64(&z->zone_wtime, waittime); + CPU->cpu_waitrq += waittime; ms->ms_state_start = curtime; } diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index 573ebbc367..d8f7882723 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -2563,3 +2564,12 @@ secpolicy_ppp_config(const cred_t *cr) return (secpolicy_net_config(cr, B_FALSE)); return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL)); } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ + if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) + return (EPERM); + return (0); +} + diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index a5a918b326..53617bd0fe 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -176,6 +176,10 @@ privilege PRIV_GRAPHICS_MAP Allows a process to perform privileged mappings through a graphics device. +privilege PRIV_HYPRLOFS_CONTROL + + Allows a process to manage hyprlofs entries. + privilege PRIV_IPC_DAC_READ Allows a process to read a System V IPC diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index 6946a35a38..1b222538b3 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1619,7 +1619,7 @@ vmem_destroy(vmem_t *vmp) leaked = vmem_size(vmp, VMEM_ALLOC); if (leaked != 0) - cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", + cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s", vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ? "identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 79ccd94ae4..f308b45260 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2012, Joyent Inc. All rights reserved. */ /* @@ -369,21 +370,18 @@ static char *zone_ref_subsys_names[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; rctl_hndl_t rc_zone_shmmax; rctl_hndl_t rc_zone_shmmni; rctl_hndl_t rc_zone_semmni; rctl_hndl_t rc_zone_msgmni; -/* - * Synchronization primitives used to synchronize between mounts and zone - * creation/destruction. - */ -static int mounts_in_progress; -static kcondvar_t mount_cv; -static kmutex_t mount_lock; const char * const zone_default_initname = "/sbin/init"; static char * const zone_prefix = "/zone/"; @@ -423,23 +421,27 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, * Version 5 alters the zone_boot system call, and converts its old * bootargs parameter to be set by the zone_setattr API instead. * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create. 
*/ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; /* * Certain filesystems (such as NFS and autofs) need to know which zone * the mount is being placed in. Because of this, we need to be able to - * ensure that a zone isn't in the process of being created such that - * nfs_mount() thinks it is in the global zone, while by the time it - * gets added the list of mounted zones, it ends up on zoneA's mount - * list. + * ensure that a zone isn't in the process of being created/destroyed such + * that nfs_mount() thinks it is in the global/NGZ zone, while by the time + * it gets added the list of mounted zones, it ends up on the wrong zone's + * mount list. Since a zone can't reside on an NFS file system, we don't + * have to worry about the zonepath itself. * * The following functions: block_mounts()/resume_mounts() and * mount_in_progress()/mount_completed() are used by zones and the VFS - * layer (respectively) to synchronize zone creation and new mounts. + * layer (respectively) to synchronize zone state transitions and new + * mounts within a zone. This synchronization is on a per-zone basis, so + * activity for one zone will not interfere with activity for another zone. * * The semantics are like a reader-reader lock such that there may - * either be multiple mounts (or zone creations, if that weren't + * either be multiple mounts (or zone state transitions, if that weren't * serialized by zonehash_lock) in progress at the same time, but not * both. * @@ -447,10 +449,8 @@ static const int ZONE_SYSCALL_API_VERSION = 6; * taking too long. * * The semantics are such that there is unfair bias towards the - * "current" operation. This means that zone creations may starve if - * there is a rapid succession of new mounts coming in to the system, or - * there is a remote possibility that zones will be created at such a - * rate that new mounts will not be able to proceed. + * "current" operation. This means that zone halt may starve if + * there is a rapid succession of new mounts coming in to the zone. */ /* * Prevent new mounts from progressing to the point of calling @@ -458,7 +458,7 @@ static const int ZONE_SYSCALL_API_VERSION = 6; * them to complete. */ static int -block_mounts(void) +block_mounts(zone_t *zp) { int retval = 0; @@ -467,19 +467,21 @@ block_mounts(void) * called with zonehash_lock held. */ ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); - mutex_enter(&mount_lock); - while (mounts_in_progress > 0) { - if (cv_wait_sig(&mount_cv, &mount_lock) == 0) + mutex_enter(&zp->zone_mount_lock); + while (zp->zone_mounts_in_progress > 0) { + if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0) goto signaled; } /* * A negative value of mounts_in_progress indicates that mounts - * have been blocked by (-mounts_in_progress) different callers. + * have been blocked by (-mounts_in_progress) different callers + * (remotely possible if two threads enter zone_shutdown at the same + * time). */ - mounts_in_progress--; + zp->zone_mounts_in_progress--; retval = 1; signaled: - mutex_exit(&mount_lock); + mutex_exit(&zp->zone_mount_lock); return (retval); } @@ -488,26 +490,26 @@ signaled: * Allow them to progress if we were the last obstacle.
*/ static void -resume_mounts(void) +resume_mounts(zone_t *zp) { - mutex_enter(&mount_lock); - if (++mounts_in_progress == 0) - cv_broadcast(&mount_cv); - mutex_exit(&mount_lock); + mutex_enter(&zp->zone_mount_lock); + if (++zp->zone_mounts_in_progress == 0) + cv_broadcast(&zp->zone_mount_cv); + mutex_exit(&zp->zone_mount_lock); } /* - * The VFS layer is busy with a mount; zones should wait until all - * mounts are completed to progress. + * The VFS layer is busy with a mount; this zone should wait until all + * of its mounts are completed to progress. */ void -mount_in_progress(void) +mount_in_progress(zone_t *zp) { - mutex_enter(&mount_lock); - while (mounts_in_progress < 0) - cv_wait(&mount_cv, &mount_lock); - mounts_in_progress++; - mutex_exit(&mount_lock); + mutex_enter(&zp->zone_mount_lock); + while (zp->zone_mounts_in_progress < 0) + cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock); + zp->zone_mounts_in_progress++; + mutex_exit(&zp->zone_mount_lock); } /* @@ -515,12 +517,12 @@ mount_in_progress(void) * callers if this is the last mount. */ void -mount_completed(void) +mount_completed(zone_t *zp) { - mutex_enter(&mount_lock); - if (--mounts_in_progress == 0) - cv_broadcast(&mount_cv); - mutex_exit(&mount_lock); + mutex_enter(&zp->zone_mount_lock); + if (--zp->zone_mounts_in_progress == 0) + cv_broadcast(&zp->zone_mount_cv); + mutex_exit(&zp->zone_mount_lock); } /* @@ -1380,6 +1382,114 @@ static rctl_ops_t zone_cpu_cap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. + */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { + rcop_no_action, + zone_cpu_base_get, + zone_cpu_base_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). + */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_zone->zone_zfs_io_pri); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set priority to the new value. 
+ */ + zone->zone_zfs_io_pri = nv; + return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { + rcop_no_action, + zone_zfs_io_pri_get, + zone_zfs_io_pri_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) { rctl_qty_t nlwps; @@ -1674,6 +1784,39 @@ static rctl_ops_t zone_max_swap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_t *z = p->p_zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + /* No additional lock because not enforced in the kernel */ + q = z->zone_phys_mem; + return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + e->rcep_p.zone->zone_phys_mem_ctl = nv; + return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { + rcop_no_action, + zone_phys_mem_usage, + zone_phys_mem_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_max_lofi_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; @@ -1767,6 +1910,20 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw) } static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_phys_mem; + zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl; + return (0); +} + +static int zone_nprocs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1795,7 +1952,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw) } static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name, int (*updatefunc) (kstat_t *, int)) { kstat_t *ksp; @@ -1820,26 +1977,337 @@ zone_kstat_create_common(zone_t *zone, char *name, return (ksp); } +static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_vfs_kstat_t *zvp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_vfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the VFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the slow ops + * counters are updated directly by the VFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
+ */ + zvp->zv_nread.value.ui64 = kiop->nread; + zvp->zv_reads.value.ui64 = kiop->reads; + zvp->zv_rtime.value.ui64 = kiop->rtime; + zvp->zv_rlentime.value.ui64 = kiop->rlentime; + zvp->zv_nwritten.value.ui64 = kiop->nwritten; + zvp->zv_writes.value.ui64 = kiop->writes; + zvp->zv_wtime.value.ui64 = kiop->wtime; + zvp->zv_wlentime.value.ui64 = kiop->wlentime; + + scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_vfs_kstat_t *zvp; + + if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, + zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, + sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_vfs_lock; + zone->zone_vfs_stats = zvp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); + kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_vfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_zfs_kstat_t *zzp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_zfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the ZFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the I/O throttle + * counters are updated directly by the ZFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
+ */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + + scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); + scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_zfs_kstat_t *zzp; + + if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, + zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, + sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_zfs_lock; + zone->zone_zfs_stats = zzp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); + kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_zfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_mcap_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_mcap_kstat_t *zmp = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zmp->zm_rss.value.ui64 = zone->zone_phys_mem; + zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl; + zmp->zm_swap.value.ui64 = zone->zone_max_swap; + zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; + zmp->zm_nover.value.ui64 = zone->zone_mcap_nover; + zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout; + zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; + zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; + zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; + zmp->zm_fspgin.value.ui64 = zone->zone_fspgin; + zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail; + zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle; + zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec; + + return (0); +} + +static kstat_t * +zone_mcap_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_mcap_kstat_t *zmp; + + if ((ksp = kstat_create_zone("memory_cap", zone->zone_id, + zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED, + sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_mcap_lock; + zone->zone_mcap_stats = zmp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + 
kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail", + KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle", + KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec", + KSTAT_DATA_UINT64); + + ksp->ks_update = zone_mcap_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_misc_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_misc_kstat_t *zmp = ksp->ks_data; + hrtime_t tmp; + + if (rw == KSTAT_WRITE) + return (EACCES); + + tmp = zone->zone_utime; + scalehrtime(&tmp); + zmp->zm_utime.value.ui64 = tmp; + tmp = zone->zone_stime; + scalehrtime(&tmp); + zmp->zm_stime.value.ui64 = tmp; + tmp = zone->zone_wtime; + scalehrtime(&tmp); + zmp->zm_wtime.value.ui64 = tmp; + + zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0]; + zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1]; + zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2]; + + return (0); +} + +static kstat_t * +zone_misc_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_misc_kstat_t *zmp; + + if ((ksp = kstat_create_zone("zones", zone->zone_id, + zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED, + sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_misc_lock; + zone->zone_misc_stats = zmp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min", + KSTAT_DATA_UINT32); + + ksp->ks_update = zone_misc_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + static void zone_kstat_create(zone_t *zone) { - zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, + zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone, "lockedmem", zone_lockedmem_kstat_update); - zone->zone_swapresv_kstat = zone_kstat_create_common(zone, + zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone, "swapresv", zone_swapresv_kstat_update); - zone->zone_nprocs_kstat = zone_kstat_create_common(zone, + zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, + "physicalmem", 
zone_physmem_kstat_update); + zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone, "nprocs", zone_nprocs_kstat_update); + + if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { + zone->zone_vfs_stats = kmem_zalloc( + sizeof (zone_vfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { + zone->zone_zfs_stats = kmem_zalloc( + sizeof (zone_zfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { + zone->zone_mcap_stats = kmem_zalloc( + sizeof (zone_mcap_kstat_t), KM_SLEEP); + } + + if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) { + zone->zone_misc_stats = kmem_zalloc( + sizeof (zone_misc_kstat_t), KM_SLEEP); + } } static void -zone_kstat_delete_common(kstat_t **pkstat) +zone_kstat_delete_common(kstat_t **pkstat, size_t datasz) { void *data; if (*pkstat != NULL) { data = (*pkstat)->ks_data; kstat_delete(*pkstat); - kmem_free(data, sizeof (zone_kstat_t)); + kmem_free(data, datasz); *pkstat = NULL; } } @@ -1847,9 +2315,23 @@ zone_kstat_delete_common(kstat_t **pkstat) static void zone_kstat_delete(zone_t *zone) { - zone_kstat_delete_common(&zone->zone_lockedmem_kstat); - zone_kstat_delete_common(&zone->zone_swapresv_kstat); - zone_kstat_delete_common(&zone->zone_nprocs_kstat); + zone_kstat_delete_common(&zone->zone_lockedmem_kstat, + sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_swapresv_kstat, + sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_physmem_kstat, + sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_nprocs_kstat, + sizeof (zone_kstat_t)); + + zone_kstat_delete_common(&zone->zone_vfs_ksp, + sizeof (zone_vfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_zfs_ksp, + sizeof (zone_zfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_mcap_ksp, + sizeof (zone_mcap_kstat_t)); + zone_kstat_delete_common(&zone->zone_misc_ksp, + sizeof (zone_misc_kstat_t)); } /* @@ -1883,6 +2365,8 @@ zone_zsd_init(void) zone0.zone_locked_mem_ctl = UINT64_MAX; ASSERT(zone0.zone_max_swap == 0); zone0.zone_max_swap_ctl = UINT64_MAX; + zone0.zone_phys_mem = 0; + zone0.zone_phys_mem_ctl = UINT64_MAX; zone0.zone_max_lofi = 0; zone0.zone_max_lofi_ctl = UINT64_MAX; zone0.zone_shmmax = 0; @@ -1906,7 +2390,13 @@ zone_zsd_init(void) zone0.zone_initname = initname; zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; + zone0.zone_physmem_kstat = NULL; zone0.zone_nprocs_kstat = NULL; + zone0.zone_zfs_io_pri = 1; + zone0.zone_stime = 0; + zone0.zone_utime = 0; + zone0.zone_wtime = 0; + list_create(&zone0.zone_ref_list, sizeof (zone_ref_t), offsetof(zone_ref_t, zref_linkage)); list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), @@ -2013,6 +2503,21 @@ zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + MAXCAP, MAXCAP, &zone_cpu_base_ops); + + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + 1024, 1024, 
&zone_zfs_io_pri_ops); + rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2054,6 +2559,20 @@ zone_init(void) rde = rctl_dict_lookup("zone.cpu-shares"); (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + /* + * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach + * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'. + */ + dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + bzero(dval, sizeof (rctl_val_t)); + dval->rcv_value = 1; + dval->rcv_privilege = RCPRIV_PRIVILEGED; + dval->rcv_flagaction = RCTL_LOCAL_NOACTION; + dval->rcv_action_recip_pid = -1; + + rde = rctl_dict_lookup("zone.zfs-io-priority"); + (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + rc_zone_locked_mem = rctl_register("zone.max-locked-memory", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2064,6 +2583,11 @@ zone_init(void) RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_max_swap_ops); + rc_zone_phys_mem = rctl_register("zone.max-physical-memory", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_phys_mem_ops); + rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2375,14 +2899,65 @@ zone_set_initname(zone_t *zone, const char *zone_initname) return (0); } +/* + * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used + * to provide the physical memory capping kstats. Since physical memory + * capping is currently implemented in userland, that code uses the setattr + * entry point to increment the kstats. We always simply increment nover + * every time that setattr is called and we always add in the input value + * to zone_mcap_pagedout every time that is called. + */ +/*ARGSUSED*/ static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) +zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover) { - uint64_t mcap; - int err = 0; + zone->zone_mcap_nover++; + + return (0); +} + +static int +zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout) +{ + uint64_t pageout; + int err; - if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) - zone->zone_phys_mcap = mcap; + if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0) + zone->zone_mcap_pagedout += pageout; + + return (err); +} + +/* + * The zone_set_page_fault_delay function is used to set the number of usecs + * to throttle page faults. This is normally 0 but can be set to a non-0 value + * by the user-land memory capping code when the zone is over its physical + * memory cap. + */ +static int +zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay) +{ + uint32_t dusec; + int err; + + if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0) + zone->zone_pg_flt_delay = dusec; + + return (err); +} + +/* + * The zone_set_rss function is used to set the zone's RSS when we do the + * fast, approximate calculation in user-land.
+ */ +static int +zone_set_rss(zone_t *zone, const uint64_t *prss) +{ + uint64_t rss; + int err; + + if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0) + zone->zone_phys_mem = rss; return (err); } @@ -2794,6 +3369,12 @@ getzoneid(void) return (curproc->p_zone->zone_id); } +zoneid_t +getzonedid(void) +{ + return (curproc->p_zone->zone_did); +} + /* * Internal versions of zone_find_by_*(). These don't zone_hold() or * check the validity of a zone's state. @@ -2977,6 +3558,92 @@ zone_find_by_path(const char *path) } /* + * Public interface for updating per-zone load averages. Called once per + * second. + * + * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c. + */ +void +zone_loadavg_update() +{ + zone_t *zp; + zone_status_t status; + struct loadavg_s *lavg; + hrtime_t zone_total; + int i; + hrtime_t hr_avg; + int nrun; + static int64_t f[3] = { 135, 27, 9 }; + int64_t q, r; + + mutex_enter(&zonehash_lock); + for (zp = list_head(&zone_active); zp != NULL; + zp = list_next(&zone_active, zp)) { + mutex_enter(&zp->zone_lock); + + /* Skip zones that are on the way down or not yet up */ + status = zone_status_get(zp); + if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) { + /* For all practical purposes the zone doesn't exist. */ + mutex_exit(&zp->zone_lock); + continue; + } + + /* + * Update the 10 second moving average data in zone_loadavg. + */ + lavg = &zp->zone_loadavg; + + zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime; + scalehrtime(&zone_total); + + /* The zone_total should always be increasing. */ + lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ? + zone_total - lavg->lg_total : 0; + lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ; + /* lg_total holds the prev. 1 sec. total */ + lavg->lg_total = zone_total; + + /* + * To simplify the calculation, we don't calculate the load avg. + * until the zone has been up for at least 10 seconds and our + * moving average is thus full. + */ + if ((lavg->lg_len + 1) < S_LOADAVG_SZ) { + lavg->lg_len++; + mutex_exit(&zp->zone_lock); + continue; + } + + /* Now calculate the 1min, 5min, 15 min load avg. */ + hr_avg = 0; + for (i = 0; i < S_LOADAVG_SZ; i++) + hr_avg += lavg->lg_loads[i]; + hr_avg = hr_avg / S_LOADAVG_SZ; + nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX); + + /* Compute load avg. See comment in calcloadavg() */ + for (i = 0; i < 3; i++) { + q = (zp->zone_hp_avenrun[i] >> 16) << 7; + r = (zp->zone_hp_avenrun[i] & 0xffff) << 7; + zp->zone_hp_avenrun[i] += + ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4; + + /* avenrun[] can only hold 31 bits of load avg. */ + if (zp->zone_hp_avenrun[i] < + ((uint64_t)1<<(31+16-FSHIFT))) + zp->zone_avenrun[i] = (int32_t) + (zp->zone_hp_avenrun[i] >> (16 - FSHIFT)); + else + zp->zone_avenrun[i] = 0x7fffffff; + } + + mutex_exit(&zp->zone_lock); + } + mutex_exit(&zonehash_lock); +} + +/* * Get the number of cpus visible to this zone. The system-wide global * 'ncpus' is returned if pools are disabled, the caller is in the * global zone, or a NULL zone argument is passed in. 
@@ -3789,7 +4456,10 @@ zsched(void *arg) mutex_enter(&zone_status_lock); zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); mutex_exit(&zone_status_lock); + } else { + zone->zone_boot_time = gethrestime_sec(); } + pool_unlock(); } @@ -4081,7 +4751,7 @@ zone_create(const char *zone_name, const char *zone_root, caddr_t rctlbuf, size_t rctlbufsz, caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, int match, uint32_t doi, const bslabel_t *label, - int flags) + int flags, zoneid_t zone_did) { struct zsched_arg zarg; nvlist_t *rctls = NULL; @@ -4104,6 +4774,7 @@ zone_create(const char *zone_name, const char *zone_root, zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); zoneid = zone->zone_id = id_alloc(zoneid_space); + zone->zone_did = zone_did; zone->zone_status = ZONE_IS_UNINITIALIZED; zone->zone_pool = pool_default; zone->zone_pool_mod = gethrtime(); @@ -4172,10 +4843,14 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_locked_mem_ctl = UINT64_MAX; zone->zone_max_swap = 0; zone->zone_max_swap_ctl = UINT64_MAX; + zone->zone_phys_mem = 0; + zone->zone_phys_mem_ctl = UINT64_MAX; zone->zone_max_lofi = 0; zone->zone_max_lofi_ctl = UINT64_MAX; - zone0.zone_lockedmem_kstat = NULL; - zone0.zone_swapresv_kstat = NULL; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; + zone->zone_physmem_kstat = NULL; + zone->zone_zfs_io_pri = 1; /* * Zsched initializes the rctls. @@ -4229,7 +4904,7 @@ zone_create(const char *zone_name, const char *zone_root, return (zone_create_error(error, 0, extended_error)); } - if (block_mounts() == 0) { + if (block_mounts(zone) == 0) { mutex_enter(&pp->p_lock); if (curthread != pp->p_agenttp) continuelwps(pp); @@ -4380,7 +5055,7 @@ zone_create(const char *zone_name, const char *zone_root, /* * The zone is fully visible, so we can let mounts progress. */ - resume_mounts(); + resume_mounts(zone); if (rctls) nvlist_free(rctls); @@ -4396,7 +5071,7 @@ errout: continuelwps(pp); mutex_exit(&pp->p_lock); - resume_mounts(); + resume_mounts(zone); if (rctls) nvlist_free(rctls); /* @@ -4474,6 +5149,7 @@ zone_boot(zoneid_t zoneid) static int zone_empty(zone_t *zone) { + int cnt = 0; int waitstatus; /* @@ -4484,7 +5160,16 @@ zone_empty(zone_t *zone) ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); while ((waitstatus = zone_status_timedwait_sig(zone, ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { - killall(zone->zone_id); + boolean_t force = B_FALSE; + + /* Every 30 seconds, try harder */ + if (cnt++ >= 30) { + cmn_err(CE_WARN, "attempt to force kill zone %d\n", + zone->zone_id); + force = B_TRUE; + cnt = 0; + } + killall(zone->zone_id, force); } /* * return EINTR if we were signaled @@ -4551,15 +5236,6 @@ zone_shutdown(zoneid_t zoneid) if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) return (set_errno(EINVAL)); - /* - * Block mounts so that VFS_MOUNT() can get an accurate view of - * the zone's status with regards to ZONE_IS_SHUTTING down. - * - * e.g. NFS can fail the mount if it determines that the zone - * has already begun the shutdown sequence. - */ - if (block_mounts() == 0) - return (set_errno(EINTR)); mutex_enter(&zonehash_lock); /* * Look for zone under hash lock to prevent races with other @@ -4567,9 +5243,30 @@ zone_shutdown(zoneid_t zoneid) */ if ((zone = zone_find_all_by_id(zoneid)) == NULL) { mutex_exit(&zonehash_lock); - resume_mounts(); return (set_errno(EINVAL)); } + + /* + * We have to drop zonehash_lock before calling block_mounts. + * Hold the zone so we can continue to use the zone_t. 
+ */ + zone_hold(zone); + mutex_exit(&zonehash_lock); + + /* + * Block mounts so that VFS_MOUNT() can get an accurate view of + * the zone's status with regards to ZONE_IS_SHUTTING down. + * + * e.g. NFS can fail the mount if it determines that the zone + * has already begun the shutdown sequence. + * + */ + if (block_mounts(zone) == 0) { + zone_rele(zone); + return (set_errno(EINTR)); + } + + mutex_enter(&zonehash_lock); mutex_enter(&zone_status_lock); status = zone_status_get(zone); /* @@ -4578,7 +5275,8 @@ zone_shutdown(zoneid_t zoneid) if (status < ZONE_IS_READY) { mutex_exit(&zone_status_lock); mutex_exit(&zonehash_lock); - resume_mounts(); + resume_mounts(zone); + zone_rele(zone); return (set_errno(EINVAL)); } /* @@ -4588,7 +5286,8 @@ zone_shutdown(zoneid_t zoneid) if (status >= ZONE_IS_DOWN) { mutex_exit(&zone_status_lock); mutex_exit(&zonehash_lock); - resume_mounts(); + resume_mounts(zone); + zone_rele(zone); return (0); } /* @@ -4623,10 +5322,9 @@ zone_shutdown(zoneid_t zoneid) } } } - zone_hold(zone); /* so we can use the zone_t later */ mutex_exit(&zone_status_lock); mutex_exit(&zonehash_lock); - resume_mounts(); + resume_mounts(zone); if (error = zone_empty(zone)) { zone_rele(zone); @@ -5222,14 +5920,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; - case ZONE_ATTR_PHYS_MCAP: - size = sizeof (zone->zone_phys_mcap); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) - error = EFAULT; - break; case ZONE_ATTR_SCHED_CLASS: mutex_enter(&class_lock); @@ -5284,6 +5974,14 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_DID: + size = sizeof (zoneid_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -5315,10 +6013,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the - * global zone. + * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT + * attributes can be set on the global zone. */ - if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { + if (zoneid == GLOBAL_ZONEID && + attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) { return (set_errno(EINVAL)); } @@ -5335,7 +6034,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) * non-global zones. 
*/ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT && + attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS && + zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -5344,6 +6045,10 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_INITNAME: err = zone_set_initname(zone, (const char *)buf); break; + case ZONE_ATTR_INITNORESTART: + zone->zone_restart_init = B_FALSE; + err = 0; + break; case ZONE_ATTR_BOOTARGS: err = zone_set_bootargs(zone, (const char *)buf); break; @@ -5353,8 +6058,17 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_FS_ALLOWED: err = zone_set_fs_allowed(zone, (const char *)buf); break; - case ZONE_ATTR_PHYS_MCAP: - err = zone_set_phys_mcap(zone, (const uint64_t *)buf); + case ZONE_ATTR_PMCAP_NOVER: + err = zone_set_mcap_nover(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PMCAP_PAGEOUT: + err = zone_set_mcap_pageout(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PG_FLT_DELAY: + err = zone_set_page_fault_delay(zone, (const uint32_t *)buf); + break; + case ZONE_ATTR_RSS: + err = zone_set_rss(zone, (const uint64_t *)buf); break; case ZONE_ATTR_SCHED_CLASS: err = zone_set_sched_class(zone, (const char *)buf); @@ -6075,6 +6789,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) zs.doi = zs32.doi; zs.label = (const bslabel_t *)(uintptr_t)zs32.label; zs.flags = zs32.flags; + zs.zoneid = zs32.zoneid; #else panic("get_udatamodel() returned bogus result\n"); #endif @@ -6085,7 +6800,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) (caddr_t)zs.rctlbuf, zs.rctlbufsz, (caddr_t)zs.zfsbuf, zs.zfsbufsz, zs.extended_error, zs.match, zs.doi, - zs.label, zs.flags)); + zs.label, zs.flags, zs.zoneid)); case ZONE_BOOT: return (zone_boot((zoneid_t)(uintptr_t)arg1)); case ZONE_DESTROY: @@ -6363,7 +7078,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) * zone_ki_call_zoneadmd() will do a more thorough job of this * later. */ - killall(zone->zone_id); + killall(zone->zone_id, B_FALSE); /* * Now, create the thread to contact zoneadmd and do the rest of the * work. This thread can't be created in our zone otherwise diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 271682bc67..50846d0cb3 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -20,6 +20,7 @@ # # # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright 2012, Joyent, Inc. All rights reserved. # include $(SRC)/uts/Makefile.uts @@ -277,6 +278,7 @@ CHKHDRS= \ ipc.h \ ipc_impl.h \ ipc_rctl.h \ + ipd.h \ ipmi.h \ isa_defs.h \ iscsi_authclient.h \ @@ -856,6 +858,8 @@ FSHDRS= \ hsfs_rrip.h \ hsfs_spec.h \ hsfs_susp.h \ + hyprlofs.h \ + hyprlofs_info.h \ lofs_info.h \ lofs_node.h \ mntdata.h \ diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h index 8363d231cf..e0cfd6f778 100644 --- a/usr/src/uts/common/sys/aggr_impl.h +++ b/usr/src/uts/common/sys/aggr_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. 
*/ #ifndef _SYS_AGGR_IMPL_H @@ -307,6 +308,8 @@ extern boolean_t aggr_port_notify_link(aggr_grp_t *, aggr_port_t *); extern void aggr_port_init_callbacks(aggr_port_t *); extern void aggr_recv_cb(void *, mac_resource_handle_t, mblk_t *, boolean_t); +extern void aggr_recv_promisc_cb(void *, mac_resource_handle_t, mblk_t *, + boolean_t); extern void aggr_tx_ring_update(void *, uintptr_t); extern void aggr_tx_notify_thread(void *); diff --git a/usr/src/uts/common/sys/blkdev.h b/usr/src/uts/common/sys/blkdev.h index 2307610bae..4ec50fbf3b 100644 --- a/usr/src/uts/common/sys/blkdev.h +++ b/usr/src/uts/common/sys/blkdev.h @@ -19,6 +19,7 @@ * CDDL HEADER END */ /* + * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. */ @@ -116,6 +117,7 @@ struct bd_media { uint64_t m_nblks; uint32_t m_blksize; boolean_t m_readonly; + boolean_t m_solidstate; }; #define BD_INFO_FLAG_REMOVABLE (1U << 0) diff --git a/usr/src/uts/common/sys/buf.h b/usr/src/uts/common/sys/buf.h index a9191aed7c..cb8a6012fc 100644 --- a/usr/src/uts/common/sys/buf.h +++ b/usr/src/uts/common/sys/buf.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -186,6 +187,7 @@ struct biostats { #define B_STARTED 0x2000000 /* io:::start probe called for buf */ #define B_ABRWRITE 0x4000000 /* Application based recovery active */ #define B_PAGE_NOWAIT 0x8000000 /* Skip the page if it is locked */ +#define B_INVALCURONLY 0x10000000 /* invalidate only for curproc */ /* * There is some confusion over the meaning of B_FREE and B_INVAL and what @@ -198,6 +200,12 @@ struct biostats { * between the sole use of these two flags. In both cases, IO will be done * if the page is not yet committed to storage. * + * The B_INVALCURONLY flag modifies the behavior of the B_INVAL flag and is + * intended to be used in conjunction with B_INVAL. B_INVALCURONLY has no + * meaning on its own. When both B_INVALCURONLY and B_INVAL are set, then + * the mapping for the page is only invalidated for the current process. + * In this case, the page is not destroyed unless this was the final mapping. + * * In order to discard pages without writing them back, (B_INVAL | B_TRUNC) * should be used. * diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h index 6063ff4380..6bc042108c 100644 --- a/usr/src/uts/common/sys/cpucaps.h +++ b/usr/src/uts/common/sys/cpucaps.h @@ -22,6 +22,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_CPUCAPS_H @@ -84,12 +85,16 @@ extern void cpucaps_zone_remove(zone_t *); */ extern int cpucaps_project_set(kproject_t *, rctl_qty_t); extern int cpucaps_zone_set(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_burst_time(zone_t *, rctl_qty_t); /* * Get current CPU usage for a project/zone. */ extern rctl_qty_t cpucaps_project_get(kproject_t *); extern rctl_qty_t cpucaps_zone_get(zone_t *); +extern rctl_qty_t cpucaps_zone_get_base(zone_t *); +extern rctl_qty_t cpucaps_zone_get_burst_time(zone_t *); /* * Scheduling class hooks into CPU caps framework. 
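
[Editor's note] The cpucaps.h hunk above only declares the new zone-level entry points for the burstable cap (cpucaps_zone_set_base(), cpucaps_zone_set_burst_time() and their get counterparts); the resource-control plumbing that would call them is not part of this header change. Purely as an illustrative sketch, assuming a hypothetical zone.cpu-baseline resource control and a made-up callback name, a "set" handler modeled on the existing zone_cpu_cap_set()/cpucaps_zone_set() pairing in zone.c could hand the value through as follows.

/*
 * Illustrative sketch only -- not part of this patch.  A zone rctl "set"
 * callback, modeled on zone_cpu_cap_set() in zone.c, that would pass a
 * hypothetical "zone.cpu-baseline" value to the cpucaps_zone_set_base()
 * entry point declared above.  The callback name and the rctl it serves
 * are assumptions made for the example.
 */
/*ARGSUSED*/
static int
zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);

	if (e->rcep_p.zone == NULL)
		return (0);

	/* Record the baseline (non-burst) CPU allowance for this zone. */
	return (cpucaps_zone_set_base(e->rcep_p.zone, nv));
}

A burst-time control would be wired to cpucaps_zone_set_burst_time() in the same way.
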
diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h index 95afd21827..2cd4ed644d 100644 --- a/usr/src/uts/common/sys/cpucaps_impl.h +++ b/usr/src/uts/common/sys/cpucaps_impl.h @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_CPUCAPS_IMPL_H @@ -66,8 +67,12 @@ typedef struct cpucap { waitq_t cap_waitq; /* waitq for capped threads */ kstat_t *cap_kstat; /* cpucaps specific kstat */ int64_t cap_gen; /* zone cap specific */ + hrtime_t cap_chk_value; /* effective CPU usage cap */ hrtime_t cap_value; /* scaled CPU usage cap */ hrtime_t cap_usage; /* current CPU usage */ + hrtime_t cap_base; /* base CPU for burst */ + u_longlong_t cap_burst_limit; /* max secs (in tics) for a burst */ + u_longlong_t cap_bursting; /* # of ticks currently bursting */ disp_lock_t cap_usagelock; /* protects cap_usage above */ /* * Per cap statistics. @@ -75,6 +80,7 @@ typedef struct cpucap { hrtime_t cap_maxusage; /* maximum cap usage */ u_longlong_t cap_below; /* # of ticks spend below the cap */ u_longlong_t cap_above; /* # of ticks spend above the cap */ + u_longlong_t cap_above_base; /* # of ticks spent above the base */ } cpucap_t; /* diff --git a/usr/src/uts/common/sys/cred.h b/usr/src/uts/common/sys/cred.h index 5056f9a511..914f132dc0 100644 --- a/usr/src/uts/common/sys/cred.h +++ b/usr/src/uts/common/sys/cred.h @@ -93,6 +93,7 @@ extern gid_t crgetgid(const cred_t *); extern gid_t crgetrgid(const cred_t *); extern gid_t crgetsgid(const cred_t *); extern zoneid_t crgetzoneid(const cred_t *); +extern zoneid_t crgetzonedid(const cred_t *); extern projid_t crgetprojid(const cred_t *); extern cred_t *crgetmapped(const cred_t *); diff --git a/usr/src/uts/common/sys/dkio.h b/usr/src/uts/common/sys/dkio.h index eb4ddf34fe..a5b0c312f9 100644 --- a/usr/src/uts/common/sys/dkio.h +++ b/usr/src/uts/common/sys/dkio.h @@ -23,6 +23,7 @@ * Copyright (c) 1982, 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. */ #ifndef _SYS_DKIO_H @@ -237,6 +238,9 @@ struct dk_callback { #define DKIOCSETEXTPART (DKIOC|46) #endif +/* ioctl to report whether the disk is solid state or not - used for ZFS */ +#define DKIOCSOLIDSTATE (DKIOC|38) + /* * Ioctl to force driver to re-read the alternate partition and rebuild * the internal defect map. diff --git a/usr/src/uts/common/sys/dktp/dadk.h b/usr/src/uts/common/sys/dktp/dadk.h index f5c990e7c0..2178ad1f0d 100644 --- a/usr/src/uts/common/sys/dktp/dadk.h +++ b/usr/src/uts/common/sys/dktp/dadk.h @@ -65,6 +65,8 @@ struct dadk { kstat_t *dad_errstats; /* error stats */ kmutex_t dad_cmd_mutex; int dad_cmd_count; + uint32_t dad_err_cnt; /* number of recent errors */ + hrtime_t dad_last_log; /* time of last error log */ }; #define DAD_SECSIZ dad_phyg.g_secsiz diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h index fb2a0749d3..303a9c7e45 100644 --- a/usr/src/uts/common/sys/dld.h +++ b/usr/src/uts/common/sys/dld.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. 
*/ #ifndef _SYS_DLD_H @@ -191,6 +192,7 @@ typedef struct dld_ioc_rename { datalink_id_t dir_linkid1; datalink_id_t dir_linkid2; char dir_link[MAXLINKNAMELEN]; + boolean_t dir_zoneinit; } dld_ioc_rename_t; /* @@ -203,6 +205,7 @@ typedef struct dld_ioc_rename { typedef struct dld_ioc_zid { zoneid_t diz_zid; datalink_id_t diz_linkid; + boolean_t diz_transient; } dld_ioc_zid_t; /* diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h index 6bd2bbe35a..adcfe76c08 100644 --- a/usr/src/uts/common/sys/dls.h +++ b/usr/src/uts/common/sys/dls.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #ifndef _SYS_DLS_H @@ -110,7 +111,7 @@ extern void dls_devnet_close(dls_dl_handle_t); extern boolean_t dls_devnet_rebuild(); extern int dls_devnet_rename(datalink_id_t, datalink_id_t, - const char *); + const char *, boolean_t); extern int dls_devnet_create(mac_handle_t, datalink_id_t, zoneid_t); extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *, @@ -127,7 +128,7 @@ extern uint16_t dls_devnet_vid(dls_dl_handle_t); extern datalink_id_t dls_devnet_linkid(dls_dl_handle_t); extern int dls_devnet_dev2linkid(dev_t, datalink_id_t *); extern int dls_devnet_phydev(datalink_id_t, dev_t *); -extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t); +extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t, boolean_t); extern zoneid_t dls_devnet_getzid(dls_dl_handle_t); extern zoneid_t dls_devnet_getownerzid(dls_dl_handle_t); extern boolean_t dls_devnet_islinkvisible(datalink_id_t, zoneid_t); diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h index 60f51c47b5..8f7af6856c 100644 --- a/usr/src/uts/common/sys/dls_impl.h +++ b/usr/src/uts/common/sys/dls_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #ifndef _SYS_DLS_IMPL_H @@ -96,7 +97,8 @@ extern void dls_create_str_kstats(dld_str_t *); extern int dls_stat_update(kstat_t *, dls_link_t *, int); extern int dls_stat_create(const char *, int, const char *, zoneid_t, int (*)(struct kstat *, int), void *, - kstat_t **); + kstat_t **, zoneid_t); +extern void dls_stat_delete(kstat_t *); extern int dls_devnet_open_by_dev(dev_t, dls_link_t **, dls_dl_handle_t *); diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h index b4032c24d6..4f73d92118 100644 --- a/usr/src/uts/common/sys/dls_mgmt.h +++ b/usr/src/uts/common/sys/dls_mgmt.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent Inc. All rights reserved. */ #ifndef _DLS_MGMT_H @@ -165,6 +166,7 @@ typedef struct dlmgmt_door_getname { typedef struct dlmgmt_door_getlinkid { int ld_cmd; char ld_link[MAXLINKNAMELEN]; + zoneid_t ld_zoneid; } dlmgmt_door_getlinkid_t; typedef struct dlmgmt_door_getnext_s { diff --git a/usr/src/uts/common/sys/dtrace.h b/usr/src/uts/common/sys/dtrace.h index fd7612f88a..e6d9e0e675 100644 --- a/usr/src/uts/common/sys/dtrace.h +++ b/usr/src/uts/common/sys/dtrace.h @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ @@ -36,16 +36,16 @@ extern "C" { #endif -/* - * DTrace Dynamic Tracing Software: Kernel Interfaces - * - * Note: The contents of this file are private to the implementation of the - * Solaris system and DTrace subsystem and are subject to change at any time - * without notice. Applications and drivers using these interfaces will fail - * to run on future releases. These interfaces should not be used for any - * purpose except those expressly outlined in dtrace(7D) and libdtrace(3LIB). - * Please refer to the "Solaris Dynamic Tracing Guide" for more information. - */ + /* + * DTrace Dynamic Tracing Software: Kernel Interfaces + * + * Note: The contents of this file are private to the implementation of the + * Solaris system and DTrace subsystem and are subject to change at any time + * without notice. Applications and drivers using these interfaces will fail + * to run on future releases. These interfaces should not be used for any + * purpose except those expressly outlined in dtrace(7D) and libdtrace(3LIB). + * Please refer to the "Solaris Dynamic Tracing Guide" for more information. + */ #ifndef _ASM @@ -57,9 +57,9 @@ extern "C" { #include <sys/cyclic.h> #include <sys/int_limits.h> -/* - * DTrace Universal Constants and Typedefs - */ + /* + * DTrace Universal Constants and Typedefs + */ #define DTRACE_CPUALL -1 /* all CPUs */ #define DTRACE_IDNONE 0 /* invalid probe identifier */ #define DTRACE_EPIDNONE 0 /* invalid enabled probe identifier */ @@ -75,35 +75,35 @@ extern "C" { #define DTRACE_FUNCNAMELEN 128 #define DTRACE_NAMELEN 64 #define DTRACE_FULLNAMELEN (DTRACE_PROVNAMELEN + DTRACE_MODNAMELEN + \ - DTRACE_FUNCNAMELEN + DTRACE_NAMELEN + 4) + DTRACE_FUNCNAMELEN + DTRACE_NAMELEN + 4) #define DTRACE_ARGTYPELEN 128 -typedef uint32_t dtrace_id_t; /* probe identifier */ -typedef uint32_t dtrace_epid_t; /* enabled probe identifier */ -typedef uint32_t dtrace_aggid_t; /* aggregation identifier */ -typedef int64_t dtrace_aggvarid_t; /* aggregation variable identifier */ -typedef uint16_t dtrace_actkind_t; /* action kind */ -typedef int64_t dtrace_optval_t; /* option value */ -typedef uint32_t dtrace_cacheid_t; /* predicate cache identifier */ - -typedef enum dtrace_probespec { - DTRACE_PROBESPEC_NONE = -1, - DTRACE_PROBESPEC_PROVIDER = 0, - DTRACE_PROBESPEC_MOD, - DTRACE_PROBESPEC_FUNC, - DTRACE_PROBESPEC_NAME -} dtrace_probespec_t; - -/* - * DTrace Intermediate Format (DIF) - * - * The following definitions describe the DTrace Intermediate Format (DIF), a - * a RISC-like instruction set and program encoding used to represent - * predicates and actions that can be bound to DTrace probes. The constants - * below defining the number of available registers are suggested minimums; the - * compiler should use DTRACEIOC_CONF to dynamically obtain the number of - * registers provided by the current DTrace implementation. 
- */ + typedef uint32_t dtrace_id_t; /* probe identifier */ + typedef uint32_t dtrace_epid_t; /* enabled probe identifier */ + typedef uint32_t dtrace_aggid_t; /* aggregation identifier */ + typedef int64_t dtrace_aggvarid_t; /* aggregation variable identifier */ + typedef uint16_t dtrace_actkind_t; /* action kind */ + typedef int64_t dtrace_optval_t; /* option value */ + typedef uint32_t dtrace_cacheid_t; /* predicate cache identifier */ + + typedef enum dtrace_probespec { + DTRACE_PROBESPEC_NONE = -1, + DTRACE_PROBESPEC_PROVIDER = 0, + DTRACE_PROBESPEC_MOD, + DTRACE_PROBESPEC_FUNC, + DTRACE_PROBESPEC_NAME + } dtrace_probespec_t; + + /* + * DTrace Intermediate Format (DIF) + * + * The following definitions describe the DTrace Intermediate Format (DIF), a + * a RISC-like instruction set and program encoding used to represent + * predicates and actions that can be bound to DTrace probes. The constants + * below defining the number of available registers are suggested minimums; the + * compiler should use DTRACEIOC_CONF to dynamically obtain the number of + * registers provided by the current DTrace implementation. + */ #define DIF_VERSION_1 1 /* DIF version 1: Solaris 10 Beta */ #define DIF_VERSION_2 2 /* DIF version 2: Solaris 10 FCS */ #define DIF_VERSION DIF_VERSION_2 /* latest DIF instruction set version */ @@ -288,10 +288,11 @@ typedef enum dtrace_probespec { #define DIF_SUBR_INET_NTOA6 43 #define DIF_SUBR_TOUPPER 44 #define DIF_SUBR_TOLOWER 45 +#define DIF_SUBR_GETF 46 -#define DIF_SUBR_MAX 45 /* max subroutine value */ +#define DIF_SUBR_MAX 46 /* max subroutine value */ -typedef uint32_t dif_instr_t; + typedef uint32_t dif_instr_t; #define DIF_INSTR_OP(i) (((i) >> 24) & 0xff) #define DIF_INSTR_R1(i) (((i) >> 16) & 0xff) @@ -333,39 +334,39 @@ typedef uint32_t dif_instr_t; #define DIF_REG_R0 0 /* %r0 is always set to zero */ -/* - * A DTrace Intermediate Format Type (DIF Type) is used to represent the types - * of variables, function and associative array arguments, and the return type - * for each DIF object (shown below). It contains a description of the type, - * its size in bytes, and a module identifier. - */ -typedef struct dtrace_diftype { - uint8_t dtdt_kind; /* type kind (see below) */ - uint8_t dtdt_ckind; /* type kind in CTF */ - uint8_t dtdt_flags; /* type flags (see below) */ - uint8_t dtdt_pad; /* reserved for future use */ - uint32_t dtdt_size; /* type size in bytes (unless string) */ -} dtrace_diftype_t; + /* + * A DTrace Intermediate Format Type (DIF Type) is used to represent the types + * of variables, function and associative array arguments, and the return type + * for each DIF object (shown below). It contains a description of the type, + * its size in bytes, and a module identifier. + */ + typedef struct dtrace_diftype { + uint8_t dtdt_kind; /* type kind (see below) */ + uint8_t dtdt_ckind; /* type kind in CTF */ + uint8_t dtdt_flags; /* type flags (see below) */ + uint8_t dtdt_pad; /* reserved for future use */ + uint32_t dtdt_size; /* type size in bytes (unless string) */ + } dtrace_diftype_t; #define DIF_TYPE_CTF 0 /* type is a CTF type */ #define DIF_TYPE_STRING 1 /* type is a D string */ #define DIF_TF_BYREF 0x1 /* type is passed by reference */ -/* - * A DTrace Intermediate Format variable record is used to describe each of the - * variables referenced by a given DIF object. It contains an integer variable - * identifier along with variable scope and properties, as shown below. The - * size of this structure must be sizeof (int) aligned. 
- */ -typedef struct dtrace_difv { - uint32_t dtdv_name; /* variable name index in dtdo_strtab */ - uint32_t dtdv_id; /* variable reference identifier */ - uint8_t dtdv_kind; /* variable kind (see below) */ - uint8_t dtdv_scope; /* variable scope (see below) */ - uint16_t dtdv_flags; /* variable flags (see below) */ - dtrace_diftype_t dtdv_type; /* variable type (see above) */ -} dtrace_difv_t; + /* + * A DTrace Intermediate Format variable record is used to describe each of the + * variables referenced by a given DIF object. It contains an integer variable + * identifier along with variable scope and properties, as shown below. The + * size of this structure must be sizeof (int) aligned. + */ + typedef struct dtrace_difv { + uint32_t dtdv_name; /* variable name index in dtdo_strtab */ + uint32_t dtdv_id; /* variable reference identifier */ + uint8_t dtdv_kind; /* variable kind (see below) */ + uint8_t dtdv_scope; /* variable scope (see below) */ + uint16_t dtdv_flags; /* variable flags (see below) */ + dtrace_diftype_t dtdv_type; /* variable type (see above) */ + } dtrace_difv_t; #define DIFV_KIND_ARRAY 0 /* variable is an array of quantities */ #define DIFV_KIND_SCALAR 1 /* variable is a scalar quantity */ @@ -377,21 +378,21 @@ typedef struct dtrace_difv { #define DIFV_F_REF 0x1 /* variable is referenced by DIFO */ #define DIFV_F_MOD 0x2 /* variable is written by DIFO */ -/* - * DTrace Actions - * - * The upper byte determines the class of the action; the low bytes determines - * the specific action within that class. The classes of actions are as - * follows: - * - * [ no class ] <= May record process- or kernel-related data - * DTRACEACT_PROC <= Only records process-related data - * DTRACEACT_PROC_DESTRUCTIVE <= Potentially destructive to processes - * DTRACEACT_KERNEL <= Only records kernel-related data - * DTRACEACT_KERNEL_DESTRUCTIVE <= Potentially destructive to the kernel - * DTRACEACT_SPECULATIVE <= Speculation-related action - * DTRACEACT_AGGREGATION <= Aggregating action - */ + /* + * DTrace Actions + * + * The upper byte determines the class of the action; the low bytes determines + * the specific action within that class. The classes of actions are as + * follows: + * + * [ no class ] <= May record process- or kernel-related data + * DTRACEACT_PROC <= Only records process-related data + * DTRACEACT_PROC_DESTRUCTIVE <= Potentially destructive to processes + * DTRACEACT_KERNEL <= Only records kernel-related data + * DTRACEACT_KERNEL_DESTRUCTIVE <= Potentially destructive to the kernel + * DTRACEACT_SPECULATIVE <= Speculation-related action + * DTRACEACT_AGGREGATION <= Aggregating action + */ #define DTRACEACT_NONE 0 /* no action */ #define DTRACEACT_DIFEXPR 1 /* action is DIF expression */ #define DTRACEACT_EXIT 2 /* exit() action */ @@ -435,27 +436,27 @@ typedef struct dtrace_difv { #define DTRACEACT_ISDESTRUCTIVE(x) \ (DTRACEACT_CLASS(x) == DTRACEACT_PROC_DESTRUCTIVE || \ - DTRACEACT_CLASS(x) == DTRACEACT_KERNEL_DESTRUCTIVE) + DTRACEACT_CLASS(x) == DTRACEACT_KERNEL_DESTRUCTIVE) #define DTRACEACT_ISSPECULATIVE(x) \ (DTRACEACT_CLASS(x) == DTRACEACT_SPECULATIVE) #define DTRACEACT_ISPRINTFLIKE(x) \ ((x) == DTRACEACT_PRINTF || (x) == DTRACEACT_PRINTA || \ - (x) == DTRACEACT_SYSTEM || (x) == DTRACEACT_FREOPEN) - -/* - * DTrace Aggregating Actions - * - * These are functions f(x) for which the following is true: - * - * f(f(x_0) U f(x_1) U ... U f(x_n)) = f(x_0 U x_1 U ... U x_n) - * - * where x_n is a set of arbitrary data. 
Aggregating actions are in their own - * DTrace action class, DTTRACEACT_AGGREGATION. The macros provided here allow - * for easier processing of the aggregation argument and data payload for a few - * aggregating actions (notably: quantize(), lquantize(), and ustack()). - */ + (x) == DTRACEACT_SYSTEM || (x) == DTRACEACT_FREOPEN) + + /* + * DTrace Aggregating Actions + * + * These are functions f(x) for which the following is true: + * + * f(f(x_0) U f(x_1) U ... U f(x_n)) = f(x_0 U x_1 U ... U x_n) + * + * where x_n is a set of arbitrary data. Aggregating actions are in their own + * DTrace action class, DTTRACEACT_AGGREGATION. The macros provided here allow + * for easier processing of the aggregation argument and data payload for a few + * aggregating actions (notably: quantize(), lquantize(), and ustack()). + */ #define DTRACEACT_AGGREGATION 0x0700 #define DTRACEAGG_COUNT (DTRACEACT_AGGREGATION + 1) #define DTRACEAGG_MIN (DTRACEACT_AGGREGATION + 2) @@ -477,9 +478,9 @@ typedef struct dtrace_difv { #define DTRACE_QUANTIZE_BUCKETVAL(buck) \ (int64_t)((buck) < DTRACE_QUANTIZE_ZEROBUCKET ? \ - -(1LL << (DTRACE_QUANTIZE_ZEROBUCKET - 1 - (buck))) : \ - (buck) == DTRACE_QUANTIZE_ZEROBUCKET ? 0 : \ - 1LL << ((buck) - DTRACE_QUANTIZE_ZEROBUCKET - 1)) + -(1LL << (DTRACE_QUANTIZE_ZEROBUCKET - 1 - (buck))) : \ + (buck) == DTRACE_QUANTIZE_ZEROBUCKET ? 0 : \ + 1LL << ((buck) - DTRACE_QUANTIZE_ZEROBUCKET - 1)) #define DTRACE_LQUANTIZE_STEPSHIFT 48 #define DTRACE_LQUANTIZE_STEPMASK ((uint64_t)UINT16_MAX << 48) @@ -490,15 +491,15 @@ typedef struct dtrace_difv { #define DTRACE_LQUANTIZE_STEP(x) \ (uint16_t)(((x) & DTRACE_LQUANTIZE_STEPMASK) >> \ - DTRACE_LQUANTIZE_STEPSHIFT) + DTRACE_LQUANTIZE_STEPSHIFT) #define DTRACE_LQUANTIZE_LEVELS(x) \ (uint16_t)(((x) & DTRACE_LQUANTIZE_LEVELMASK) >> \ - DTRACE_LQUANTIZE_LEVELSHIFT) + DTRACE_LQUANTIZE_LEVELSHIFT) #define DTRACE_LQUANTIZE_BASE(x) \ (int32_t)(((x) & DTRACE_LQUANTIZE_BASEMASK) >> \ - DTRACE_LQUANTIZE_BASESHIFT) + DTRACE_LQUANTIZE_BASESHIFT) #define DTRACE_LLQUANTIZE_FACTORSHIFT 48 #define DTRACE_LLQUANTIZE_FACTORMASK ((uint64_t)UINT16_MAX << 48) @@ -511,19 +512,19 @@ typedef struct dtrace_difv { #define DTRACE_LLQUANTIZE_FACTOR(x) \ (uint16_t)(((x) & DTRACE_LLQUANTIZE_FACTORMASK) >> \ - DTRACE_LLQUANTIZE_FACTORSHIFT) + DTRACE_LLQUANTIZE_FACTORSHIFT) #define DTRACE_LLQUANTIZE_LOW(x) \ (uint16_t)(((x) & DTRACE_LLQUANTIZE_LOWMASK) >> \ - DTRACE_LLQUANTIZE_LOWSHIFT) + DTRACE_LLQUANTIZE_LOWSHIFT) #define DTRACE_LLQUANTIZE_HIGH(x) \ (uint16_t)(((x) & DTRACE_LLQUANTIZE_HIGHMASK) >> \ - DTRACE_LLQUANTIZE_HIGHSHIFT) + DTRACE_LLQUANTIZE_HIGHSHIFT) #define DTRACE_LLQUANTIZE_NSTEP(x) \ (uint16_t)(((x) & DTRACE_LLQUANTIZE_NSTEPMASK) >> \ - DTRACE_LLQUANTIZE_NSTEPSHIFT) + DTRACE_LLQUANTIZE_NSTEPSHIFT) #define DTRACE_USTACK_NFRAMES(x) (uint32_t)((x) & UINT32_MAX) #define DTRACE_USTACK_STRSIZE(x) (uint32_t)((x) >> 32) @@ -540,72 +541,72 @@ typedef struct dtrace_difv { #define DTRACE_PTR(type, name) type *name #endif -/* - * DTrace Object Format (DOF) - * - * DTrace programs can be persistently encoded in the DOF format so that they - * may be embedded in other programs (for example, in an ELF file) or in the - * dtrace driver configuration file for use in anonymous tracing. The DOF - * format is versioned and extensible so that it can be revised and so that - * internal data structures can be modified or extended compatibly. 
All DOF - * structures use fixed-size types, so the 32-bit and 64-bit representations - * are identical and consumers can use either data model transparently. - * - * The file layout is structured as follows: - * - * +---------------+-------------------+----- ... ----+---- ... ------+ - * | dof_hdr_t | dof_sec_t[ ... ] | loadable | non-loadable | - * | (file header) | (section headers) | section data | section data | - * +---------------+-------------------+----- ... ----+---- ... ------+ - * |<------------ dof_hdr.dofh_loadsz --------------->| | - * |<------------ dof_hdr.dofh_filesz ------------------------------->| - * - * The file header stores meta-data including a magic number, data model for - * the instrumentation, data encoding, and properties of the DIF code within. - * The header describes its own size and the size of the section headers. By - * convention, an array of section headers follows the file header, and then - * the data for all loadable sections and unloadable sections. This permits - * consumer code to easily download the headers and all loadable data into the - * DTrace driver in one contiguous chunk, omitting other extraneous sections. - * - * The section headers describe the size, offset, alignment, and section type - * for each section. Sections are described using a set of #defines that tell - * the consumer what kind of data is expected. Sections can contain links to - * other sections by storing a dof_secidx_t, an index into the section header - * array, inside of the section data structures. The section header includes - * an entry size so that sections with data arrays can grow their structures. - * - * The DOF data itself can contain many snippets of DIF (i.e. >1 DIFOs), which - * are represented themselves as a collection of related DOF sections. This - * permits us to change the set of sections associated with a DIFO over time, - * and also permits us to encode DIFOs that contain different sets of sections. - * When a DOF section wants to refer to a DIFO, it stores the dof_secidx_t of a - * section of type DOF_SECT_DIFOHDR. This section's data is then an array of - * dof_secidx_t's which in turn denote the sections associated with this DIFO. - * - * This loose coupling of the file structure (header and sections) to the - * structure of the DTrace program itself (ECB descriptions, action - * descriptions, and DIFOs) permits activities such as relocation processing - * to occur in a single pass without having to understand D program structure. - * - * Finally, strings are always stored in ELF-style string tables along with a - * string table section index and string table offset. Therefore strings in - * DOF are always arbitrary-length and not bound to the current implementation. - */ + /* + * DTrace Object Format (DOF) + * + * DTrace programs can be persistently encoded in the DOF format so that they + * may be embedded in other programs (for example, in an ELF file) or in the + * dtrace driver configuration file for use in anonymous tracing. The DOF + * format is versioned and extensible so that it can be revised and so that + * internal data structures can be modified or extended compatibly. All DOF + * structures use fixed-size types, so the 32-bit and 64-bit representations + * are identical and consumers can use either data model transparently. + * + * The file layout is structured as follows: + * + * +---------------+-------------------+----- ... ----+---- ... ------+ + * | dof_hdr_t | dof_sec_t[ ... 
] | loadable | non-loadable | + * | (file header) | (section headers) | section data | section data | + * +---------------+-------------------+----- ... ----+---- ... ------+ + * |<------------ dof_hdr.dofh_loadsz --------------->| | + * |<------------ dof_hdr.dofh_filesz ------------------------------->| + * + * The file header stores meta-data including a magic number, data model for + * the instrumentation, data encoding, and properties of the DIF code within. + * The header describes its own size and the size of the section headers. By + * convention, an array of section headers follows the file header, and then + * the data for all loadable sections and unloadable sections. This permits + * consumer code to easily download the headers and all loadable data into the + * DTrace driver in one contiguous chunk, omitting other extraneous sections. + * + * The section headers describe the size, offset, alignment, and section type + * for each section. Sections are described using a set of #defines that tell + * the consumer what kind of data is expected. Sections can contain links to + * other sections by storing a dof_secidx_t, an index into the section header + * array, inside of the section data structures. The section header includes + * an entry size so that sections with data arrays can grow their structures. + * + * The DOF data itself can contain many snippets of DIF (i.e. >1 DIFOs), which + * are represented themselves as a collection of related DOF sections. This + * permits us to change the set of sections associated with a DIFO over time, + * and also permits us to encode DIFOs that contain different sets of sections. + * When a DOF section wants to refer to a DIFO, it stores the dof_secidx_t of a + * section of type DOF_SECT_DIFOHDR. This section's data is then an array of + * dof_secidx_t's which in turn denote the sections associated with this DIFO. + * + * This loose coupling of the file structure (header and sections) to the + * structure of the DTrace program itself (ECB descriptions, action + * descriptions, and DIFOs) permits activities such as relocation processing + * to occur in a single pass without having to understand D program structure. + * + * Finally, strings are always stored in ELF-style string tables along with a + * string table section index and string table offset. Therefore strings in + * DOF are always arbitrary-length and not bound to the current implementation. 
+ */ #define DOF_ID_SIZE 16 /* total size of dofh_ident[] in bytes */ -typedef struct dof_hdr { - uint8_t dofh_ident[DOF_ID_SIZE]; /* identification bytes (see below) */ - uint32_t dofh_flags; /* file attribute flags (if any) */ - uint32_t dofh_hdrsize; /* size of file header in bytes */ - uint32_t dofh_secsize; /* size of section header in bytes */ - uint32_t dofh_secnum; /* number of section headers */ - uint64_t dofh_secoff; /* file offset of section headers */ - uint64_t dofh_loadsz; /* file size of loadable portion */ - uint64_t dofh_filesz; /* file size of entire DOF file */ - uint64_t dofh_pad; /* reserved for future use */ -} dof_hdr_t; + typedef struct dof_hdr { + uint8_t dofh_ident[DOF_ID_SIZE]; /* identification bytes (see below) */ + uint32_t dofh_flags; /* file attribute flags (if any) */ + uint32_t dofh_hdrsize; /* size of file header in bytes */ + uint32_t dofh_secsize; /* size of section header in bytes */ + uint32_t dofh_secnum; /* number of section headers */ + uint64_t dofh_secoff; /* file offset of section headers */ + uint64_t dofh_loadsz; /* file size of loadable portion */ + uint64_t dofh_filesz; /* file size of entire DOF file */ + uint64_t dofh_pad; /* reserved for future use */ + } dof_hdr_t; #define DOF_ID_MAG0 0 /* first byte of magic number */ #define DOF_ID_MAG1 1 /* second byte of magic number */ @@ -653,20 +654,20 @@ typedef struct dof_hdr { #define DOF_FL_VALID 0 /* mask of all valid dofh_flags bits */ -typedef uint32_t dof_secidx_t; /* section header table index type */ -typedef uint32_t dof_stridx_t; /* string table index type */ + typedef uint32_t dof_secidx_t; /* section header table index type */ + typedef uint32_t dof_stridx_t; /* string table index type */ #define DOF_SECIDX_NONE (-1U) /* null value for section indices */ #define DOF_STRIDX_NONE (-1U) /* null value for string indices */ -typedef struct dof_sec { - uint32_t dofs_type; /* section type (see below) */ - uint32_t dofs_align; /* section data memory alignment */ - uint32_t dofs_flags; /* section flags (if any) */ - uint32_t dofs_entsize; /* size of section entry (if table) */ - uint64_t dofs_offset; /* offset of section data within file */ - uint64_t dofs_size; /* size of section data in bytes */ -} dof_sec_t; + typedef struct dof_sec { + uint32_t dofs_type; /* section type (see below) */ + uint32_t dofs_align; /* section data memory alignment */ + uint32_t dofs_flags; /* section flags (if any) */ + uint32_t dofs_entsize; /* size of section entry (if table) */ + uint64_t dofs_offset; /* offset of section data within file */ + uint64_t dofs_size; /* size of section data in bytes */ + } dof_sec_t; #define DOF_SECT_NONE 0 /* null section */ #define DOF_SECT_COMMENTS 1 /* compiler comments */ @@ -700,297 +701,297 @@ typedef struct dof_sec { #define DOF_SEC_ISLOADABLE(x) \ (((x) == DOF_SECT_ECBDESC) || ((x) == DOF_SECT_PROBEDESC) || \ - ((x) == DOF_SECT_ACTDESC) || ((x) == DOF_SECT_DIFOHDR) || \ - ((x) == DOF_SECT_DIF) || ((x) == DOF_SECT_STRTAB) || \ - ((x) == DOF_SECT_VARTAB) || ((x) == DOF_SECT_RELTAB) || \ - ((x) == DOF_SECT_TYPTAB) || ((x) == DOF_SECT_URELHDR) || \ - ((x) == DOF_SECT_KRELHDR) || ((x) == DOF_SECT_OPTDESC) || \ - ((x) == DOF_SECT_PROVIDER) || ((x) == DOF_SECT_PROBES) || \ - ((x) == DOF_SECT_PRARGS) || ((x) == DOF_SECT_PROFFS) || \ - ((x) == DOF_SECT_INTTAB) || ((x) == DOF_SECT_XLTAB) || \ - ((x) == DOF_SECT_XLMEMBERS) || ((x) == DOF_SECT_XLIMPORT) || \ - ((x) == DOF_SECT_XLIMPORT) || ((x) == DOF_SECT_XLEXPORT) || \ - ((x) == DOF_SECT_PREXPORT) || ((x) == DOF_SECT_PRENOFFS)) - 
-typedef struct dof_ecbdesc { - dof_secidx_t dofe_probes; /* link to DOF_SECT_PROBEDESC */ - dof_secidx_t dofe_pred; /* link to DOF_SECT_DIFOHDR */ - dof_secidx_t dofe_actions; /* link to DOF_SECT_ACTDESC */ - uint32_t dofe_pad; /* reserved for future use */ - uint64_t dofe_uarg; /* user-supplied library argument */ -} dof_ecbdesc_t; - -typedef struct dof_probedesc { - dof_secidx_t dofp_strtab; /* link to DOF_SECT_STRTAB section */ - dof_stridx_t dofp_provider; /* provider string */ - dof_stridx_t dofp_mod; /* module string */ - dof_stridx_t dofp_func; /* function string */ - dof_stridx_t dofp_name; /* name string */ - uint32_t dofp_id; /* probe identifier (or zero) */ -} dof_probedesc_t; - -typedef struct dof_actdesc { - dof_secidx_t dofa_difo; /* link to DOF_SECT_DIFOHDR */ - dof_secidx_t dofa_strtab; /* link to DOF_SECT_STRTAB section */ - uint32_t dofa_kind; /* action kind (DTRACEACT_* constant) */ - uint32_t dofa_ntuple; /* number of subsequent tuple actions */ - uint64_t dofa_arg; /* kind-specific argument */ - uint64_t dofa_uarg; /* user-supplied argument */ -} dof_actdesc_t; - -typedef struct dof_difohdr { - dtrace_diftype_t dofd_rtype; /* return type for this fragment */ - dof_secidx_t dofd_links[1]; /* variable length array of indices */ -} dof_difohdr_t; - -typedef struct dof_relohdr { - dof_secidx_t dofr_strtab; /* link to DOF_SECT_STRTAB for names */ - dof_secidx_t dofr_relsec; /* link to DOF_SECT_RELTAB for relos */ - dof_secidx_t dofr_tgtsec; /* link to section we are relocating */ -} dof_relohdr_t; - -typedef struct dof_relodesc { - dof_stridx_t dofr_name; /* string name of relocation symbol */ - uint32_t dofr_type; /* relo type (DOF_RELO_* constant) */ - uint64_t dofr_offset; /* byte offset for relocation */ - uint64_t dofr_data; /* additional type-specific data */ -} dof_relodesc_t; + ((x) == DOF_SECT_ACTDESC) || ((x) == DOF_SECT_DIFOHDR) || \ + ((x) == DOF_SECT_DIF) || ((x) == DOF_SECT_STRTAB) || \ + ((x) == DOF_SECT_VARTAB) || ((x) == DOF_SECT_RELTAB) || \ + ((x) == DOF_SECT_TYPTAB) || ((x) == DOF_SECT_URELHDR) || \ + ((x) == DOF_SECT_KRELHDR) || ((x) == DOF_SECT_OPTDESC) || \ + ((x) == DOF_SECT_PROVIDER) || ((x) == DOF_SECT_PROBES) || \ + ((x) == DOF_SECT_PRARGS) || ((x) == DOF_SECT_PROFFS) || \ + ((x) == DOF_SECT_INTTAB) || ((x) == DOF_SECT_XLTAB) || \ + ((x) == DOF_SECT_XLMEMBERS) || ((x) == DOF_SECT_XLIMPORT) || \ + ((x) == DOF_SECT_XLIMPORT) || ((x) == DOF_SECT_XLEXPORT) || \ + ((x) == DOF_SECT_PREXPORT) || ((x) == DOF_SECT_PRENOFFS)) + + typedef struct dof_ecbdesc { + dof_secidx_t dofe_probes; /* link to DOF_SECT_PROBEDESC */ + dof_secidx_t dofe_pred; /* link to DOF_SECT_DIFOHDR */ + dof_secidx_t dofe_actions; /* link to DOF_SECT_ACTDESC */ + uint32_t dofe_pad; /* reserved for future use */ + uint64_t dofe_uarg; /* user-supplied library argument */ + } dof_ecbdesc_t; + + typedef struct dof_probedesc { + dof_secidx_t dofp_strtab; /* link to DOF_SECT_STRTAB section */ + dof_stridx_t dofp_provider; /* provider string */ + dof_stridx_t dofp_mod; /* module string */ + dof_stridx_t dofp_func; /* function string */ + dof_stridx_t dofp_name; /* name string */ + uint32_t dofp_id; /* probe identifier (or zero) */ + } dof_probedesc_t; + + typedef struct dof_actdesc { + dof_secidx_t dofa_difo; /* link to DOF_SECT_DIFOHDR */ + dof_secidx_t dofa_strtab; /* link to DOF_SECT_STRTAB section */ + uint32_t dofa_kind; /* action kind (DTRACEACT_* constant) */ + uint32_t dofa_ntuple; /* number of subsequent tuple actions */ + uint64_t dofa_arg; /* kind-specific argument */ + uint64_t 
dofa_uarg; /* user-supplied argument */ + } dof_actdesc_t; + + typedef struct dof_difohdr { + dtrace_diftype_t dofd_rtype; /* return type for this fragment */ + dof_secidx_t dofd_links[1]; /* variable length array of indices */ + } dof_difohdr_t; + + typedef struct dof_relohdr { + dof_secidx_t dofr_strtab; /* link to DOF_SECT_STRTAB for names */ + dof_secidx_t dofr_relsec; /* link to DOF_SECT_RELTAB for relos */ + dof_secidx_t dofr_tgtsec; /* link to section we are relocating */ + } dof_relohdr_t; + + typedef struct dof_relodesc { + dof_stridx_t dofr_name; /* string name of relocation symbol */ + uint32_t dofr_type; /* relo type (DOF_RELO_* constant) */ + uint64_t dofr_offset; /* byte offset for relocation */ + uint64_t dofr_data; /* additional type-specific data */ + } dof_relodesc_t; #define DOF_RELO_NONE 0 /* empty relocation entry */ #define DOF_RELO_SETX 1 /* relocate setx value */ -typedef struct dof_optdesc { - uint32_t dofo_option; /* option identifier */ - dof_secidx_t dofo_strtab; /* string table, if string option */ - uint64_t dofo_value; /* option value or string index */ -} dof_optdesc_t; + typedef struct dof_optdesc { + uint32_t dofo_option; /* option identifier */ + dof_secidx_t dofo_strtab; /* string table, if string option */ + uint64_t dofo_value; /* option value or string index */ + } dof_optdesc_t; -typedef uint32_t dof_attr_t; /* encoded stability attributes */ + typedef uint32_t dof_attr_t; /* encoded stability attributes */ #define DOF_ATTR(n, d, c) (((n) << 24) | ((d) << 16) | ((c) << 8)) #define DOF_ATTR_NAME(a) (((a) >> 24) & 0xff) #define DOF_ATTR_DATA(a) (((a) >> 16) & 0xff) #define DOF_ATTR_CLASS(a) (((a) >> 8) & 0xff) -typedef struct dof_provider { - dof_secidx_t dofpv_strtab; /* link to DOF_SECT_STRTAB section */ - dof_secidx_t dofpv_probes; /* link to DOF_SECT_PROBES section */ - dof_secidx_t dofpv_prargs; /* link to DOF_SECT_PRARGS section */ - dof_secidx_t dofpv_proffs; /* link to DOF_SECT_PROFFS section */ - dof_stridx_t dofpv_name; /* provider name string */ - dof_attr_t dofpv_provattr; /* provider attributes */ - dof_attr_t dofpv_modattr; /* module attributes */ - dof_attr_t dofpv_funcattr; /* function attributes */ - dof_attr_t dofpv_nameattr; /* name attributes */ - dof_attr_t dofpv_argsattr; /* args attributes */ - dof_secidx_t dofpv_prenoffs; /* link to DOF_SECT_PRENOFFS section */ -} dof_provider_t; - -typedef struct dof_probe { - uint64_t dofpr_addr; /* probe base address or offset */ - dof_stridx_t dofpr_func; /* probe function string */ - dof_stridx_t dofpr_name; /* probe name string */ - dof_stridx_t dofpr_nargv; /* native argument type strings */ - dof_stridx_t dofpr_xargv; /* translated argument type strings */ - uint32_t dofpr_argidx; /* index of first argument mapping */ - uint32_t dofpr_offidx; /* index of first offset entry */ - uint8_t dofpr_nargc; /* native argument count */ - uint8_t dofpr_xargc; /* translated argument count */ - uint16_t dofpr_noffs; /* number of offset entries for probe */ - uint32_t dofpr_enoffidx; /* index of first is-enabled offset */ - uint16_t dofpr_nenoffs; /* number of is-enabled offsets */ - uint16_t dofpr_pad1; /* reserved for future use */ - uint32_t dofpr_pad2; /* reserved for future use */ -} dof_probe_t; - -typedef struct dof_xlator { - dof_secidx_t dofxl_members; /* link to DOF_SECT_XLMEMBERS section */ - dof_secidx_t dofxl_strtab; /* link to DOF_SECT_STRTAB section */ - dof_stridx_t dofxl_argv; /* input parameter type strings */ - uint32_t dofxl_argc; /* input parameter list length */ - dof_stridx_t 
dofxl_type; /* output type string name */ - dof_attr_t dofxl_attr; /* output stability attributes */ -} dof_xlator_t; - -typedef struct dof_xlmember { - dof_secidx_t dofxm_difo; /* member link to DOF_SECT_DIFOHDR */ - dof_stridx_t dofxm_name; /* member name */ - dtrace_diftype_t dofxm_type; /* member type */ -} dof_xlmember_t; - -typedef struct dof_xlref { - dof_secidx_t dofxr_xlator; /* link to DOF_SECT_XLATORS section */ - uint32_t dofxr_member; /* index of referenced dof_xlmember */ - uint32_t dofxr_argn; /* index of argument for DIF_OP_XLARG */ -} dof_xlref_t; - -/* - * DTrace Intermediate Format Object (DIFO) - * - * A DIFO is used to store the compiled DIF for a D expression, its return - * type, and its string and variable tables. The string table is a single - * buffer of character data into which sets instructions and variable - * references can reference strings using a byte offset. The variable table - * is an array of dtrace_difv_t structures that describe the name and type of - * each variable and the id used in the DIF code. This structure is described - * above in the DIF section of this header file. The DIFO is used at both - * user-level (in the library) and in the kernel, but the structure is never - * passed between the two: the DOF structures form the only interface. As a - * result, the definition can change depending on the presence of _KERNEL. - */ -typedef struct dtrace_difo { - dif_instr_t *dtdo_buf; /* instruction buffer */ - uint64_t *dtdo_inttab; /* integer table (optional) */ - char *dtdo_strtab; /* string table (optional) */ - dtrace_difv_t *dtdo_vartab; /* variable table (optional) */ - uint_t dtdo_len; /* length of instruction buffer */ - uint_t dtdo_intlen; /* length of integer table */ - uint_t dtdo_strlen; /* length of string table */ - uint_t dtdo_varlen; /* length of variable table */ - dtrace_diftype_t dtdo_rtype; /* return type */ - uint_t dtdo_refcnt; /* owner reference count */ - uint_t dtdo_destructive; /* invokes destructive subroutines */ + typedef struct dof_provider { + dof_secidx_t dofpv_strtab; /* link to DOF_SECT_STRTAB section */ + dof_secidx_t dofpv_probes; /* link to DOF_SECT_PROBES section */ + dof_secidx_t dofpv_prargs; /* link to DOF_SECT_PRARGS section */ + dof_secidx_t dofpv_proffs; /* link to DOF_SECT_PROFFS section */ + dof_stridx_t dofpv_name; /* provider name string */ + dof_attr_t dofpv_provattr; /* provider attributes */ + dof_attr_t dofpv_modattr; /* module attributes */ + dof_attr_t dofpv_funcattr; /* function attributes */ + dof_attr_t dofpv_nameattr; /* name attributes */ + dof_attr_t dofpv_argsattr; /* args attributes */ + dof_secidx_t dofpv_prenoffs; /* link to DOF_SECT_PRENOFFS section */ + } dof_provider_t; + + typedef struct dof_probe { + uint64_t dofpr_addr; /* probe base address or offset */ + dof_stridx_t dofpr_func; /* probe function string */ + dof_stridx_t dofpr_name; /* probe name string */ + dof_stridx_t dofpr_nargv; /* native argument type strings */ + dof_stridx_t dofpr_xargv; /* translated argument type strings */ + uint32_t dofpr_argidx; /* index of first argument mapping */ + uint32_t dofpr_offidx; /* index of first offset entry */ + uint8_t dofpr_nargc; /* native argument count */ + uint8_t dofpr_xargc; /* translated argument count */ + uint16_t dofpr_noffs; /* number of offset entries for probe */ + uint32_t dofpr_enoffidx; /* index of first is-enabled offset */ + uint16_t dofpr_nenoffs; /* number of is-enabled offsets */ + uint16_t dofpr_pad1; /* reserved for future use */ + uint32_t dofpr_pad2; /* 
reserved for future use */ + } dof_probe_t; + + typedef struct dof_xlator { + dof_secidx_t dofxl_members; /* link to DOF_SECT_XLMEMBERS section */ + dof_secidx_t dofxl_strtab; /* link to DOF_SECT_STRTAB section */ + dof_stridx_t dofxl_argv; /* input parameter type strings */ + uint32_t dofxl_argc; /* input parameter list length */ + dof_stridx_t dofxl_type; /* output type string name */ + dof_attr_t dofxl_attr; /* output stability attributes */ + } dof_xlator_t; + + typedef struct dof_xlmember { + dof_secidx_t dofxm_difo; /* member link to DOF_SECT_DIFOHDR */ + dof_stridx_t dofxm_name; /* member name */ + dtrace_diftype_t dofxm_type; /* member type */ + } dof_xlmember_t; + + typedef struct dof_xlref { + dof_secidx_t dofxr_xlator; /* link to DOF_SECT_XLATORS section */ + uint32_t dofxr_member; /* index of referenced dof_xlmember */ + uint32_t dofxr_argn; /* index of argument for DIF_OP_XLARG */ + } dof_xlref_t; + + /* + * DTrace Intermediate Format Object (DIFO) + * + * A DIFO is used to store the compiled DIF for a D expression, its return + * type, and its string and variable tables. The string table is a single + * buffer of character data into which sets instructions and variable + * references can reference strings using a byte offset. The variable table + * is an array of dtrace_difv_t structures that describe the name and type of + * each variable and the id used in the DIF code. This structure is described + * above in the DIF section of this header file. The DIFO is used at both + * user-level (in the library) and in the kernel, but the structure is never + * passed between the two: the DOF structures form the only interface. As a + * result, the definition can change depending on the presence of _KERNEL. + */ + typedef struct dtrace_difo { + dif_instr_t *dtdo_buf; /* instruction buffer */ + uint64_t *dtdo_inttab; /* integer table (optional) */ + char *dtdo_strtab; /* string table (optional) */ + dtrace_difv_t *dtdo_vartab; /* variable table (optional) */ + uint_t dtdo_len; /* length of instruction buffer */ + uint_t dtdo_intlen; /* length of integer table */ + uint_t dtdo_strlen; /* length of string table */ + uint_t dtdo_varlen; /* length of variable table */ + dtrace_diftype_t dtdo_rtype; /* return type */ + uint_t dtdo_refcnt; /* owner reference count */ + uint_t dtdo_destructive; /* invokes destructive subroutines */ #ifndef _KERNEL - dof_relodesc_t *dtdo_kreltab; /* kernel relocations */ - dof_relodesc_t *dtdo_ureltab; /* user relocations */ - struct dt_node **dtdo_xlmtab; /* translator references */ - uint_t dtdo_krelen; /* length of krelo table */ - uint_t dtdo_urelen; /* length of urelo table */ - uint_t dtdo_xlmlen; /* length of translator table */ + dof_relodesc_t *dtdo_kreltab; /* kernel relocations */ + dof_relodesc_t *dtdo_ureltab; /* user relocations */ + struct dt_node **dtdo_xlmtab; /* translator references */ + uint_t dtdo_krelen; /* length of krelo table */ + uint_t dtdo_urelen; /* length of urelo table */ + uint_t dtdo_xlmlen; /* length of translator table */ #endif -} dtrace_difo_t; - -/* - * DTrace Enabling Description Structures - * - * When DTrace is tracking the description of a DTrace enabling entity (probe, - * predicate, action, ECB, record, etc.), it does so in a description - * structure. These structures all end in "desc", and are used at both - * user-level and in the kernel -- but (with the exception of - * dtrace_probedesc_t) they are never passed between them. 
Typically, - * user-level will use the description structures when assembling an enabling. - * It will then distill those description structures into a DOF object (see - * above), and send it into the kernel. The kernel will again use the - * description structures to create a description of the enabling as it reads - * the DOF. When the description is complete, the enabling will be actually - * created -- turning it into the structures that represent the enabling - * instead of merely describing it. Not surprisingly, the description - * structures bear a strong resemblance to the DOF structures that act as their - * conduit. - */ -struct dtrace_predicate; - -typedef struct dtrace_probedesc { - dtrace_id_t dtpd_id; /* probe identifier */ - char dtpd_provider[DTRACE_PROVNAMELEN]; /* probe provider name */ - char dtpd_mod[DTRACE_MODNAMELEN]; /* probe module name */ - char dtpd_func[DTRACE_FUNCNAMELEN]; /* probe function name */ - char dtpd_name[DTRACE_NAMELEN]; /* probe name */ -} dtrace_probedesc_t; - -typedef struct dtrace_repldesc { - dtrace_probedesc_t dtrpd_match; /* probe descr. to match */ - dtrace_probedesc_t dtrpd_create; /* probe descr. to create */ -} dtrace_repldesc_t; - -typedef struct dtrace_preddesc { - dtrace_difo_t *dtpdd_difo; /* pointer to DIF object */ - struct dtrace_predicate *dtpdd_predicate; /* pointer to predicate */ -} dtrace_preddesc_t; - -typedef struct dtrace_actdesc { - dtrace_difo_t *dtad_difo; /* pointer to DIF object */ - struct dtrace_actdesc *dtad_next; /* next action */ - dtrace_actkind_t dtad_kind; /* kind of action */ - uint32_t dtad_ntuple; /* number in tuple */ - uint64_t dtad_arg; /* action argument */ - uint64_t dtad_uarg; /* user argument */ - int dtad_refcnt; /* reference count */ -} dtrace_actdesc_t; - -typedef struct dtrace_ecbdesc { - dtrace_actdesc_t *dted_action; /* action description(s) */ - dtrace_preddesc_t dted_pred; /* predicate description */ - dtrace_probedesc_t dted_probe; /* probe description */ - uint64_t dted_uarg; /* library argument */ - int dted_refcnt; /* reference count */ -} dtrace_ecbdesc_t; - -/* - * DTrace Metadata Description Structures - * - * DTrace separates the trace data stream from the metadata stream. The only - * metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID + - * timestamp) or (in the case of aggregations) aggregation identifiers. To - * determine the structure of the data, DTrace consumers pass the token to the - * kernel, and receive in return a corresponding description of the enabled - * probe (via the dtrace_eprobedesc structure) or the aggregation (via the - * dtrace_aggdesc structure). Both of these structures are expressed in terms - * of record descriptions (via the dtrace_recdesc structure) that describe the - * exact structure of the data. Some record descriptions may also contain a - * format identifier; this additional bit of metadata can be retrieved from the - * kernel, for which a format description is returned via the dtrace_fmtdesc - * structure. Note that all four of these structures must be bitness-neutral - * to allow for a 32-bit DTrace consumer on a 64-bit kernel. 
- */ -typedef struct dtrace_recdesc { - dtrace_actkind_t dtrd_action; /* kind of action */ - uint32_t dtrd_size; /* size of record */ - uint32_t dtrd_offset; /* offset in ECB's data */ - uint16_t dtrd_alignment; /* required alignment */ - uint16_t dtrd_format; /* format, if any */ - uint64_t dtrd_arg; /* action argument */ - uint64_t dtrd_uarg; /* user argument */ -} dtrace_recdesc_t; - -typedef struct dtrace_eprobedesc { - dtrace_epid_t dtepd_epid; /* enabled probe ID */ - dtrace_id_t dtepd_probeid; /* probe ID */ - uint64_t dtepd_uarg; /* library argument */ - uint32_t dtepd_size; /* total size */ - int dtepd_nrecs; /* number of records */ - dtrace_recdesc_t dtepd_rec[1]; /* records themselves */ -} dtrace_eprobedesc_t; - -typedef struct dtrace_aggdesc { - DTRACE_PTR(char, dtagd_name); /* not filled in by kernel */ - dtrace_aggvarid_t dtagd_varid; /* not filled in by kernel */ - int dtagd_flags; /* not filled in by kernel */ - dtrace_aggid_t dtagd_id; /* aggregation ID */ - dtrace_epid_t dtagd_epid; /* enabled probe ID */ - uint32_t dtagd_size; /* size in bytes */ - int dtagd_nrecs; /* number of records */ - uint32_t dtagd_pad; /* explicit padding */ - dtrace_recdesc_t dtagd_rec[1]; /* record descriptions */ -} dtrace_aggdesc_t; - -typedef struct dtrace_fmtdesc { - DTRACE_PTR(char, dtfd_string); /* format string */ - int dtfd_length; /* length of format string */ - uint16_t dtfd_format; /* format identifier */ -} dtrace_fmtdesc_t; + } dtrace_difo_t; + + /* + * DTrace Enabling Description Structures + * + * When DTrace is tracking the description of a DTrace enabling entity (probe, + * predicate, action, ECB, record, etc.), it does so in a description + * structure. These structures all end in "desc", and are used at both + * user-level and in the kernel -- but (with the exception of + * dtrace_probedesc_t) they are never passed between them. Typically, + * user-level will use the description structures when assembling an enabling. + * It will then distill those description structures into a DOF object (see + * above), and send it into the kernel. The kernel will again use the + * description structures to create a description of the enabling as it reads + * the DOF. When the description is complete, the enabling will be actually + * created -- turning it into the structures that represent the enabling + * instead of merely describing it. Not surprisingly, the description + * structures bear a strong resemblance to the DOF structures that act as their + * conduit. + */ + struct dtrace_predicate; + + typedef struct dtrace_probedesc { + dtrace_id_t dtpd_id; /* probe identifier */ + char dtpd_provider[DTRACE_PROVNAMELEN]; /* probe provider name */ + char dtpd_mod[DTRACE_MODNAMELEN]; /* probe module name */ + char dtpd_func[DTRACE_FUNCNAMELEN]; /* probe function name */ + char dtpd_name[DTRACE_NAMELEN]; /* probe name */ + } dtrace_probedesc_t; + + typedef struct dtrace_repldesc { + dtrace_probedesc_t dtrpd_match; /* probe descr. to match */ + dtrace_probedesc_t dtrpd_create; /* probe descr. 
to create */ + } dtrace_repldesc_t; + + typedef struct dtrace_preddesc { + dtrace_difo_t *dtpdd_difo; /* pointer to DIF object */ + struct dtrace_predicate *dtpdd_predicate; /* pointer to predicate */ + } dtrace_preddesc_t; + + typedef struct dtrace_actdesc { + dtrace_difo_t *dtad_difo; /* pointer to DIF object */ + struct dtrace_actdesc *dtad_next; /* next action */ + dtrace_actkind_t dtad_kind; /* kind of action */ + uint32_t dtad_ntuple; /* number in tuple */ + uint64_t dtad_arg; /* action argument */ + uint64_t dtad_uarg; /* user argument */ + int dtad_refcnt; /* reference count */ + } dtrace_actdesc_t; + + typedef struct dtrace_ecbdesc { + dtrace_actdesc_t *dted_action; /* action description(s) */ + dtrace_preddesc_t dted_pred; /* predicate description */ + dtrace_probedesc_t dted_probe; /* probe description */ + uint64_t dted_uarg; /* library argument */ + int dted_refcnt; /* reference count */ + } dtrace_ecbdesc_t; + + /* + * DTrace Metadata Description Structures + * + * DTrace separates the trace data stream from the metadata stream. The only + * metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID + + * timestamp) or (in the case of aggregations) aggregation identifiers. To + * determine the structure of the data, DTrace consumers pass the token to the + * kernel, and receive in return a corresponding description of the enabled + * probe (via the dtrace_eprobedesc structure) or the aggregation (via the + * dtrace_aggdesc structure). Both of these structures are expressed in terms + * of record descriptions (via the dtrace_recdesc structure) that describe the + * exact structure of the data. Some record descriptions may also contain a + * format identifier; this additional bit of metadata can be retrieved from the + * kernel, for which a format description is returned via the dtrace_fmtdesc + * structure. Note that all four of these structures must be bitness-neutral + * to allow for a 32-bit DTrace consumer on a 64-bit kernel. + */ + typedef struct dtrace_recdesc { + dtrace_actkind_t dtrd_action; /* kind of action */ + uint32_t dtrd_size; /* size of record */ + uint32_t dtrd_offset; /* offset in ECB's data */ + uint16_t dtrd_alignment; /* required alignment */ + uint16_t dtrd_format; /* format, if any */ + uint64_t dtrd_arg; /* action argument */ + uint64_t dtrd_uarg; /* user argument */ + } dtrace_recdesc_t; + + typedef struct dtrace_eprobedesc { + dtrace_epid_t dtepd_epid; /* enabled probe ID */ + dtrace_id_t dtepd_probeid; /* probe ID */ + uint64_t dtepd_uarg; /* library argument */ + uint32_t dtepd_size; /* total size */ + int dtepd_nrecs; /* number of records */ + dtrace_recdesc_t dtepd_rec[1]; /* records themselves */ + } dtrace_eprobedesc_t; + + typedef struct dtrace_aggdesc { + DTRACE_PTR(char, dtagd_name); /* not filled in by kernel */ + dtrace_aggvarid_t dtagd_varid; /* not filled in by kernel */ + int dtagd_flags; /* not filled in by kernel */ + dtrace_aggid_t dtagd_id; /* aggregation ID */ + dtrace_epid_t dtagd_epid; /* enabled probe ID */ + uint32_t dtagd_size; /* size in bytes */ + int dtagd_nrecs; /* number of records */ + uint32_t dtagd_pad; /* explicit padding */ + dtrace_recdesc_t dtagd_rec[1]; /* record descriptions */ + } dtrace_aggdesc_t; + + typedef struct dtrace_fmtdesc { + DTRACE_PTR(char, dtfd_string); /* format string */ + int dtfd_length; /* length of format string */ + uint16_t dtfd_format; /* format identifier */ + } dtrace_fmtdesc_t; #define DTRACE_SIZEOF_EPROBEDESC(desc) \ (sizeof (dtrace_eprobedesc_t) + ((desc)->dtepd_nrecs ? 
\ - (((desc)->dtepd_nrecs - 1) * sizeof (dtrace_recdesc_t)) : 0)) + (((desc)->dtepd_nrecs - 1) * sizeof (dtrace_recdesc_t)) : 0)) #define DTRACE_SIZEOF_AGGDESC(desc) \ (sizeof (dtrace_aggdesc_t) + ((desc)->dtagd_nrecs ? \ - (((desc)->dtagd_nrecs - 1) * sizeof (dtrace_recdesc_t)) : 0)) - -/* - * DTrace Option Interface - * - * Run-time DTrace options are set and retrieved via DOF_SECT_OPTDESC sections - * in a DOF image. The dof_optdesc structure contains an option identifier and - * an option value. The valid option identifiers are found below; the mapping - * between option identifiers and option identifying strings is maintained at - * user-level. Note that the value of DTRACEOPT_UNSET is such that all of the - * following are potentially valid option values: all positive integers, zero - * and negative one. Some options (notably "bufpolicy" and "bufresize") take - * predefined tokens as their values; these are defined with - * DTRACEOPT_{option}_{token}. - */ + (((desc)->dtagd_nrecs - 1) * sizeof (dtrace_recdesc_t)) : 0)) + + /* + * DTrace Option Interface + * + * Run-time DTrace options are set and retrieved via DOF_SECT_OPTDESC sections + * in a DOF image. The dof_optdesc structure contains an option identifier and + * an option value. The valid option identifiers are found below; the mapping + * between option identifiers and option identifying strings is maintained at + * user-level. Note that the value of DTRACEOPT_UNSET is such that all of the + * following are potentially valid option values: all positive integers, zero + * and negative one. Some options (notably "bufpolicy" and "bufresize") take + * predefined tokens as their values; these are defined with + * DTRACEOPT_{option}_{token}. + */ #define DTRACEOPT_BUFSIZE 0 /* buffer size */ #define DTRACEOPT_BUFPOLICY 1 /* buffer policy */ #define DTRACEOPT_DYNVARSIZE 2 /* dynamic variable size */ @@ -1019,7 +1020,8 @@ typedef struct dtrace_fmtdesc { #define DTRACEOPT_AGGSORTPOS 25 /* agg. position to sort on */ #define DTRACEOPT_AGGSORTKEYPOS 26 /* agg. key position to sort on */ #define DTRACEOPT_TEMPORAL 27 /* temporally ordered output */ -#define DTRACEOPT_MAX 28 /* number of options */ +#define DTRACEOPT_ZONE 28 /* zone in which to enable probes */ +#define DTRACEOPT_MAX 29 /* number of options */ #define DTRACEOPT_UNSET (dtrace_optval_t)-2 /* unset option */ @@ -1649,14 +1651,21 @@ typedef struct dof_helper { * * A bitwise OR that encapsulates both the mode (either DTRACE_MODE_KERNEL * or DTRACE_MODE_USER) and the policy when the privilege of the enabling - * is insufficient for that mode (either DTRACE_MODE_NOPRIV_DROP or - * DTRACE_MODE_NOPRIV_RESTRICT). If the policy is DTRACE_MODE_NOPRIV_DROP, - * insufficient privilege will result in the probe firing being silently - * ignored for the enabling; if the policy is DTRACE_NODE_NOPRIV_RESTRICT, - * insufficient privilege will not prevent probe processing for the - * enabling, but restrictions will be in place that induce a UPRIV fault - * upon attempt to examine probe arguments or current process state. - * + * is insufficient for that mode (a combination of DTRACE_MODE_NOPRIV_DROP, + * DTRACE_MODE_NOPRIV_RESTRICT, and DTRACE_MODE_LIMITEDPRIV_RESTRICT). 
If + * DTRACE_MODE_NOPRIV_DROP bit is set, insufficient privilege will result + * in the probe firing being silently ignored for the enabling; if the + * DTRACE_NODE_NOPRIV_RESTRICT bit is set, insufficient privilege will not + * prevent probe processing for the enabling, but restrictions will be in + * place that induce a UPRIV fault upon attempt to examine probe arguments + * or current process state. If the DTRACE_MODE_LIMITEDPRIV_RESTRICT bit + * is set, similar restrictions will be placed upon operation if the + * privilege is sufficient to process the enabling, but does not otherwise + * entitle the enabling to all zones. The DTRACE_MODE_NOPRIV_DROP and + * DTRACE_MODE_NOPRIV_RESTRICT are mutually exclusive (and one of these + * two policies must be specified), but either may be combined (or not) + * with DTRACE_MODE_LIMITEDPRIV_RESTRICT. + * * 1.10.4 Caller's context * * This is called from within dtrace_probe() meaning that interrupts @@ -2054,6 +2063,7 @@ typedef struct dtrace_pops { #define DTRACE_MODE_USER 0x02 #define DTRACE_MODE_NOPRIV_DROP 0x10 #define DTRACE_MODE_NOPRIV_RESTRICT 0x20 +#define DTRACE_MODE_LIMITEDPRIV_RESTRICT 0x40 typedef uintptr_t dtrace_provider_id_t; @@ -2268,6 +2278,7 @@ extern void (*dtrace_helpers_cleanup)(); extern void (*dtrace_helpers_fork)(proc_t *parent, proc_t *child); extern void (*dtrace_cpustart_init)(); extern void (*dtrace_cpustart_fini)(); +extern void (*dtrace_closef)(); extern void (*dtrace_debugger_init)(); extern void (*dtrace_debugger_fini)(); diff --git a/usr/src/uts/common/sys/dtrace_impl.h b/usr/src/uts/common/sys/dtrace_impl.h index d780082137..f79bf1e42e 100644 --- a/usr/src/uts/common/sys/dtrace_impl.h +++ b/usr/src/uts/common/sys/dtrace_impl.h @@ -924,6 +924,7 @@ typedef struct dtrace_mstate { uintptr_t dtms_strtok; /* saved strtok() pointer */ uint32_t dtms_access; /* memory access rights */ dtrace_difo_t *dtms_difo; /* current dif object */ + file_t *dtms_getf; /* cached rval of getf() */ } dtrace_mstate_t; #define DTRACE_COND_OWNER 0x1 @@ -1144,6 +1145,7 @@ struct dtrace_state { dtrace_optval_t dts_options[DTRACEOPT_MAX]; /* options */ dtrace_cred_t dts_cred; /* credentials */ size_t dts_nretained; /* number of retained enabs */ + int dts_getf; /* number of getf() calls */ }; struct dtrace_provider { diff --git a/usr/src/uts/common/sys/fs/hyprlofs.h b/usr/src/uts/common/sys/fs/hyprlofs.h new file mode 100644 index 0000000000..b8c4149df2 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2012, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_FS_HYPRLOFS_H +#define _SYS_FS_HYPRLOFS_H + +#include <sys/param.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hyprlofs ioctl numbers. + */ +#define HYPRLOFS_IOC ('H' << 8) + +#define HYPRLOFS_ADD_ENTRIES (HYPRLOFS_IOC | 1) +#define HYPRLOFS_RM_ENTRIES (HYPRLOFS_IOC | 2) +#define HYPRLOFS_RM_ALL (HYPRLOFS_IOC | 3) +#define HYPRLOFS_GET_ENTRIES (HYPRLOFS_IOC | 4) + +typedef struct { + char *hle_path; + uint_t hle_plen; + char *hle_name; + uint_t hle_nlen; +} hyprlofs_entry_t; + +typedef struct { + hyprlofs_entry_t *hle_entries; + uint_t hle_len; +} hyprlofs_entries_t; + +typedef struct { + char hce_path[MAXPATHLEN]; + char hce_name[MAXPATHLEN]; +} hyprlofs_curr_entry_t; + +typedef struct { + hyprlofs_curr_entry_t *hce_entries; + uint_t hce_cnt; +} hyprlofs_curr_entries_t; + +#ifdef _KERNEL +typedef struct { + caddr32_t hle_path; + uint_t hle_plen; + caddr32_t hle_name; + uint_t hle_nlen; +} hyprlofs_entry32_t; + +typedef struct { + caddr32_t hle_entries; + uint_t hle_len; +} hyprlofs_entries32_t; + +typedef struct { + caddr32_t hce_entries; + uint_t hce_cnt; +} hyprlofs_curr_entries32_t; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_HYPRLOFS_H */ diff --git a/usr/src/uts/common/sys/fs/hyprlofs_info.h b/usr/src/uts/common/sys/fs/hyprlofs_info.h new file mode 100644 index 0000000000..29bdadc4e2 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs_info.h @@ -0,0 +1,189 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_HLOFS_INFO_H +#define _SYS_FS_HLOFS_INFO_H + +#include <sys/t_lock.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <sys/vfs_opreg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hlnode is the file system dependent node for hyprlofs. + * It is modeled on the tmpfs tmpnode. + * + * hln_rwlock protects access of the directory list at hln_dir + * as well as synchronizing read/writes to directory hlnodes. + * hln_tlock protects updates to hln_mode and hln_nlink. + * hln_tlock doesn't require any hlnode locks.
+ */ +typedef struct hlnode { + struct hlnode *hln_back; /* linked list of hlnodes */ + struct hlnode *hln_forw; /* linked list of hlnodes */ + union { + struct { + struct hldirent *un_dirlist; /* dirent list */ + uint_t un_dirents; /* number of dirents */ + } un_dirstruct; + vnode_t *un_realvp; /* real vnode */ + } un_hlnode; + vnode_t *hln_vnode; /* vnode for this hlnode */ + int hln_gen; /* pseudo gen num for hlfid */ + int hln_looped; /* flag indicating loopback */ + vattr_t hln_attr; /* attributes */ + krwlock_t hln_rwlock; /* rw - serialize mods and */ + /* directory updates */ + kmutex_t hln_tlock; /* time, flag, and nlink lock */ +} hlnode_t; + +/* + * hyprlofs per-mount data structure. + * All fields are protected by hlm_contents. + */ +typedef struct { + vfs_t *hlm_vfsp; /* filesystem's vfs struct */ + hlnode_t *hlm_rootnode; /* root hlnode */ + char *hlm_mntpath; /* name of hyprlofs mount point */ + dev_t hlm_dev; /* unique dev # of mounted `device' */ + uint_t hlm_gen; /* pseudo generation number for files */ + kmutex_t hlm_contents; /* lock for hlfsmount structure */ +} hlfsmount_t; + +/* + * hyprlofs directories are made up of a linked list of hldirent structures + * hanging off directory hlnodes. File names are not fixed length, + * but are null terminated. + */ +typedef struct hldirent { + hlnode_t *hld_hlnode; /* hlnode for this file */ + struct hldirent *hld_next; /* next directory entry */ + struct hldirent *hld_prev; /* prev directory entry */ + uint_t hld_offset; /* "offset" of dir entry */ + uint_t hld_hash; /* a hash of td_name */ + struct hldirent *hld_link; /* linked via the hash table */ + hlnode_t *hld_parent; /* parent, dir we are in */ + char *hld_name; /* must be null terminated */ + /* max length is MAXNAMELEN */ +} hldirent_t; + +/* + * hlfid overlays the fid structure (for VFS_VGET) + */ +typedef struct { + uint16_t hlfid_len; + ino32_t hlfid_ino; + int32_t hlfid_gen; +} hlfid_t; + +/* + * File system independent to hyprlofs conversion macros + */ +#define VFSTOHLM(vfsp) ((hlfsmount_t *)(vfsp)->vfs_data) +#define VTOHLM(vp) ((hlfsmount_t *)(vp)->v_vfsp->vfs_data) +#define VTOHLN(vp) ((hlnode_t *)(vp)->v_data) +#define HLNTOV(tp) ((tp)->hln_vnode) +#define REALVP(vp) ((vnode_t *)VTOHLN(vp)->hln_realvp) +#define hlnode_hold(tp) VN_HOLD(HLNTOV(tp)) +#define hlnode_rele(tp) VN_RELE(HLNTOV(tp)) + +#define hln_dir un_hlnode.un_dirstruct.un_dirlist +#define hln_dirents un_hlnode.un_dirstruct.un_dirents +#define hln_realvp un_hlnode.un_realvp + +/* + * Attributes + */ +#define hln_mask hln_attr.va_mask +#define hln_type hln_attr.va_type +#define hln_mode hln_attr.va_mode +#define hln_uid hln_attr.va_uid +#define hln_gid hln_attr.va_gid +#define hln_fsid hln_attr.va_fsid +#define hln_nodeid hln_attr.va_nodeid +#define hln_nlink hln_attr.va_nlink +#define hln_size hln_attr.va_size +#define hln_atime hln_attr.va_atime +#define hln_mtime hln_attr.va_mtime +#define hln_ctime hln_attr.va_ctime +#define hln_rdev hln_attr.va_rdev +#define hln_blksize hln_attr.va_blksize +#define hln_nblocks hln_attr.va_nblocks +#define hln_seq hln_attr.va_seq + +#define HL_MUSTHAVE 1 + +/* + * enums + */ +enum de_op { DE_CREATE, DE_MKDIR }; /* direnter ops */ +enum dr_op { DR_REMOVE, DR_RMDIR }; /* dirremove ops */ + +/* + * hyprlofs_minfree is the amount (in pages) of anonymous memory that hyprlofs + * leaves free for the rest of the system. The default value for + * hyprlofs_minfree is btopr(HYPRLOFSMINFREE) but it can be patched to a + * different number of pages. 
Since hyprlofs doesn't actually use much + * memory, it's unlikely this ever needs to be patched. + */ +#define HYPRLOFSMINFREE 8 * 1024 * 1024 /* 8 Megabytes */ + +extern size_t hyprlofs_minfree; /* Anonymous memory in pages */ + +/* + * hyprlofs can allocate only a certain percentage of kernel memory, + * which is used for hlnodes, directories, file names, etc. + * This is statically set as HYPRLOFSMAXFRACKMEM of physical memory. + * The actual number of allocatable bytes can be patched in hyprlofs_maxkmem. + */ +#define HYPRLOFSMAXFRACKMEM 25 /* 1/25 of physical memory */ + +extern size_t hyprlofs_kmemspace; +extern size_t hyprlofs_maxkmem; /* Allocatable kernel memory in bytes */ + +extern void hyprlofs_node_init(hlfsmount_t *, hlnode_t *, vattr_t *, + cred_t *); +extern int hyprlofs_dirlookup(hlnode_t *, char *, hlnode_t **, cred_t *); +extern int hyprlofs_dirdelete(hlnode_t *, hlnode_t *, char *, enum dr_op, + cred_t *); +extern void hyprlofs_dirinit(hlnode_t *, hlnode_t *); +extern void hyprlofs_dirtrunc(hlnode_t *); +extern void *hyprlofs_memalloc(size_t, int); +extern void hyprlofs_memfree(void *, size_t); +extern int hyprlofs_taccess(void *, int, cred_t *); +extern int hyprlofs_direnter(hlfsmount_t *, hlnode_t *, char *, enum de_op, + vnode_t *, vattr_t *, hlnode_t **, cred_t *); + +extern struct vnodeops *hyprlofs_vnodeops; +extern const struct fs_operation_def hyprlofs_vnodeops_template[]; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_HLOFS_INFO_H */ diff --git a/usr/src/uts/common/sys/fss.h b/usr/src/uts/common/sys/fss.h index 583586fd75..03c35277d4 100644 --- a/usr/src/uts/common/sys/fss.h +++ b/usr/src/uts/common/sys/fss.h @@ -22,6 +22,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ #ifndef _SYS_FSS_H @@ -86,6 +87,7 @@ typedef struct fsspset { /* on the list */ struct fssproj *fssps_list; /* list of project parts */ struct fsszone *fssps_zones; /* list of fsszone_t's in pset */ + uint32_t fssps_gen; /* generation for zone's kstats */ } fsspset_t; /* @@ -101,7 +103,10 @@ typedef struct fssproj { /* protected by fssps_lock */ uint32_t fssp_shares; /* copy of our kpj_shares */ /* protected by fssps_displock */ - uint32_t fssp_ticks; /* total of all ticks */ + uint32_t fssp_ticks; /* total of nice tick values */ + /* protected by fssps_displock */ + uint32_t fssp_tick_cnt; /* cnt of all ticks in this sec */ + uint32_t fssp_shr_pct; /* active shr % in this sec */ /* protected by fssps_displock */ fssusage_t fssp_usage; /* this project's decayed usage */ fssusage_t fssp_shusage; /* normalized usage */ diff --git a/usr/src/uts/common/sys/ipd.h b/usr/src/uts/common/sys/ipd.h new file mode 100644 index 0000000000..901e74f44c --- /dev/null +++ b/usr/src/uts/common/sys/ipd.h @@ -0,0 +1,81 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* + * These definitions are private to ipd and ipdadm. + */ + +#ifndef _SYS_IPD_H +#define _SYS_IPD_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define IPD_DEV_PATH "/dev/ipd" +#define IPD_MAX_DELAY 1000000 /* 1s in us */ + +typedef struct ipd_ioc_perturb { + zoneid_t ipip_zoneid; + uint32_t ipip_arg; +} ipd_ioc_perturb_t; + +typedef struct ipd_ioc_info { + zoneid_t ipii_zoneid; + uint32_t ipii_corrupt; + uint32_t ipii_drop; + uint32_t ipii_delay; +} ipd_ioc_info_t; + +#ifdef _KERNEL + +typedef struct ipd_ioc_list32 { + uint_t ipil_nzones; + caddr32_t ipil_info; +} ipd_ioc_list32_t; + +#endif /* _KERNEL */ + +typedef struct ipd_ioc_list { + uint_t ipil_nzones; + ipd_ioc_info_t *ipil_info; +} ipd_ioc_list_t; + +#define IPD_CORRUPT 0x1 +#define IPD_DELAY 0x2 +#define IPD_DROP 0x4 + +#define IPDIOC (('i' << 24) | ('p' << 16) | ('d' << 8)) +#define IPDIOC_CORRUPT (IPDIOC | 1) /* corrupt packets for a zone */ +#define IPDIOC_DELAY (IPDIOC | 2) /* delay packets for a zone */ +#define IPDIOC_DROP (IPDIOC | 3) /* drop packets for a zone */ +#define IPDIOC_LIST (IPDIOC | 4) /* list per-zone settings */ +#define IPDIOC_REMOVE (IPDIOC | 5) /* remove perturbations for a zone */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_IPD_H */ diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h index 6876fccb1a..220446af65 100644 --- a/usr/src/uts/common/sys/mac.h +++ b/usr/src/uts/common/sys/mac.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_MAC_H @@ -205,6 +206,7 @@ typedef enum { MAC_PROP_MAX_RXHWCLNT_AVAIL, MAC_PROP_MAX_TXHWCLNT_AVAIL, MAC_PROP_IB_LINKMODE, + MAC_PROP_VN_PROMISC_FILTERED, MAC_PROP_PRIVATE = -1 } mac_prop_id_t; diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h index ae25df6a0d..ec49527300 100644 --- a/usr/src/uts/common/sys/mac_client_impl.h +++ b/usr/src/uts/common/sys/mac_client_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_MAC_CLIENT_IMPL_H @@ -302,6 +303,7 @@ extern int mac_tx_percpu_cnt; /* Mac protection flags */ #define MPT_FLAG_V6_LOCAL_ADDR_SET 0x0001 +#define MPT_FLAG_PROMISC_FILTERED 0x0002 /* in mac_client.c */ extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *); diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index 8f9f23ff71..2eef66113d 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/ #ifndef _SYS_MAC_IMPL_H @@ -885,6 +886,8 @@ extern void mac_protect_fini(mac_client_impl_t *); extern int mac_set_resources(mac_handle_t, mac_resource_props_t *); extern void mac_get_resources(mac_handle_t, mac_resource_props_t *); extern void mac_get_effective_resources(mac_handle_t, mac_resource_props_t *); +extern void mac_set_promisc_filtered(mac_client_handle_t, boolean_t); +extern boolean_t mac_get_promisc_filtered(mac_client_handle_t); extern cpupart_t *mac_pset_find(mac_resource_props_t *, boolean_t *); extern void mac_set_pool_effective(boolean_t, cpupart_t *, diff --git a/usr/src/uts/common/sys/mman.h b/usr/src/uts/common/sys/mman.h index 6c9119e56d..82344607b0 100644 --- a/usr/src/uts/common/sys/mman.h +++ b/usr/src/uts/common/sys/mman.h @@ -22,6 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -353,6 +354,7 @@ struct memcntl_mha32 { #define MS_SYNC 0x4 /* wait for msync */ #define MS_ASYNC 0x1 /* return immediately */ #define MS_INVALIDATE 0x2 /* invalidate caches */ +#define MS_INVALCURPROC 0x8 /* invalidate cache for curproc only */ #if (_POSIX_C_SOURCE <= 2) && !defined(_XPG4_2) || defined(__EXTENSIONS__) /* functions to mctl */ diff --git a/usr/src/uts/common/sys/mntent.h b/usr/src/uts/common/sys/mntent.h index e95ef3fccc..d215d88790 100644 --- a/usr/src/uts/common/sys/mntent.h +++ b/usr/src/uts/common/sys/mntent.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012, Joyent, Inc. All rights reserved. * * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T * All Rights Reserved @@ -47,6 +48,7 @@ extern "C" { #define MNTTYPE_PCFS "pcfs" /* PC (MSDOS) file system */ #define MNTTYPE_PC MNTTYPE_PCFS /* Deprecated name; use MNTTYPE_PCFS */ #define MNTTYPE_LOFS "lofs" /* Loop back file system */ +#define MNTTYPE_HYPRLOFS "hyprlofs" /* Hyperlofs file system */ #define MNTTYPE_LO MNTTYPE_LOFS /* Deprecated name; use MNTTYPE_LOFS */ #define MNTTYPE_HSFS "hsfs" /* High Sierra (9660) file system */ #define MNTTYPE_SWAP "swap" /* Swap file system */ diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h index bcd5ba2b4c..819c788b9e 100644 --- a/usr/src/uts/common/sys/policy.h +++ b/usr/src/uts/common/sys/policy.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_POLICY_H @@ -171,6 +172,7 @@ int secpolicy_setid_setsticky_clear(vnode_t *, vattr_t *, const vattr_t *, cred_t *); int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t); int secpolicy_xvm_control(const cred_t *); +int secpolicy_hyprlofs_control(const cred_t *); int secpolicy_basic_exec(const cred_t *, vnode_t *); int secpolicy_basic_fork(const cred_t *); diff --git a/usr/src/uts/common/sys/port.h b/usr/src/uts/common/sys/port.h index ccb0308255..d4d74d55ea 100644 --- a/usr/src/uts/common/sys/port.h +++ b/usr/src/uts/common/sys/port.h @@ -24,11 +24,13 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + #ifndef _SYS_PORT_H #define _SYS_PORT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -106,6 +108,7 @@ typedef struct port_notify32 { #define FILE_ACCESS 0x00000001 #define FILE_MODIFIED 0x00000002 #define FILE_ATTRIB 0x00000004 +#define FILE_TRUNC 0x00100000 #define FILE_NOFOLLOW 0x10000000 /* diff --git a/usr/src/uts/common/sys/port_impl.h b/usr/src/uts/common/sys/port_impl.h index 9f3f291874..504fb9ece1 100644 --- a/usr/src/uts/common/sys/port_impl.h +++ b/usr/src/uts/common/sys/port_impl.h @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + #ifndef _SYS_PORT_IMPL_H #define _SYS_PORT_IMPL_H @@ -311,6 +315,7 @@ typedef struct portfop_vp { #define FOP_FILE_SETATTR_MTIME 0x00080000 #define FOP_FILE_SETATTR_CTIME 0x00100000 #define FOP_FILE_LINK_SRC 0x00200000 +#define FOP_FILE_TRUNC 0x00400000 /* * File modification event. @@ -339,10 +344,15 @@ typedef struct portfop_vp { /* + * File trunc event + */ +#define FOP_TRUNC_MASK (FOP_FILE_TRUNC|FOP_FILE_CREATE) + +/* * valid watchable events */ #define FILE_EVENTS_MASK (FILE_ACCESS|FILE_MODIFIED|FILE_ATTRIB \ - |FILE_NOFOLLOW) + |FILE_NOFOLLOW|FILE_TRUNC) /* --- End file events --- */ /* diff --git a/usr/src/uts/common/sys/procfs.h b/usr/src/uts/common/sys/procfs.h index 0c4a48fcdd..f592fd9dcf 100644 --- a/usr/src/uts/common/sys/procfs.h +++ b/usr/src/uts/common/sys/procfs.h @@ -65,10 +65,6 @@ extern "C" { #include <sys/stat.h> #include <sys/param.h> -#if !defined(_LP64) && _FILE_OFFSET_BITS == 64 -#error "Cannot use procfs in the large file compilation environment" -#endif - /* * System call interfaces for /proc. */ diff --git a/usr/src/uts/common/sys/sdt_impl.h b/usr/src/uts/common/sys/sdt_impl.h index cbe95f7c66..f7cc683f2f 100644 --- a/usr/src/uts/common/sys/sdt_impl.h +++ b/usr/src/uts/common/sys/sdt_impl.h @@ -24,11 +24,13 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + #ifndef _SYS_SDT_IMPL_H #define _SYS_SDT_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -45,6 +47,7 @@ typedef struct sdt_provider { char *sdtp_name; /* name of provider */ char *sdtp_prefix; /* prefix for probe names */ dtrace_pattr_t *sdtp_attr; /* stability attributes */ + uint32_t sdtp_priv; /* privilege, if any */ dtrace_provider_id_t sdtp_id; /* provider ID */ } sdt_provider_t; @@ -75,6 +78,7 @@ typedef struct sdt_argdesc { } sdt_argdesc_t; extern void sdt_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *); +extern int sdt_mode(void *, dtrace_id_t, void *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h index 188230d61e..c7f460e7c7 100644 --- a/usr/src/uts/common/sys/thread.h +++ b/usr/src/uts/common/sys/thread.h @@ -68,6 +68,8 @@ typedef struct ctxop { void (*free_op)(void *, int); /* function which frees the context */ void *arg; /* argument to above functions, ctx pointer */ struct ctxop *next; /* next context ops */ + hrtime_t save_ts; /* timestamp of last save */ + hrtime_t restore_ts; /* timestamp of last restore */ } ctxop_t; /* diff --git a/usr/src/uts/common/sys/uadmin.h b/usr/src/uts/common/sys/uadmin.h index c35d0a5cfb..6adeb477bb 100644 --- a/usr/src/uts/common/sys/uadmin.h +++ b/usr/src/uts/common/sys/uadmin.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -157,7 +158,7 @@ extern kmutex_t ualock; extern void mdboot(int, int, char *, boolean_t); extern void mdpreboot(int, int, char *); extern int kadmin(int, int, void *, cred_t *); -extern void killall(zoneid_t); +extern void killall(zoneid_t, boolean_t); #endif #if defined(__STDC__) diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h index 1aa4a8ee6d..97e3430ae2 100644 --- a/usr/src/uts/common/sys/vm_usage.h +++ b/usr/src/uts/common/sys/vm_usage.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ #ifndef _SYS_VM_USAGE_H @@ -79,8 +80,9 @@ extern "C" { /* zoneid */ #define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */ /* euser */ +#define VMUSAGE_A_ZONE 0x4000 /* rss/swap for a specified zone */ -#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */ +#define VMUSAGE_MASK 0x7fff /* all valid flags for getvmusage() */ typedef struct vmusage { id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 3ba7bf47f4..a44930c853 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_ZONE_H @@ -94,12 +95,17 @@ extern "C" { #define ZONE_ATTR_INITNAME 9 #define ZONE_ATTR_BOOTARGS 10 #define ZONE_ATTR_BRAND 11 -#define ZONE_ATTR_PHYS_MCAP 12 +#define ZONE_ATTR_PMCAP_NOVER 12 #define ZONE_ATTR_SCHED_CLASS 13 #define ZONE_ATTR_FLAGS 14 #define ZONE_ATTR_HOSTID 15 #define ZONE_ATTR_FS_ALLOWED 16 #define ZONE_ATTR_NETWORK 17 +#define ZONE_ATTR_DID 18 +#define ZONE_ATTR_PMCAP_PAGEOUT 19 +#define ZONE_ATTR_INITNORESTART 20 +#define ZONE_ATTR_PG_FLT_DELAY 21 +#define ZONE_ATTR_RSS 22 /* Start of the brand-specific attribute namespace */ #define ZONE_ATTR_BRAND_ATTRS 32768 @@ -180,6 +186,7 @@ typedef struct { uint32_t doi; /* DOI for label */ caddr32_t label; /* label associated with zone */ int flags; + zoneid_t zoneid; /* requested zoneid */ } zone_def32; #endif typedef struct { @@ -196,6 +203,7 @@ typedef struct { uint32_t doi; /* DOI for label */ const bslabel_t *label; /* label associated with zone */ int flags; + zoneid_t zoneid; /* requested zoneid */ } zone_def; /* extended error information */ @@ -240,7 +248,7 @@ typedef enum zone_cmd { typedef struct zone_cmd_arg { uint64_t uniqid; /* unique "generation number" */ zone_cmd_t cmd; /* requested action */ - uint32_t _pad; /* need consistent 32/64 bit alignmt */ + uint32_t debug; /* enable brand hook debug */ char locale[MAXPATHLEN]; /* locale in which to render messages */ char bootbuf[BOOTARGS_MAX]; /* arguments passed to zone_boot() */ } zone_cmd_arg_t; @@ -320,6 +328,7 @@ typedef struct zone_net_data { * libraries which may be defining their own versions.
*/ #include <sys/list.h> +#include <sys/cpuvar.h> #define GLOBAL_ZONEUNIQID 0 /* uniqid of the global zone */ @@ -367,7 +376,7 @@ typedef struct zone_dataset { } zone_dataset_t; /* - * structure for zone kstats + * structure for rctl zone kstats */ typedef struct zone_kstat { kstat_named_t zk_zonename; @@ -377,6 +386,73 @@ typedef struct zone_kstat { struct cpucap; +typedef struct { + hrtime_t cycle_start; + uint_t cycle_cnt; + hrtime_t zone_avg_cnt; +} sys_zio_cntr_t; + +typedef struct { + kstat_named_t zv_zonename; + kstat_named_t zv_nread; + kstat_named_t zv_reads; + kstat_named_t zv_rtime; + kstat_named_t zv_rlentime; + kstat_named_t zv_nwritten; + kstat_named_t zv_writes; + kstat_named_t zv_wtime; + kstat_named_t zv_wlentime; + kstat_named_t zv_10ms_ops; + kstat_named_t zv_100ms_ops; + kstat_named_t zv_1s_ops; + kstat_named_t zv_10s_ops; + kstat_named_t zv_delay_cnt; + kstat_named_t zv_delay_time; +} zone_vfs_kstat_t; + +typedef struct { + kstat_named_t zz_zonename; + kstat_named_t zz_nread; + kstat_named_t zz_reads; + kstat_named_t zz_rtime; + kstat_named_t zz_rlentime; + kstat_named_t zz_nwritten; + kstat_named_t zz_writes; + kstat_named_t zz_waittime; +} zone_zfs_kstat_t; + +typedef struct { + kstat_named_t zm_zonename; + kstat_named_t zm_rss; + kstat_named_t zm_phys_cap; + kstat_named_t zm_swap; + kstat_named_t zm_swap_cap; + kstat_named_t zm_nover; + kstat_named_t zm_pagedout; + kstat_named_t zm_pgpgin; + kstat_named_t zm_anonpgin; + kstat_named_t zm_execpgin; + kstat_named_t zm_fspgin; + kstat_named_t zm_anon_alloc_fail; + kstat_named_t zm_pf_throttle; + kstat_named_t zm_pf_throttle_usec; +} zone_mcap_kstat_t; + +typedef struct { + kstat_named_t zm_zonename; /* full name, kstat truncates name */ + kstat_named_t zm_utime; + kstat_named_t zm_stime; + kstat_named_t zm_wtime; + kstat_named_t zm_avenrun1; + kstat_named_t zm_avenrun5; + kstat_named_t zm_avenrun15; + kstat_named_t zm_run_ticks; + kstat_named_t zm_run_wait; + kstat_named_t zm_fss_shr_pct; + kstat_named_t zm_fss_pri_hi; + kstat_named_t zm_fss_pri_avg; +} zone_misc_kstat_t; + typedef struct zone { /* * zone_name is never modified once set. @@ -416,6 +492,7 @@ typedef struct zone { */ list_node_t zone_linkage; zoneid_t zone_id; /* ID of zone */ + zoneid_t zone_did; /* persistent debug ID of zone */ uint_t zone_ref; /* count of zone_hold()s on zone */ uint_t zone_cred_ref; /* count of zone_hold_cred()s on zone */ /* @@ -471,7 +548,7 @@ typedef struct zone { char *zone_initname; /* fs path to 'init' */ int zone_boot_err; /* for zone_boot() if boot fails */ char *zone_bootargs; /* arguments passed via zone_boot() */ - uint64_t zone_phys_mcap; /* physical memory cap */ + rctl_qty_t zone_phys_mem_ctl; /* current phys. memory limit */ /* * zone_kthreads is protected by zone_status_lock. */ @@ -490,6 +567,9 @@ typedef struct zone { hrtime_t zone_pool_mod; /* last pool bind modification time */ /* zone_psetid is protected by cpu_lock */ psetid_t zone_psetid; /* pset the zone is bound to */ + + time_t zone_boot_time; /* Similar to boot_time */ + /* * The following two can be read without holding any locks. They are * updated under cpu_lock. @@ -517,6 +597,37 @@ typedef struct zone { list_t zone_dl_list; netstack_t *zone_netstack; struct cpucap *zone_cpucap; /* CPU caps data */ + + /* + * Data and counters used for ZFS fair-share disk IO. 
+ */ + rctl_qty_t zone_zfs_io_pri; /* ZFS IO priority */ + uint_t zone_zfs_queued; /* enqueued count */ + uint64_t zone_zfs_weight; /* used to prevent starvation */ + uint64_t zone_io_util; /* IO utilization metric */ + boolean_t zone_io_util_above_avg; /* IO util percent > avg. */ + uint16_t zone_io_delay; /* IO delay on logical r/w */ + kmutex_t zone_stg_io_lock; /* protects IO window data */ + sys_zio_cntr_t zone_rd_ops; /* Counters for ZFS reads, */ + sys_zio_cntr_t zone_wr_ops; /* writes and logical writes. */ + sys_zio_cntr_t zone_lwr_ops; + + /* + * kstats and counters for VFS ops and bytes. + */ + kmutex_t zone_vfs_lock; /* protects VFS statistics */ + kstat_t *zone_vfs_ksp; + kstat_io_t zone_vfs_rwstats; + zone_vfs_kstat_t *zone_vfs_stats; + + /* + * kstats for ZFS I/O ops and bytes. + */ + kmutex_t zone_zfs_lock; /* protects ZFS statistics */ + kstat_t *zone_zfs_ksp; + kstat_io_t zone_zfs_rwstats; + zone_zfs_kstat_t *zone_zfs_stats; + /* * Solaris Auditing per-zone audit context */ @@ -534,6 +645,69 @@ typedef struct zone { rctl_qty_t zone_nprocs_ctl; /* current limit protected by */ /* zone_rctls->rcs_lock */ kstat_t *zone_nprocs_kstat; + + /* + * kstats and counters for physical memory capping. + */ + rctl_qty_t zone_phys_mem; /* current bytes of phys. mem. (RSS) */ + kstat_t *zone_physmem_kstat; + uint64_t zone_mcap_nover; /* # of times over phys. cap */ + uint64_t zone_mcap_pagedout; /* bytes of mem. paged out */ + kmutex_t zone_mcap_lock; /* protects mcap statistics */ + kstat_t *zone_mcap_ksp; + zone_mcap_kstat_t *zone_mcap_stats; + uint64_t zone_pgpgin; /* pages paged in */ + uint64_t zone_anonpgin; /* anon pages paged in */ + uint64_t zone_execpgin; /* exec pages paged in */ + uint64_t zone_fspgin; /* fs pages paged in */ + uint64_t zone_anon_alloc_fail; /* cnt of anon alloc fails */ + uint64_t zone_pf_throttle; /* cnt of page flt throttles */ + uint64_t zone_pf_throttle_usec; /* time of page flt throttles */ + + /* Num usecs to throttle page fault when zone is over phys. mem cap */ + uint32_t zone_pg_flt_delay; + + /* + * Misc. kstats and counters for zone cpu-usage aggregation. + * The zone_Xtime values are the sum of the micro-state accounting + * values for all threads that are running or have run in the zone. + * This is tracked in msacct.c as threads change state. + * The zone_stime is the sum of the LMS_SYSTEM times. + * The zone_utime is the sum of the LMS_USER times. + * The zone_wtime is the sum of the LMS_WAIT_CPU times. + * As with per-thread micro-state accounting values, these values are + * not scaled to nanosecs. The scaling is done by the + * zone_misc_kstat_update function when kstats are requested. + */ + kmutex_t zone_misc_lock; /* protects misc statistics */ + kstat_t *zone_misc_ksp; + zone_misc_kstat_t *zone_misc_stats; + uint64_t zone_stime; /* total system time */ + uint64_t zone_utime; /* total user time */ + uint64_t zone_wtime; /* total time waiting in runq */ + + struct loadavg_s zone_loadavg; /* loadavg for this zone */ + uint64_t zone_hp_avenrun[3]; /* high-precision avenrun */ + int zone_avenrun[3]; /* FSCALED avg. run queue len */ + + /* + * FSS stats updated once per second by fss_decay_usage. + */ + uint32_t zone_fss_gen; /* FSS generation cntr */ + uint64_t zone_run_ticks; /* tot # of ticks running */ + + /* + * DTrace-private per-zone state + */ + int zone_dtrace_getf; /* # of unprivileged getf()s */ + + /* + * Synchronization primitives used to synchronize between mounts and + * zone creation/destruction. 
+ */ + int zone_mounts_in_progress; + kcondvar_t zone_mount_cv; + kmutex_t zone_mount_lock; } zone_t; /* @@ -566,9 +740,11 @@ extern zone_t *zone_find_by_name(char *); extern zone_t *zone_find_by_any_path(const char *, boolean_t); extern zone_t *zone_find_by_path(const char *); extern zoneid_t getzoneid(void); +extern zoneid_t getzonedid(void); extern zone_t *zone_find_by_id_nolock(zoneid_t); extern int zone_datalink_walk(zoneid_t, int (*)(datalink_id_t, void *), void *); extern int zone_check_datalink(zoneid_t *, datalink_id_t); +extern void zone_loadavg_update(); /* * Zone-specific data (ZSD) APIs @@ -752,13 +928,14 @@ extern int zone_dataset_visible(const char *, int *); extern int zone_kadmin(int, int, const char *, cred_t *); extern void zone_shutdown_global(void); -extern void mount_in_progress(void); -extern void mount_completed(void); +extern void mount_in_progress(zone_t *); +extern void mount_completed(zone_t *); extern int zone_walk(int (*)(zone_t *, void *), void *); extern rctl_hndl_t rc_zone_locked_mem; extern rctl_hndl_t rc_zone_max_swap; +extern rctl_hndl_t rc_zone_phys_mem; extern rctl_hndl_t rc_zone_max_lofi; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/syscall/getloadavg.c b/usr/src/uts/common/syscall/getloadavg.c index c669f9b8ba..0f44064e90 100644 --- a/usr/src/uts/common/syscall/getloadavg.c +++ b/usr/src/uts/common/syscall/getloadavg.c @@ -22,10 +22,9 @@ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/systm.h> #include <sys/errno.h> @@ -41,7 +40,6 @@ int getloadavg(int *buf, int nelem) { int *loadbuf = &avenrun[0]; - int loadavg[LOADAVG_NSTATS]; int error; if (nelem < 0) @@ -50,15 +48,7 @@ getloadavg(int *buf, int nelem) nelem = LOADAVG_NSTATS; if (!INGLOBALZONE(curproc)) { - mutex_enter(&cpu_lock); - if (pool_pset_enabled()) { - psetid_t psetid = zone_pset_get(curproc->p_zone); - - error = cpupart_get_loadavg(psetid, &loadavg[0], nelem); - ASSERT(error == 0); /* pset isn't going anywhere */ - loadbuf = &loadavg[0]; - } - mutex_exit(&cpu_lock); + loadbuf = &curproc->p_zone->zone_avenrun[0]; } error = copyout(loadbuf, buf, nelem * sizeof (avenrun[0])); diff --git a/usr/src/uts/common/syscall/memcntl.c b/usr/src/uts/common/syscall/memcntl.c index 1ab3a8b65e..63c8b64ad0 100644 --- a/usr/src/uts/common/syscall/memcntl.c +++ b/usr/src/uts/common/syscall/memcntl.c @@ -21,6 +21,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -116,13 +117,17 @@ memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask) * MS_SYNC used to be defined to be zero but is now non-zero. * For binary compatibility we still accept zero * (the absence of MS_ASYNC) to mean the same thing. + * Binary compatibility is not an issue for MS_INVALCURPROC. 
*/ iarg = (uintptr_t)arg; if ((iarg & ~MS_INVALIDATE) == 0) iarg |= MS_SYNC; - if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) || - ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) { + if (((iarg & + ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) != 0) || + ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC)) || + ((iarg & (MS_INVALIDATE|MS_INVALCURPROC)) == + (MS_INVALIDATE|MS_INVALCURPROC))) { error = set_errno(EINVAL); } else { error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0); diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c index 471c66ff32..e68f640045 100644 --- a/usr/src/uts/common/syscall/sysconfig.c +++ b/usr/src/uts/common/syscall/sysconfig.c @@ -22,6 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -158,8 +159,8 @@ sysconfig(int which) * even though rcapd can be used on the global zone too. */ if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mcap != 0) - return (MIN(btop(curproc->p_zone->zone_phys_mcap), + curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) + return (MIN(btop(curproc->p_zone->zone_phys_mem_ctl), physinstalled)); return (physinstalled); @@ -167,26 +168,23 @@ sysconfig(int which) case _CONFIG_AVPHYS_PAGES: /* * If the non-global zone has a phys. memory cap, use - * the phys. memory cap - zone's current rss. We always + * the phys. memory cap - zone's rss. We always * report the system-wide value for the global zone, even - * though rcapd can be used on the global zone too. + * though memory capping can be used on the global zone too. + * We use the cached value for the RSS since vm_getusage() + * is so expensive and we don't need this value to be exact. */ if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mcap != 0) { + curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) { pgcnt_t cap, rss, free; - vmusage_t in_use; - size_t cnt = 1; - cap = btop(curproc->p_zone->zone_phys_mcap); + cap = btop(curproc->p_zone->zone_phys_mem_ctl); if (cap > physinstalled) return (freemem); - if (vm_getusage(VMUSAGE_ZONE, 1, &in_use, &cnt, - FKIOCTL) != 0) - in_use.vmu_rss_all = 0; - rss = btop(in_use.vmu_rss_all); + rss = btop(curproc->p_zone->zone_phys_mem); /* - * Because rcapd implements a soft cap, it is possible + * Because this is a soft cap, it is possible * for rss to be temporarily over the cap. */ if (cap > rss) diff --git a/usr/src/uts/common/syscall/uadmin.c b/usr/src/uts/common/syscall/uadmin.c index 1bdfbbfd0b..dbff1b637c 100644 --- a/usr/src/uts/common/syscall/uadmin.c +++ b/usr/src/uts/common/syscall/uadmin.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #include <sys/param.h> @@ -76,7 +77,7 @@ volatile int fastreboot_dryrun = 0; * system with many zones. 
*/ void -killall(zoneid_t zoneid) +killall(zoneid_t zoneid, boolean_t force) { proc_t *p; @@ -106,7 +107,7 @@ killall(zoneid_t zoneid) p->p_stat != SIDL && p->p_stat != SZOMB) { mutex_enter(&p->p_lock); - if (sigismember(&p->p_sig, SIGKILL)) { + if (!force && sigismember(&p->p_sig, SIGKILL)) { mutex_exit(&p->p_lock); p = p->p_next; } else { @@ -243,7 +244,7 @@ kadmin(int cmd, int fcn, void *mdep, cred_t *credp) */ zone_shutdown_global(); - killall(ALL_ZONES); + killall(ALL_ZONES, B_FALSE); /* * If we are calling kadmin() from a kernel context then we * do not release these resources. diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h index 1d91475e38..156b810046 100644 --- a/usr/src/uts/common/vm/hat.h +++ b/usr/src/uts/common/vm/hat.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -460,6 +461,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t); */ #define HAT_ADV_PGUNLOAD 0x00 #define HAT_FORCE_PGUNLOAD 0x01 +#define HAT_CURPROC_PGUNLOAD 0x02 /* * Attributes for hat_page_*attr, hat_setstats and diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c index 31c293d416..5f106f6c06 100644 --- a/usr/src/uts/common/vm/seg_vn.c +++ b/usr/src/uts/common/vm/seg_vn.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -7254,7 +7255,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = svd->vpage; offset = svd->offset + (uintptr_t)(addr - seg->s_base); bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | - ((flags & MS_INVALIDATE) ? B_INVAL : 0); + ((flags & MS_INVALIDATE) ? B_INVAL : 0) | + ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0); if (attr) { pageprot = attr & ~(SHARED|PRIVATE); @@ -7279,11 +7281,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = &svd->vpage[seg_page(seg, addr)]; } else if (svd->vp && svd->amp == NULL && - (flags & MS_INVALIDATE) == 0) { + (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) { /* - * No attributes, no anonymous pages and MS_INVALIDATE flag - * is not on, just use one big request. + * No attributes, no anonymous pages and MS_INVAL* flags + * are not on, just use one big request. */ err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, bflags, svd->cred, NULL); @@ -7335,7 +7337,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) * might race in and lock the page after we unlock and before * we do the PUTPAGE, then PUTPAGE simply does nothing. */ - if (flags & MS_INVALIDATE) { + if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) { if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { page_unlock(pp); diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c index fdf9f7790c..f30ba7ef2e 100644 --- a/usr/src/uts/common/vm/vm_anon.c +++ b/usr/src/uts/common/vm/vm_anon.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1986, 2010, Joyent, Inc. All rights reserved. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -792,6 +793,7 @@ anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) mutex_enter(&p->p_lock); if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { mutex_exit(&p->p_lock); + atomic_add_64(&zone->zone_anon_alloc_fail, 1); return (0); } mutex_exit(&p->p_lock); diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c index 01ad32e0b1..8caa257486 100644 --- a/usr/src/uts/common/vm/vm_as.c +++ b/usr/src/uts/common/vm/vm_as.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -56,6 +57,7 @@ #include <sys/debug.h> #include <sys/tnf_probe.h> #include <sys/vtrace.h> +#include <sys/ddi.h> #include <vm/hat.h> #include <vm/xhat.h> @@ -879,6 +881,7 @@ as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, struct seg *segsav; int as_lock_held; klwp_t *lwp = ttolwp(curthread); + zone_t *zonep = curzone; int is_xhat = 0; int holding_wpage = 0; extern struct seg_ops segdev_ops; @@ -928,6 +931,23 @@ retry: if (as == &kas) CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1); CPU_STATS_EXIT_K(); + if (zonep->zone_pg_flt_delay != 0) { + /* + * The zone in which this process is running + * is currently over its physical memory cap. + * Throttle page faults to help the user-land + * memory capper catch up. Note that + * drv_usectohz() rounds up. + */ + atomic_add_64(&zonep->zone_pf_throttle, 1); + atomic_add_64(&zonep->zone_pf_throttle_usec, + zonep->zone_pg_flt_delay); + if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1)) + drv_usecwait(zonep->zone_pg_flt_delay); + else + delay(drv_usectohz( + zonep->zone_pg_flt_delay)); + } break; } } diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c index 7233581227..39ace0b3c2 100644 --- a/usr/src/uts/common/vm/vm_pvn.c +++ b/usr/src/uts/common/vm/vm_pvn.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -431,7 +432,14 @@ pvn_write_done(page_t *plist, int flags) page_io_unlock(pp); page_unlock(pp); } - } else if (flags & B_INVAL) { + } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { + /* + * If B_INVALCURONLY is set, then we handle that case + * in the next conditional if hat_page_is_mapped() + * indicates that there are no additional mappings + * to the page. + */ + /* * XXX - Failed writes with B_INVAL set are * not handled appropriately. @@ -572,8 +580,9 @@ pvn_write_done(page_t *plist, int flags) } /* - * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, - * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster + * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE, + * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}. + * B_DELWRI indicates that this page is part of a kluster * operation and is only to be considered if it doesn't involve any * waiting here. B_TRUNC indicates that the file is being truncated * and so no i/o needs to be done. B_FORCE indicates that the page @@ -627,13 +636,17 @@ pvn_getdirty(page_t *pp, int flags) * If we want to free or invalidate the page then * we need to unload it so that anyone who wants * it will have to take a minor fault to get it.
+ * If we are only invalidating the page for the + * current process, then pass in a different flag. * Otherwise, we're just writing the page back so we * need to sync up the hardware and software mod bit to * detect any future modifications. We clear the * software mod bit when we put the page on the dirty * list. */ - if (flags & (B_INVAL | B_FREE)) { + if (flags & B_INVALCURONLY) { + (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD); + } else if (flags & (B_INVAL | B_FREE)) { (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); } else { (void) hat_pagesync(pp, HAT_SYNC_ZERORM); @@ -645,7 +658,7 @@ pvn_getdirty(page_t *pp, int flags) * list after all. */ page_io_unlock(pp); - if (flags & B_INVAL) { + if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { /*LINTED: constant in conditional context*/ VN_DISPOSE(pp, B_INVAL, 0, kcred); } else if (flags & B_FREE) { @@ -657,6 +670,9 @@ pvn_getdirty(page_t *pp, int flags) * of VOP_PUTPAGE() who prefer freeing the * page _only_ if no one else is accessing it. * E.g. segmap_release() + * We also take this path for B_INVALCURONLY and + * let page_release call VN_DISPOSE if no one else is + * using the page. * * The above hat_ismod() check is useless because: * (1) we may not be holding SE_EXCL lock; @@ -681,7 +697,7 @@ pvn_getdirty(page_t *pp, int flags) * We'll detect the fact that they used it when the * i/o is done and avoid freeing the page. */ - if (flags & B_FREE) + if (flags & (B_FREE | B_INVALCURONLY)) page_downgrade(pp); diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c index d422f8d0e8..8f425e9e4f 100644 --- a/usr/src/uts/common/vm/vm_usage.c +++ b/usr/src/uts/common/vm/vm_usage.c @@ -25,6 +25,10 @@ */ /* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* * vm_usage * * This file implements the getvmusage() private system call. @@ -114,7 +118,7 @@ * For accurate counting of map-shared and COW-shared pages. * * - visited private anons (refcnt > 1) for each collective. - * (entity->vme_anon_hash) + * (entity->vme_anon) * For accurate counting of COW-shared pages. * * The common accounting structure is the vmu_entity_t, which represents @@ -152,6 +156,7 @@ #include <sys/vm_usage.h> #include <sys/zone.h> #include <sys/sunddi.h> +#include <sys/sysmacros.h> #include <sys/avl.h> #include <vm/anon.h> #include <vm/as.h> @@ -199,6 +204,14 @@ typedef struct vmu_object { } vmu_object_t; /* + * Node for tree of visited COW anons. + */ +typedef struct vmu_anon { + avl_node_t vma_node; + uintptr_t vma_addr; +} vmu_anon_t; + +/* * Entity by which to count results. * * The entity structure keeps the current rss/swap counts for each entity @@ -221,7 +234,7 @@ typedef struct vmu_entity { struct vmu_entity *vme_next_calc; mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */ mod_hash_t *vme_amp_hash; /* shared amps visited for entity */ - mod_hash_t *vme_anon_hash; /* COW anons visited for entity */ + avl_tree_t vme_anon; /* COW anons visited for entity */ vmusage_t vme_result; /* identifies entity and results */ } vmu_entity_t; @@ -324,6 +337,23 @@ bounds_cmp(const void *bnd1, const void *bnd2) } /* + * Comparison routine for our AVL tree of anon structures. + */ +static int +vmu_anon_cmp(const void *lhs, const void *rhs) +{ + const vmu_anon_t *l = lhs, *r = rhs; + + if (l->vma_addr == r->vma_addr) + return (0); + + if (l->vma_addr < r->vma_addr) + return (-1); + + return (1); +} + +/* + * Save a bound on the free list.
*/ static void @@ -363,13 +393,18 @@ static void vmu_free_entity(mod_hash_val_t val) { vmu_entity_t *entity = (vmu_entity_t *)val; + vmu_anon_t *anon; + void *cookie = NULL; if (entity->vme_vnode_hash != NULL) i_mod_hash_clear_nosync(entity->vme_vnode_hash); if (entity->vme_amp_hash != NULL) i_mod_hash_clear_nosync(entity->vme_amp_hash); - if (entity->vme_anon_hash != NULL) - i_mod_hash_clear_nosync(entity->vme_anon_hash); + + while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL) + kmem_free(anon, sizeof (vmu_anon_t)); + + avl_destroy(&entity->vme_anon); entity->vme_next = vmu_data.vmu_free_entities; vmu_data.vmu_free_entities = entity; @@ -485,10 +520,10 @@ vmu_alloc_entity(id_t id, int type, id_t zoneid) "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, sizeof (struct anon_map)); - if (entity->vme_anon_hash == NULL) - entity->vme_anon_hash = mod_hash_create_ptrhash( - "vmusage anon hash", VMUSAGE_HASH_SIZE, - mod_hash_null_valdtor, sizeof (struct anon)); + VERIFY(avl_first(&entity->vme_anon) == NULL); + + avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon), + offsetof(struct vmu_anon, vma_node)); entity->vme_next = vmu_data.vmu_entities; vmu_data.vmu_entities = entity; @@ -518,7 +553,8 @@ vmu_alloc_zone(id_t id) zone->vmz_id = id; - if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) + if ((vmu_data.vmu_calc_flags & + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0) zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | @@ -613,21 +649,19 @@ vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) } static int -vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) +vmu_find_insert_anon(vmu_entity_t *entity, void *key) { - int ret; - caddr_t val; + vmu_anon_t anon, *ap; - ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t *)&val); + anon.vma_addr = (uintptr_t)key; - if (ret == 0) + if (avl_find(&entity->vme_anon, &anon, NULL) != NULL) return (0); - ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t)key, (mod_hash_hndl_t)0); + ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP); + ap->vma_addr = (uintptr_t)key; - ASSERT(ret == 0); + avl_add(&entity->vme_anon, ap); return (1); } @@ -937,7 +971,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, if (ap != NULL && vn != NULL && vn->v_pages != NULL && (page = page_exists(vn, off)) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -1024,7 +1061,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, if (vnode->v_pages != NULL && (page = page_exists(vnode, ptob(index))) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -1304,6 +1344,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) } /* + * Pages on the free list aren't counted for the rss. + */ + if (PP_ISFREE(page)) + continue; + + /* * Assume anon structs with a refcnt * of 1 are not COW shared, so there * is no reason to track them per entity. 
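The hunks above replace the per-entity vme_anon_hash with an AVL tree (vme_anon) keyed on the anon struct's address: vmu_find_insert_anon() now does an avl_find() and, on a miss, allocates a vmu_anon_t and avl_add()s it, while vmu_free_entity() drains the tree with avl_destroy_nodes(). The following is a minimal userland sketch of that find-or-insert pattern, assuming the illumos libavl interface (compile with -lavl); the visited_t and visited_insert() names are illustrative only and are not part of this change.

#include <sys/types.h>
#include <sys/avl.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>

typedef struct visited {
	avl_node_t	v_node;
	uintptr_t	v_addr;
} visited_t;

/* Order nodes by address, as vmu_anon_cmp() does above. */
static int
visited_cmp(const void *lhs, const void *rhs)
{
	const visited_t *l = lhs, *r = rhs;

	if (l->v_addr == r->v_addr)
		return (0);
	return (l->v_addr < r->v_addr ? -1 : 1);
}

/* Return 1 if addr was newly inserted, 0 if it had been seen already. */
static int
visited_insert(avl_tree_t *tree, uintptr_t addr)
{
	visited_t find, *vp;

	find.v_addr = addr;
	if (avl_find(tree, &find, NULL) != NULL)
		return (0);

	vp = malloc(sizeof (visited_t));
	vp->v_addr = addr;
	avl_add(tree, vp);
	return (1);
}

int
main(void)
{
	avl_tree_t tree;
	visited_t *vp;
	void *cookie = NULL;
	uintptr_t addrs[] = { 0x1000, 0x2000, 0x1000 };
	int i, distinct = 0;

	avl_create(&tree, visited_cmp, sizeof (visited_t),
	    offsetof(visited_t, v_node));

	for (i = 0; i < 3; i++)
		distinct += visited_insert(&tree, addrs[i]);

	(void) printf("%d distinct addresses\n", distinct);	/* prints 2 */

	/* Teardown mirrors vmu_free_entity(): drain the tree, then destroy. */
	while ((vp = avl_destroy_nodes(&tree, &cookie)) != NULL)
		free(vp);
	avl_destroy(&tree);
	return (0);
}

Relative to a pointer-keyed mod_hash, the AVL tree needs no up-front table sizing and makes teardown a simple avl_destroy_nodes() walk, which is exactly what the new vmu_free_entity() and vmu_free_extra() code relies on.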
@@ -1320,8 +1366,7 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) * Track COW anons per entity so * they are not double counted. */ - if (vmu_find_insert_anon(entity->vme_anon_hash, - (caddr_t)ap) == 0) + if (vmu_find_insert_anon(entity, ap) == 0) continue; result->vmu_rss_all += (pgcnt << PAGESHIFT); @@ -1461,8 +1506,9 @@ vmu_calculate_proc(proc_t *p) entities = tmp; } if (vmu_data.vmu_calc_flags & - (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | - VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE | + VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | + VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, @@ -1594,8 +1640,7 @@ vmu_free_extra() mod_hash_destroy_hash(te->vme_vnode_hash); if (te->vme_amp_hash != NULL) mod_hash_destroy_hash(te->vme_amp_hash); - if (te->vme_anon_hash != NULL) - mod_hash_destroy_hash(te->vme_anon_hash); + VERIFY(avl_first(&te->vme_anon) == NULL); kmem_free(te, sizeof (vmu_entity_t)); } while (vmu_data.vmu_free_zones != NULL) { @@ -1739,12 +1784,34 @@ vmu_cache_rele(vmu_cache_t *cache) } /* + * When new data is calculated, update the phys_mem rctl usage value in the + * zones. + */ +static void +vmu_update_zone_rctls(vmu_cache_t *cache) +{ + vmusage_t *rp; + size_t i = 0; + zone_t *zp; + + for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) { + if (rp->vmu_type == VMUSAGE_ZONE && + rp->vmu_zoneid != ALL_ZONES) { + if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) { + zp->zone_phys_mem = rp->vmu_rss_all; + zone_rele(zp); + } + } + } +} + +/* * Copy out the cached results to a caller. Inspect the callers flags * and zone to determine which cached results should be copied. */ static int vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, - uint_t flags, int cpflg) + uint_t flags, id_t req_zone_id, int cpflg) { vmusage_t *result, *out_result; vmusage_t dummy; @@ -1763,7 +1830,7 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, /* figure out what results the caller is interested in. 
*/ if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone) types |= VMUSAGE_SYSTEM; - if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) + if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) types |= VMUSAGE_ZONE; if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) @@ -1826,26 +1893,33 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, continue; } - /* Skip "other zone" results if not requested */ - if (result->vmu_zoneid != curproc->p_zone->zone_id) { - if (result->vmu_type == VMUSAGE_ZONE && - (flags & VMUSAGE_ALL_ZONES) == 0) - continue; - if (result->vmu_type == VMUSAGE_PROJECTS && - (flags & (VMUSAGE_ALL_PROJECTS | - VMUSAGE_COL_PROJECTS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_TASKS && - (flags & VMUSAGE_ALL_TASKS) == 0) - continue; - if (result->vmu_type == VMUSAGE_RUSERS && - (flags & (VMUSAGE_ALL_RUSERS | - VMUSAGE_COL_RUSERS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_EUSERS && - (flags & (VMUSAGE_ALL_EUSERS | - VMUSAGE_COL_EUSERS)) == 0) + if (result->vmu_type == VMUSAGE_ZONE && + flags & VMUSAGE_A_ZONE) { + /* Skip non-requested zone results */ + if (result->vmu_zoneid != req_zone_id) continue; + } else { + /* Skip "other zone" results if not requested */ + if (result->vmu_zoneid != curproc->p_zone->zone_id) { + if (result->vmu_type == VMUSAGE_ZONE && + (flags & VMUSAGE_ALL_ZONES) == 0) + continue; + if (result->vmu_type == VMUSAGE_PROJECTS && + (flags & (VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_TASKS && + (flags & VMUSAGE_ALL_TASKS) == 0) + continue; + if (result->vmu_type == VMUSAGE_RUSERS && + (flags & (VMUSAGE_ALL_RUSERS | + VMUSAGE_COL_RUSERS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_EUSERS && + (flags & (VMUSAGE_ALL_EUSERS | + VMUSAGE_COL_EUSERS)) == 0) + continue; + } } count++; if (out_result != NULL) { @@ -1901,10 +1975,12 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) int cacherecent = 0; hrtime_t now; uint_t flags_orig; + id_t req_zone_id; /* * Non-global zones cannot request system wide and/or collated - * results, or the system result, so munge the flags accordingly. + * results, or the system result, or usage of another zone, so munge + * the flags accordingly. 
*/ flags_orig = flags; if (curproc->p_zone != global_zone) { @@ -1924,6 +2000,10 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) flags &= ~VMUSAGE_SYSTEM; flags |= VMUSAGE_ZONE; } + if (flags & VMUSAGE_A_ZONE) { + flags &= ~VMUSAGE_A_ZONE; + flags |= VMUSAGE_ZONE; + } } /* Check for unknown flags */ @@ -1934,6 +2014,21 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) if ((flags & VMUSAGE_MASK) == 0) return (set_errno(EINVAL)); + /* If requesting results for a specific zone, get the zone ID */ + if (flags & VMUSAGE_A_ZONE) { + size_t bufsize; + vmusage_t zreq; + + if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg)) + return (set_errno(EFAULT)); + /* Requested zone ID is passed in buf, so 0 len not allowed */ + if (bufsize == 0) + return (set_errno(EINVAL)); + if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg)) + return (set_errno(EFAULT)); + req_zone_id = zreq.vmu_id; + } + mutex_enter(&vmu_data.vmu_lock); now = gethrtime(); @@ -1953,7 +2048,7 @@ start: mutex_exit(&vmu_data.vmu_lock); ret = vmu_copyout_results(cache, buf, nres, flags_orig, - cpflg); + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); if (vmu_data.vmu_pending_waiters > 0) @@ -2009,8 +2104,11 @@ start: mutex_exit(&vmu_data.vmu_lock); + /* update zone's phys. mem. rctl usage */ + vmu_update_zone_rctls(cache); /* copy cache */ - ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg); + ret = vmu_copyout_results(cache, buf, nres, flags_orig, + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); mutex_exit(&vmu_data.vmu_lock);
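For the new VMUSAGE_A_ZONE mode, vm_getusage() above reads the requested zone ID out of vmu_id in the first vmusage_t of the caller's buffer and rejects a zero-length buffer, and vmu_copyout_results() then returns only that zone's VMUSAGE_ZONE row; a non-global-zone caller has the flag quietly downgraded to plain VMUSAGE_ZONE. A hypothetical userland caller might look like the sketch below. It assumes VMUSAGE_A_ZONE is exported by the <sys/vm_usage.h> change elsewhere in this changeset and uses the existing getvmusage(2) wrapper; it is illustrative, not part of the patch.

#include <sys/types.h>
#include <sys/vm_usage.h>
#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char *argv[])
{
	vmusage_t buf[8];
	size_t nres = sizeof (buf) / sizeof (buf[0]);
	size_t i;

	if (argc != 2) {
		(void) fprintf(stderr, "usage: %s zoneid\n", argv[0]);
		return (2);
	}

	/*
	 * Per the copyin logic above, the target zone ID rides in
	 * vmu_id of the first element and *nres must be non-zero.
	 */
	buf[0].vmu_id = atoi(argv[1]);

	/* Accept cached results up to 30 seconds old. */
	if (getvmusage(VMUSAGE_A_ZONE, 30, buf, &nres) != 0) {
		perror("getvmusage");
		return (1);
	}

	for (i = 0; i < nres && i < sizeof (buf) / sizeof (buf[0]); i++)
		(void) printf("zone %d: rss %llu bytes, swap %llu bytes\n",
		    (int)buf[i].vmu_zoneid,
		    (u_longlong_t)buf[i].vmu_rss_all,
		    (u_longlong_t)buf[i].vmu_swap_all);
	return (0);
}

A global-zone observer such as a per-zone memory capper could use this to poll a single zone's RSS without also computing and copying out results for every other zone, which is the point of the VMUSAGE_A_ZONE filtering added in vmu_copyout_results() above.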