author     Robert Mustacchi <rm@joyent.com>    2013-09-29 13:06:51 -0700
committer  Robert Mustacchi <rm@joyent.com>    2014-01-21 18:20:39 -0800
commit     4f364e7c95ee7fd9d5bbeddc1940e92405bb0e72
tree       8f95ebd8dfeb9ab49e53704d900b2d0f0f217b37
parent     38849194df07385a46363bb46861688fde59a98a
4489 need ptcumem
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
31 files changed, 2184 insertions, 55 deletions
diff --git a/exception_lists/check_rtime b/exception_lists/check_rtime index ce606dc0ea..fce897b09b 100644 --- a/exception_lists/check_rtime +++ b/exception_lists/check_rtime @@ -63,6 +63,8 @@ SKIP ^usr/lib/sysevent/modules/picl_slm.so$ # Objects that are allowed to have executable data segments EXEC_DATA ^MACH(lib)/ld\.so\.1$ EXEC_DATA ^lib/libc\.so\.1$ # 6524709, 32-bit, needed for x86 only +EXEC_DATA ^lib/amd64/libumem\.so\.1$ # ptcumem +EXEC_DATA ^lib/libumem\.so\.1$ # ptcumem EXEC_DATA ^opt/SUNWdtrt/tst/.*/ustack/tst\.helper\.exe$ EXEC_DATA ^platform/.*/MACH(kernel)/unix$ EXEC_DATA ^platform/.*/multiboot$ diff --git a/usr/src/cmd/mdb/common/kmdb/kmdb_umemglue.c b/usr/src/cmd/mdb/common/kmdb/kmdb_umemglue.c index 4193b0dcd5..c2289ec7f7 100644 --- a/usr/src/cmd/mdb/common/kmdb/kmdb_umemglue.c +++ b/usr/src/cmd/mdb/common/kmdb/kmdb_umemglue.c @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <mdb/mdb_debug.h> #include <mdb/mdb_err.h> #include <mdb/mdb_io.h> @@ -101,6 +99,14 @@ umem_atomic_add_64(uint64_t *target, int64_t delta) *target = *target + delta; } +uint64_t +umem_atomic_swap_64(volatile uint64_t *t, uint64_t v) +{ + uint64_t old = *t; + *t = v; + return (old); +} + /* * Standalone umem must be manually initialized */ diff --git a/usr/src/cmd/mdb/common/modules/libc/libc.c b/usr/src/cmd/mdb/common/modules/libc/libc.c index 27dcade228..44e4f49b87 100644 --- a/usr/src/cmd/mdb/common/modules/libc/libc.c +++ b/usr/src/cmd/mdb/common/modules/libc/libc.c @@ -23,6 +23,9 @@ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ #include <sys/mdb_modapi.h> #include <mdb/mdb_whatis.h> @@ -681,6 +684,12 @@ d_ulwp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) prt_addr((void *)(addr + OFFSET(ul_spinlock)), 1), prt_addr((void *)(addr + OFFSET(ul_fpuenv)), 0)); + HD("tmem.size &tmem.roots"); + mdb_printf(OFFSTR "%-21H %s\n", + OFFSET(ul_tmem), + ulwp.ul_tmem.tm_size, + prt_addr((void *)(addr + OFFSET(ul_tmem) + sizeof (size_t)), 0)); + return (DCMD_OK); } diff --git a/usr/src/cmd/mdb/common/modules/libumem/libumem.c b/usr/src/cmd/mdb/common/modules/libumem/libumem.c index 4a77c5aa82..0984edbdf0 100644 --- a/usr/src/cmd/mdb/common/modules/libumem/libumem.c +++ b/usr/src/cmd/mdb/common/modules/libumem/libumem.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + #include "umem.h" #include <libproc.h> #include <mdb/mdb_modapi.h> @@ -34,6 +38,8 @@ #include <umem_impl.h> #include <sys/vmem_impl_user.h> +#include <thr_uberdata.h> +#include <stdio.h> #include "umem_pagesize.h" @@ -44,24 +50,33 @@ typedef struct datafmt { char *fmt; } datafmt_t; +static datafmt_t ptcfmt[] = { + { " ", "tid", "---", "%3u " }, + { " memory", " cached", "-------", "%7lH " }, + { " %", "cap", "---", "%3u " }, + { " %", NULL, "---", "%3u " }, + { NULL, NULL, NULL, NULL } +}; + static datafmt_t umemfmt[] = { { "cache ", "name ", "-------------------------", "%-25s " }, { " buf", " size", "------", "%6u " }, - { " buf", "in use", "------", "%6u " }, - { " buf", " total", "------", "%6u " }, - { " memory", " in use", "---------", "%9u " }, + { " buf", " in use", "-------", "%7u " }, + { " buf", " in ptc", "-------", "%7s " }, + { " buf", " total", "-------", "%7u " }, + { " memory", " in use", "-------", "%7H " }, { " alloc", " succeed", "---------", "%9u " }, - { "alloc", " fail", "-----", "%5llu " }, + { "alloc", " fail", "-----", "%5llu" }, { NULL, NULL, NULL, NULL } }; static datafmt_t vmemfmt[] = { { "vmem ", "name ", "-------------------------", "%-*s " }, - { " memory", " in use", "---------", "%9llu " }, - { " memory", " total", "----------", "%10llu " }, - { " memory", " import", "---------", "%9llu " }, + { " memory", " in use", "---------", "%9H " }, + { " memory", " total", "----------", "%10H " }, + { " memory", " import", "---------", "%9H " }, { " alloc", " succeed", "---------", "%9llu " }, { "alloc", " fail", "-----", "%5llu " }, { NULL, NULL, NULL, NULL } @@ -105,14 +120,105 @@ typedef struct umastat_vmem { int kv_fail; } umastat_vmem_t; +/*ARGSUSED*/ +static int +umastat_cache_nptc(uintptr_t addr, const umem_cache_t *cp, int *nptc) +{ + if (!(cp->cache_flags & UMF_PTC)) + return (WALK_NEXT); + + (*nptc)++; + return (WALK_NEXT); +} + +/*ARGSUSED*/ +static int +umastat_cache_hdr(uintptr_t addr, const umem_cache_t *cp, void *ignored) +{ + if (!(cp->cache_flags & UMF_PTC)) + return (WALK_NEXT); + + mdb_printf("%3d ", cp->cache_bufsize); + return (WALK_NEXT); +} + +/*ARGSUSED*/ +static int +umastat_lwp_ptc(uintptr_t addr, void *buf, int *nbufs) +{ + (*nbufs)++; + return (WALK_NEXT); +} + +/*ARGSUSED*/ +static int +umastat_lwp_cache(uintptr_t addr, const umem_cache_t *cp, ulwp_t *ulwp) +{ + char walk[60]; + int nbufs = 0; + + if (!(cp->cache_flags & UMF_PTC)) + return (WALK_NEXT); + + (void) snprintf(walk, sizeof (walk), "umem_ptc_%d", cp->cache_bufsize); + + if (mdb_pwalk(walk, (mdb_walk_cb_t)umastat_lwp_ptc, + &nbufs, (uintptr_t)ulwp->ul_self) == -1) { + mdb_warn("unable to walk '%s'", walk); + return (WALK_ERR); + } + + mdb_printf("%3d ", ulwp->ul_tmem.tm_size ? 
+ (nbufs * cp->cache_bufsize * 100) / ulwp->ul_tmem.tm_size : 0); + + return (WALK_NEXT); +} + +/*ARGSUSED*/ +static int +umastat_lwp(uintptr_t addr, const ulwp_t *ulwp, void *ignored) +{ + size_t size; + datafmt_t *dfp = ptcfmt; + + mdb_printf((dfp++)->fmt, ulwp->ul_lwpid); + mdb_printf((dfp++)->fmt, ulwp->ul_tmem.tm_size); + + if (umem_readvar(&size, "umem_ptc_size") == -1) { + mdb_warn("unable to read 'umem_ptc_size'"); + return (WALK_ERR); + } + + mdb_printf((dfp++)->fmt, (ulwp->ul_tmem.tm_size * 100) / size); + + if (mdb_walk("umem_cache", + (mdb_walk_cb_t)umastat_lwp_cache, (void *)ulwp) == -1) { + mdb_warn("can't walk 'umem_cache'"); + return (WALK_ERR); + } + + mdb_printf("\n"); + + return (WALK_NEXT); +} + +/*ARGSUSED*/ +static int +umastat_cache_ptc(uintptr_t addr, const void *ignored, int *nptc) +{ + (*nptc)++; + return (WALK_NEXT); +} + static int umastat_cache(uintptr_t addr, const umem_cache_t *cp, umastat_vmem_t **kvp) { umastat_vmem_t *kv; datafmt_t *dfp = umemfmt; + char buf[10]; int magsize; - int avail, alloc, total; + int avail, alloc, total, nptc = 0; size_t meminuse = (cp->cache_slab_create - cp->cache_slab_destroy) * cp->cache_slabsize; @@ -130,6 +236,21 @@ umastat_cache(uintptr_t addr, const umem_cache_t *cp, umastat_vmem_t **kvp) (void) mdb_pwalk("umem_cpu_cache", cpu_avail, &avail, addr); (void) mdb_pwalk("umem_slab_partial", slab_avail, &avail, addr); + if (cp->cache_flags & UMF_PTC) { + char walk[60]; + + (void) snprintf(walk, sizeof (walk), + "umem_ptc_%d", cp->cache_bufsize); + + if (mdb_walk(walk, + (mdb_walk_cb_t)umastat_cache_ptc, &nptc) == -1) { + mdb_warn("unable to walk '%s'", walk); + return (WALK_ERR); + } + + (void) snprintf(buf, sizeof (buf), "%d", nptc); + } + for (kv = *kvp; kv != NULL; kv = kv->kv_next) { if (kv->kv_addr == (uintptr_t)cp->cache_arena) goto out; @@ -147,6 +268,7 @@ out: mdb_printf((dfp++)->fmt, cp->cache_name); mdb_printf((dfp++)->fmt, cp->cache_bufsize); mdb_printf((dfp++)->fmt, total - avail); + mdb_printf((dfp++)->fmt, cp->cache_flags & UMF_PTC ? buf : "-"); mdb_printf((dfp++)->fmt, total); mdb_printf((dfp++)->fmt, meminuse); mdb_printf((dfp++)->fmt, alloc); @@ -165,8 +287,8 @@ umastat_vmem_totals(uintptr_t addr, const vmem_t *v, umastat_vmem_t *kv) if (kv == NULL || kv->kv_alloc == 0) return (WALK_NEXT); - mdb_printf("Total [%s]%*s %6s %6s %6s %9u %9u %5u\n", v->vm_name, - 17 - strlen(v->vm_name), "", "", "", "", + mdb_printf("Total [%s]%*s %6s %7s %7s %7s %7H %9u %5u\n", v->vm_name, + 17 - strlen(v->vm_name), "", "", "", "", "", kv->kv_meminuse, kv->kv_alloc, kv->kv_fail); return (WALK_NEXT); @@ -209,20 +331,67 @@ umastat(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { umastat_vmem_t *kv = NULL; datafmt_t *dfp; + int nptc = 0, i; if (argc != 0) return (DCMD_USAGE); + /* + * We need to determine if we have any caches that have per-thread + * caching enabled. 
+ */ + if (mdb_walk("umem_cache", + (mdb_walk_cb_t)umastat_cache_nptc, &nptc) == -1) { + mdb_warn("can't walk 'umem_cache'"); + return (DCMD_ERR); + } + + if (nptc) { + for (dfp = ptcfmt; dfp->hdr2 != NULL; dfp++) + mdb_printf("%s ", dfp->hdr1); + + for (i = 0; i < nptc; i++) + mdb_printf("%s ", dfp->hdr1); + + mdb_printf("\n"); + + for (dfp = ptcfmt; dfp->hdr2 != NULL; dfp++) + mdb_printf("%s ", dfp->hdr2); + + if (mdb_walk("umem_cache", + (mdb_walk_cb_t)umastat_cache_hdr, NULL) == -1) { + mdb_warn("can't walk 'umem_cache'"); + return (DCMD_ERR); + } + + mdb_printf("\n"); + + for (dfp = ptcfmt; dfp->hdr2 != NULL; dfp++) + mdb_printf("%s ", dfp->dashes); + + for (i = 0; i < nptc; i++) + mdb_printf("%s ", dfp->dashes); + + mdb_printf("\n"); + + if (mdb_walk("ulwp", (mdb_walk_cb_t)umastat_lwp, NULL) == -1) { + mdb_warn("can't walk 'ulwp'"); + return (DCMD_ERR); + } + + mdb_printf("\n"); + } + for (dfp = umemfmt; dfp->hdr1 != NULL; dfp++) - mdb_printf("%s ", dfp->hdr1); + mdb_printf("%s%s", dfp == umemfmt ? "" : " ", dfp->hdr1); mdb_printf("\n"); for (dfp = umemfmt; dfp->hdr1 != NULL; dfp++) - mdb_printf("%s ", dfp->hdr2); + mdb_printf("%s%s", dfp == umemfmt ? "" : " ", dfp->hdr2); mdb_printf("\n"); for (dfp = umemfmt; dfp->hdr1 != NULL; dfp++) - mdb_printf("%s ", dfp->dashes); + mdb_printf("%s%s", dfp == umemfmt ? "" : " ", dfp->dashes); mdb_printf("\n"); if (mdb_walk("umem_cache", (mdb_walk_cb_t)umastat_cache, &kv) == -1) { @@ -231,7 +400,7 @@ umastat(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) } for (dfp = umemfmt; dfp->hdr1 != NULL; dfp++) - mdb_printf("%s ", dfp->dashes); + mdb_printf("%s%s", dfp == umemfmt ? "" : " ", dfp->dashes); mdb_printf("\n"); if (mdb_walk("vmem", (mdb_walk_cb_t)umastat_vmem_totals, kv) == -1) { diff --git a/usr/src/cmd/mdb/common/modules/libumem/umem.c b/usr/src/cmd/mdb/common/modules/libumem/umem.c index 26a62c7b52..73dd4d6e89 100644 --- a/usr/src/cmd/mdb/common/modules/libumem/umem.c +++ b/usr/src/cmd/mdb/common/modules/libumem/umem.c @@ -24,7 +24,7 @@ */ /* - * Copyright 2011 Joyent, Inc. All rights reserved. + * Copyright 2012 Joyent, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. */ @@ -36,6 +36,8 @@ #include <alloca.h> #include <limits.h> #include <mdb/mdb_whatis.h> +#include <thr_uberdata.h> +#include <stdio.h> #include "misc.h" #include "leaky.h" @@ -104,12 +106,58 @@ umem_update_variables(void) return (0); } +static int +umem_ptc_walk_init(mdb_walk_state_t *wsp) +{ + if (wsp->walk_addr == NULL) { + if (mdb_layered_walk("ulwp", wsp) == -1) { + mdb_warn("couldn't walk 'ulwp'"); + return (WALK_ERR); + } + } + + return (WALK_NEXT); +} + +static int +umem_ptc_walk_step(mdb_walk_state_t *wsp) +{ + uintptr_t this; + int rval; + + if (wsp->walk_layer != NULL) { + this = (uintptr_t)((ulwp_t *)wsp->walk_layer)->ul_self + + (uintptr_t)wsp->walk_arg; + } else { + this = wsp->walk_addr + (uintptr_t)wsp->walk_arg; + } + + for (;;) { + if (mdb_vread(&this, sizeof (void *), this) == -1) { + mdb_warn("couldn't read ptc buffer at %p", this); + return (WALK_ERR); + } + + if (this == NULL) + break; + + rval = wsp->walk_callback(this, &this, wsp->walk_cbdata); + + if (rval != WALK_NEXT) + return (rval); + } + + return (wsp->walk_layer != NULL ? 
WALK_NEXT : WALK_DONE); +} + /*ARGSUSED*/ static int -umem_init_walkers(uintptr_t addr, const umem_cache_t *c, void *ignored) +umem_init_walkers(uintptr_t addr, const umem_cache_t *c, int *sizes) { mdb_walker_t w; char descr[64]; + char name[64]; + int i; (void) mdb_snprintf(descr, sizeof (descr), "walk the %s cache", c->cache_name); @@ -124,6 +172,45 @@ umem_init_walkers(uintptr_t addr, const umem_cache_t *c, void *ignored) if (mdb_add_walker(&w) == -1) mdb_warn("failed to add %s walker", c->cache_name); + if (!(c->cache_flags & UMF_PTC)) + return (WALK_NEXT); + + /* + * For the per-thread cache walker, the address is the offset in the + * tm_roots[] array of the ulwp_t. + */ + for (i = 0; sizes[i] != 0; i++) { + if (sizes[i] == c->cache_bufsize) + break; + } + + if (sizes[i] == 0) { + mdb_warn("cache %s is cached per-thread, but could not find " + "size in umem_alloc_sizes\n", c->cache_name); + return (WALK_NEXT); + } + + if (i >= NTMEMBASE) { + mdb_warn("index for %s (%d) exceeds root slots (%d)\n", + c->cache_name, i, NTMEMBASE); + return (WALK_NEXT); + } + + (void) mdb_snprintf(name, sizeof (name), + "umem_ptc_%d", c->cache_bufsize); + (void) mdb_snprintf(descr, sizeof (descr), + "walk the per-thread cache for %s", c->cache_name); + + w.walk_name = name; + w.walk_descr = descr; + w.walk_init = umem_ptc_walk_init; + w.walk_step = umem_ptc_walk_step; + w.walk_fini = NULL; + w.walk_init_arg = (void *)offsetof(ulwp_t, ul_tmem.tm_roots[i]); + + if (mdb_add_walker(&w) == -1) + mdb_warn("failed to add %s walker", w.walk_name); + return (WALK_NEXT); } @@ -132,6 +219,8 @@ static void umem_statechange_cb(void *arg) { static int been_ready = 0; + GElf_Sym sym; + int *sizes; #ifndef _KMDB leaky_cleanup(1); /* state changes invalidate leaky state */ @@ -147,7 +236,25 @@ umem_statechange_cb(void *arg) return; been_ready = 1; - (void) mdb_walk("umem_cache", (mdb_walk_cb_t)umem_init_walkers, NULL); + + /* + * In order to determine the tm_roots offset of any cache that is + * cached per-thread, we need to have the umem_alloc_sizes array. + * Read this, assuring that it is zero-terminated. 
+ */ + if (umem_lookup_by_name("umem_alloc_sizes", &sym) == -1) { + mdb_warn("unable to lookup 'umem_alloc_sizes'"); + return; + } + + sizes = mdb_zalloc(sym.st_size + sizeof (int), UM_SLEEP | UM_GC); + + if (mdb_vread(sizes, sym.st_size, (uintptr_t)sym.st_value) == -1) { + mdb_warn("couldn't read 'umem_alloc_sizes'"); + return; + } + + (void) mdb_walk("umem_cache", (mdb_walk_cb_t)umem_init_walkers, sizes); } int @@ -788,9 +895,9 @@ umem_estimate_allocated(uintptr_t addr, const umem_cache_t *cp) } \ } -int +static int umem_read_magazines(umem_cache_t *cp, uintptr_t addr, - void ***maglistp, size_t *magcntp, size_t *magmaxp, int alloc_flags) + void ***maglistp, size_t *magcntp, size_t *magmaxp) { umem_magazine_t *ump, *mp; void **maglist = NULL; @@ -807,7 +914,7 @@ umem_read_magazines(umem_cache_t *cp, uintptr_t addr, *maglistp = NULL; *magcntp = 0; *magmaxp = 0; - return (WALK_NEXT); + return (0); } /* @@ -828,11 +935,11 @@ umem_read_magazines(umem_cache_t *cp, uintptr_t addr, if (magbsize >= PAGESIZE / 2) { mdb_warn("magazine size for cache %p unreasonable (%x)\n", addr, magbsize); - return (WALK_ERR); + return (-1); } - maglist = mdb_alloc(magmax * sizeof (void *), alloc_flags); - mp = mdb_alloc(magbsize, alloc_flags); + maglist = mdb_alloc(magmax * sizeof (void *), UM_SLEEP); + mp = mdb_alloc(magbsize, UM_SLEEP); if (mp == NULL || maglist == NULL) goto fail; @@ -875,23 +982,80 @@ umem_read_magazines(umem_cache_t *cp, uintptr_t addr, dprintf(("magazine layer: %d buffers\n", magcnt)); - if (!(alloc_flags & UM_GC)) - mdb_free(mp, magbsize); + mdb_free(mp, magbsize); *maglistp = maglist; *magcntp = magcnt; *magmaxp = magmax; - return (WALK_NEXT); + return (0); fail: - if (!(alloc_flags & UM_GC)) { - if (mp) - mdb_free(mp, magbsize); - if (maglist) - mdb_free(maglist, magmax * sizeof (void *)); + if (mp) + mdb_free(mp, magbsize); + if (maglist) + mdb_free(maglist, magmax * sizeof (void *)); + + return (-1); +} + +typedef struct umem_read_ptc_walk { + void **urpw_buf; + size_t urpw_cnt; + size_t urpw_max; +} umem_read_ptc_walk_t; + +/*ARGSUSED*/ +static int +umem_read_ptc_walk_buf(uintptr_t addr, + const void *ignored, umem_read_ptc_walk_t *urpw) +{ + if (urpw->urpw_cnt == urpw->urpw_max) { + size_t nmax = urpw->urpw_max ? 
(urpw->urpw_max << 1) : 1; + void **new = mdb_zalloc(nmax * sizeof (void *), UM_SLEEP); + + if (nmax > 1) { + size_t osize = urpw->urpw_max * sizeof (void *); + bcopy(urpw->urpw_buf, new, osize); + mdb_free(urpw->urpw_buf, osize); + } + + urpw->urpw_buf = new; + urpw->urpw_max = nmax; } - return (WALK_ERR); + + urpw->urpw_buf[urpw->urpw_cnt++] = (void *)addr; + + return (WALK_NEXT); +} + +static int +umem_read_ptc(umem_cache_t *cp, + void ***buflistp, size_t *bufcntp, size_t *bufmaxp) +{ + umem_read_ptc_walk_t urpw; + char walk[60]; + int rval; + + if (!(cp->cache_flags & UMF_PTC)) + return (0); + + (void) snprintf(walk, sizeof (walk), "umem_ptc_%d", cp->cache_bufsize); + + urpw.urpw_buf = *buflistp; + urpw.urpw_cnt = *bufcntp; + urpw.urpw_max = *bufmaxp; + + if ((rval = mdb_walk(walk, + (mdb_walk_cb_t)umem_read_ptc_walk_buf, &urpw)) == -1) { + mdb_warn("couldn't walk %s", walk); + } + + *buflistp = urpw.urpw_buf; + *bufcntp = urpw.urpw_cnt; + *bufmaxp = urpw.urpw_max; + + return (rval); } static int @@ -1022,13 +1186,19 @@ umem_walk_init_common(mdb_walk_state_t *wsp, int type) /* * Read in the contents of the magazine layer */ - if (umem_read_magazines(cp, addr, &maglist, &magcnt, &magmax, - UM_SLEEP) == WALK_ERR) + if (umem_read_magazines(cp, addr, &maglist, &magcnt, &magmax) != 0) + goto out2; + + /* + * Read in the contents of the per-thread caches, if any + */ + if (umem_read_ptc(cp, &maglist, &magcnt, &magmax) != 0) goto out2; /* - * We have all of the buffers from the magazines; if we are walking - * allocated buffers, sort them so we can bsearch them later. + * We have all of the buffers from the magazines and from the + * per-thread cache (if any); if we are walking allocated buffers, + * sort them so we can bsearch them later. */ if (type & UM_ALLOCATED) qsort(maglist, magcnt, sizeof (void *), addrcmp); diff --git a/usr/src/cmd/mdb/intel/amd64/libumem/Makefile b/usr/src/cmd/mdb/intel/amd64/libumem/Makefile index a8352ff865..704ff65873 100644 --- a/usr/src/cmd/mdb/intel/amd64/libumem/Makefile +++ b/usr/src/cmd/mdb/intel/amd64/libumem/Makefile @@ -42,6 +42,7 @@ include ../../../../Makefile.cmd include ../../../../Makefile.cmd.64 CPPFLAGS += -I$(SRC)/lib/libumem/common +CPPFLAGS += -I$(SRC)/lib/libc/inc CPPFLAGS += -I$(MODSRCS_DIR) include ../../Makefile.amd64 diff --git a/usr/src/cmd/mdb/intel/ia32/libumem/Makefile b/usr/src/cmd/mdb/intel/ia32/libumem/Makefile index 2399f51d31..a1ab338f40 100644 --- a/usr/src/cmd/mdb/intel/ia32/libumem/Makefile +++ b/usr/src/cmd/mdb/intel/ia32/libumem/Makefile @@ -40,6 +40,7 @@ MODSRCS = \ include ../../../../Makefile.cmd +CPPFLAGS += -I$(SRC)/lib/libc/inc CPPFLAGS += -I$(SRC)/lib/libumem/common CPPFLAGS += -I$(MODSRCS_DIR) diff --git a/usr/src/cmd/mdb/sparc/v7/libumem/Makefile b/usr/src/cmd/mdb/sparc/v7/libumem/Makefile index 4553b15eba..906d05d5ea 100644 --- a/usr/src/cmd/mdb/sparc/v7/libumem/Makefile +++ b/usr/src/cmd/mdb/sparc/v7/libumem/Makefile @@ -41,6 +41,7 @@ MODSRCS = \ include ../../../../Makefile.cmd CPPFLAGS += -I$(SRC)/lib/libumem/common +CPPFLAGS += -I$(SRC)/lib/libc/inc CPPFLAGS += -I$(MODSRCS_DIR) include ../../Makefile.sparcv7 diff --git a/usr/src/cmd/mdb/sparc/v9/libumem/Makefile b/usr/src/cmd/mdb/sparc/v9/libumem/Makefile index 2cbeb25f5e..09ea0473c6 100644 --- a/usr/src/cmd/mdb/sparc/v9/libumem/Makefile +++ b/usr/src/cmd/mdb/sparc/v9/libumem/Makefile @@ -54,6 +54,7 @@ KMOD_SOURCES_DIFFERENT=$(POUND_SIGN) include ../../../../Makefile.cmd CPPFLAGS += -I$(SRC)/lib/libumem/common +CPPFLAGS += -I$(SRC)/lib/libc/inc CPPFLAGS += 
-I$(MODSRCS_DIR) include ../../../../Makefile.cmd.64 diff --git a/usr/src/lib/libc/amd64/Makefile b/usr/src/lib/libc/amd64/Makefile index 9ddd748eb6..873c2ded87 100644 --- a/usr/src/lib/libc/amd64/Makefile +++ b/usr/src/lib/libc/amd64/Makefile @@ -20,6 +20,7 @@ # # # Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2012, Joyent, Inc. All rights reserved. # # Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. # Copyright 2011 Nexenta Systems, Inc. All rights reserved. @@ -798,6 +799,7 @@ THREADSOBJS= \ assfail.o \ cancel.o \ door_calls.o \ + tmem.o \ pthr_attr.o \ pthr_barrier.o \ pthr_cond.o \ @@ -1119,6 +1121,7 @@ TIL= \ thread_pool.o \ thrp_unwind.o \ tls.o \ + tmem.o \ tsd.o $(TIL:%=pics/%) := CFLAGS64 += $(LIBCBASE)/threads/amd64.il diff --git a/usr/src/lib/libc/i386/Makefile.com b/usr/src/lib/libc/i386/Makefile.com index 31a7bc945f..b21f87a0d6 100644 --- a/usr/src/lib/libc/i386/Makefile.com +++ b/usr/src/lib/libc/i386/Makefile.com @@ -20,6 +20,7 @@ # # # Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2013, Joyent, Inc. All rights reserved. # Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. # # Copyright 2011 Nexenta Systems, Inc. All rights reserved. @@ -840,6 +841,7 @@ THREADSOBJS= \ assfail.o \ cancel.o \ door_calls.o \ + tmem.o \ pthr_attr.o \ pthr_barrier.o \ pthr_cond.o \ @@ -1177,6 +1179,7 @@ TIL= \ thread_pool.o \ tls.o \ tsd.o \ + tmem.o \ unwind.o THREADS_INLINES = $(LIBCBASE)/threads/i386.il diff --git a/usr/src/lib/libc/inc/thr_uberdata.h b/usr/src/lib/libc/inc/thr_uberdata.h index 42c08049b2..de0d4a6b05 100644 --- a/usr/src/lib/libc/inc/thr_uberdata.h +++ b/usr/src/lib/libc/inc/thr_uberdata.h @@ -22,6 +22,9 @@ /* * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ #ifndef _THR_UBERDATA_H #define _THR_UBERDATA_H @@ -488,6 +491,29 @@ typedef struct { #endif /* _SYSCALL32 */ /* + * As part of per-thread caching libumem (ptcumem), we add a small amount to the + * thread's uberdata to facilitate it. The tm_roots are the roots of linked + * lists which is used by libumem to chain together allocations. tm_size is used + * to track the total amount of data stored across those linked lists. For more + * information, see libumem's big theory statement. + */ +#define NTMEMBASE 16 + +typedef struct { + size_t tm_size; + void *tm_roots[NTMEMBASE]; +} tumem_t; + +#ifdef _SYSCALL32 +typedef struct { + uint32_t tm_size; + caddr32_t tm_roots[NTMEMBASE]; +} tumem32_t; +#endif + +typedef void (*tmem_func_t)(void *, int); + +/* * Maximum number of read locks allowed for one thread on one rwlock. * This could be as large as INT_MAX, but the SUSV3 test suite would * take an inordinately long time to complete. This is big enough. 
@@ -653,6 +679,7 @@ typedef struct ulwp { #if defined(sparc) void *ul_unwind_ret; /* used only by _ex_clnup_handler() */ #endif + tumem_t ul_tmem; /* used only by umem */ } ulwp_t; #define ul_cursig ul_cp.s.cursig /* deferred signal number */ @@ -1083,6 +1110,7 @@ typedef struct ulwp32 { #if defined(sparc) caddr32_t ul_unwind_ret; /* used only by _ex_clnup_handler() */ #endif + tumem32_t ul_tmem; /* used only by umem */ } ulwp32_t; #define REPLACEMENT_SIZE32 ((size_t)&((ulwp32_t *)NULL)->ul_sigmask) @@ -1205,6 +1233,7 @@ extern ulwp_t *find_lwp(thread_t); extern void finish_init(void); extern void update_sched(ulwp_t *); extern void queue_alloc(void); +extern void tmem_exit(void); extern void tsd_exit(void); extern void tsd_free(ulwp_t *); extern void tls_setup(void); diff --git a/usr/src/lib/libc/port/mapfile-vers b/usr/src/lib/libc/port/mapfile-vers index 1cdc95be16..1882a337d5 100644 --- a/usr/src/lib/libc/port/mapfile-vers +++ b/usr/src/lib/libc/port/mapfile-vers @@ -25,6 +25,7 @@ # Use is subject to license terms. # # Copyright (c) 2012 by Delphix. All rights reserved. +# Copyright (c) 2012, Joyent, Inc. All rights reserved. # Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. # Copyright (c) 2013 Gary Mills # @@ -2903,6 +2904,9 @@ $endif thr_wait_mutator; _thr_wait_mutator; __tls_get_addr; + _tmem_get_base; + _tmem_get_nentries; + _tmem_set_cleanup; tpool_create; tpool_dispatch; tpool_destroy; diff --git a/usr/src/lib/libc/port/threads/thr.c b/usr/src/lib/libc/port/threads/thr.c index ae55fbddf5..b5d848449d 100644 --- a/usr/src/lib/libc/port/threads/thr.c +++ b/usr/src/lib/libc/port/threads/thr.c @@ -22,6 +22,9 @@ /* * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ #include "lint.h" #include "thr_uberdata.h" @@ -771,6 +774,7 @@ _thrp_exit() } lmutex_unlock(&udp->link_lock); + tmem_exit(); /* deallocate tmem allocations */ tsd_exit(); /* deallocate thread-specific data */ tls_exit(); /* deallocate thread-local storage */ heldlock_exit(); /* deal with left-over held locks */ diff --git a/usr/src/lib/libc/port/threads/tmem.c b/usr/src/lib/libc/port/threads/tmem.c new file mode 100644 index 0000000000..00203de593 --- /dev/null +++ b/usr/src/lib/libc/port/threads/tmem.c @@ -0,0 +1,85 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#include "lint.h" +#include "thr_uberdata.h" + +/* + * This file implements the private interface with libumem for per-thread + * caching umem (ptcumem). 
For the full details on how tcumem works and how + * these functions work, see section 8.4 of the big theory statement in + * lib/libumem/common/umem.c. + */ +static tmem_func_t tmem_cleanup = NULL; + +uintptr_t +_tmem_get_base(void) +{ + return ((uintptr_t)&curthread->ul_tmem - (uintptr_t)curthread); +} + +int +_tmem_get_nentries(void) +{ + return (NTMEMBASE); +} + +void +_tmem_set_cleanup(tmem_func_t f) +{ + tmem_cleanup = f; +} + +/* + * This is called by _thrp_exit() to clean up any per-thread allocations that + * are still hanging around and haven't been cleaned up. + */ +void +tmem_exit(void) +{ + int ii; + void *buf, *next; + tumem_t *tp = &curthread->ul_tmem; + + + if (tp->tm_size == 0) + return; + + /* + * Since we have something stored here, we need to ensure we declared a + * clean up handler. If we haven't that's broken and our single private + * consumer should be shot. + */ + if (tmem_cleanup == NULL) + abort(); + for (ii = 0; ii < NTMEMBASE; ii++) { + buf = tp->tm_roots[ii]; + while (buf != NULL) { + next = *(void **)buf; + tmem_cleanup(buf, ii); + buf = next; + } + } +} diff --git a/usr/src/lib/libc/sparc/Makefile.com b/usr/src/lib/libc/sparc/Makefile.com index cc6bae0df4..25482d7324 100644 --- a/usr/src/lib/libc/sparc/Makefile.com +++ b/usr/src/lib/libc/sparc/Makefile.com @@ -20,6 +20,7 @@ # # # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2012, Joyent, Inc. All rights reserved. # Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. # # Copyright 2011 Nexenta Systems, Inc. All rights reserved. @@ -875,6 +876,7 @@ THREADSOBJS= \ assfail.o \ cancel.o \ door_calls.o \ + tmem.o \ pthr_attr.o \ pthr_barrier.o \ pthr_cond.o \ diff --git a/usr/src/lib/libc/sparcv9/Makefile.com b/usr/src/lib/libc/sparcv9/Makefile.com index 54b3258fe9..fe6844273e 100644 --- a/usr/src/lib/libc/sparcv9/Makefile.com +++ b/usr/src/lib/libc/sparcv9/Makefile.com @@ -20,6 +20,7 @@ # # # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2012, Joyent, Inc. All rights reserved. # Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. # # Copyright 2011 Nexenta Systems, Inc. All rights reserved. @@ -822,6 +823,7 @@ THREADSOBJS= \ assfail.o \ cancel.o \ door_calls.o \ + tmem.o \ pthr_attr.o \ pthr_barrier.o \ pthr_cond.o \ diff --git a/usr/src/lib/libumem/Makefile.com b/usr/src/lib/libumem/Makefile.com index 0e726c5646..61f7e9503d 100644 --- a/usr/src/lib/libumem/Makefile.com +++ b/usr/src/lib/libumem/Makefile.com @@ -22,6 +22,8 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # +# Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+# # # The build process for libumem is sightly different from that used by other @@ -65,10 +67,12 @@ SRCS_standalone = $(OBJECTS_standalone:%.o=../common/%.c) # Architecture-dependent files common to both versions of libumem OBJECTS_common_isadep = \ - asm_subr.o + asm_subr.o \ + umem_genasm.o SRCS_common_isadep = \ - $(ISASRCDIR)/asm_subr.s + $(ISASRCDIR)/asm_subr.s \ + $(ISASRCDIR)/umem_genasm.c # Architecture-independent files common to both versions of libumem OBJECTS_common_common = \ @@ -140,6 +144,7 @@ DTS_ERRNO= STAND_RENAMED_FUNCS= \ atomic_add_64 \ atomic_add_32_nv \ + atomic_swap_64 \ snprintf \ vsnprintf diff --git a/usr/src/lib/libumem/amd64/umem_genasm.c b/usr/src/lib/libumem/amd64/umem_genasm.c new file mode 100644 index 0000000000..00cc18ab67 --- /dev/null +++ b/usr/src/lib/libumem/amd64/umem_genasm.c @@ -0,0 +1,604 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013 Joyent, Inc. All rights reserved. + */ + +/* + * Don't Panic! If you find the blocks of assembly that follow confusing and + * you're questioning why they exist, please go read section 8 of the umem.c big + * theory statement. Next familiarize yourself with the malloc and free + * implementations in libumem's malloc.c. + * + * What follows is the amd64 implementation of the thread caching automatic + * assembly generation. The amd64 calling conventions are documented in the + * 64-bit System V ABI. For our purposes what matters is that our first argument + * will come in rdi. Our functions have to preserve rbp, rbx, and r12->r15. We + * are free to do whatever we want with rax, rcx, rdx, rsi, rdi, and r8->r11. + * + * For both our implementation of malloc and free we only use the registers we + * don't have to preserve. + * + * Malloc register usage: + * o. rdi: Original size to malloc. This never changes and is preserved. + * o. rsi: Adjusted malloc size for malloc_data_tag(s). + * o. rcx: Pointer to the tmem_t in the ulwp_t. + * o. rdx: Pointer to the tmem_t array of roots + * o. r8: Size of the cache + * o. r9: Scratch register + * + * Free register usage: + * o. rdi: Original buffer to free. This never changes and is preserved. + * o. rax: The actual buffer, adjusted for the hidden malloc_data_t(s). + * o. rcx: Pointer to the tmem_t in the ulwp_t. + * o. rdx: Pointer to the tmem_t array of roots + * o. r8: Size of the cache + * o. r9: Scratch register + * + * Once we determine what cache we are using, we increment %rdx to the + * appropriate offset and set %r8 with the size of the cache. This means that + * when we break out to the normal buffer allocation point %rdx contains the + * head of the linked list and %r8 is the amount that we have to adjust the + * thread's cached amount by. 
+ * + * Each block of assembly has psuedocode that describes its purpose. + */ + +#include <atomic.h> +#include <inttypes.h> +#include <sys/types.h> +#include <strings.h> +#include <umem_impl.h> +#include "umem_base.h" + +#include <stdio.h> + +const int umem_genasm_supported = 1; +static uintptr_t umem_genasm_mptr = (uintptr_t)&_malloc; +static size_t umem_genasm_msize = 576; +static uintptr_t umem_genasm_fptr = (uintptr_t)&_free; +static size_t umem_genasm_fsize = 576; +static uintptr_t umem_genasm_omptr = (uintptr_t)umem_malloc; +static uintptr_t umem_genasm_ofptr = (uintptr_t)umem_malloc_free; + +#define UMEM_GENASM_MAX64 (UINT32_MAX / sizeof (uintptr_t)) +#define PTC_JMPADDR(dest, src) (dest - (src + 4)) +#define PTC_ROOT_SIZE sizeof (uintptr_t) +#define MULTINOP 0x0000441f0f + +/* + * void *ptcmalloc(size_t orig_size); + * + * size_t size = orig_size + 8; + * if (size > UMEM_SECOND_ALIGN) + * size += 8; + * + * if (size < orig_size) + * goto tomalloc; ! This is overflow + * + * if (size > cache_max) + * goto tomalloc + * + * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset; + * void **roots = t->tm_roots; + */ +#define PTC_MALINIT_JOUT 0x13 +#define PTC_MALINIT_MCS 0x1a +#define PTC_MALINIT_JOV 0x20 +#define PTC_MALINIT_SOFF 0x30 +static const uint8_t malinit[] = { + 0x48, 0x8d, 0x77, 0x08, /* leaq 0x8(%rdi),%rsi */ + 0x48, 0x83, 0xfe, 0x10, /* cmpq $0x10, %rsi */ + 0x76, 0x04, /* jbe +0x4 */ + 0x48, 0x8d, 0x77, 0x10, /* leaq 0x10(%rdi),%rsi */ + 0x48, 0x39, 0xfe, /* cmpq %rdi,%rsi */ + 0x0f, 0x82, 0x00, 0x00, 0x00, 0x00, /* jb +errout */ + 0x48, 0x81, 0xfe, + 0x00, 0x00, 0x00, 0x00, /* cmpq sizeof ($CACHE), %rsi */ + 0x0f, 0x87, 0x00, 0x00, 0x00, 0x00, /* ja +errout */ + 0x64, 0x48, 0x8b, 0x0c, 0x25, + 0x00, 0x00, 0x00, 0x00, /* movq %fs:0x0,%rcx */ + 0x48, 0x81, 0xc1, + 0x00, 0x00, 0x00, 0x00, /* addq $SOFF, %rcx */ + 0x48, 0x8d, 0x51, 0x08, /* leaq 0x8(%rcx),%rdx */ +}; + +/* + * void ptcfree(void *buf); + * + * if (buf == NULL) + * return; + * + * malloc_data_t *tag = buf; + * tag--; + * int size = tag->malloc_size; + * int tagval = UMEM_MALLOC_DECODE(tag->malloc_tag, size); + * if (tagval == MALLOC_SECOND_MAGIC) { + * tag--; + * } else if (tagval != MALLOC_MAGIC) { + * goto tofree; + * } + * + * if (size > cache_max) + * goto tofree; + * + * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset; + * void **roots = t->tm_roots; + */ +#define PTC_FRINI_JDONE 0x05 +#define PTC_FRINI_JFREE 0x25 +#define PTC_FRINI_MCS 0x30 +#define PTC_FRINI_JOV 0x36 +#define PTC_FRINI_SOFF 0x46 +static const uint8_t freeinit[] = { + 0x48, 0x85, 0xff, /* testq %rdi,%rdi */ + 0x0f, 0x84, 0x00, 0x00, 0x00, 0x00, /* jmp $JDONE (done) */ + 0x8b, 0x77, 0xf8, /* movl -0x8(%rdi),%esi */ + 0x8b, 0x47, 0xfc, /* movl -0x4(%rdi),%eax */ + 0x01, 0xf0, /* addl %esi,%eax */ + 0x3d, 0x00, 0x70, 0xba, 0x16, /* cmpl $MALLOC_2_MAGIC, %eax */ + 0x75, 0x06, /* jne +0x6 (checkover) */ + 0x48, 0x8d, 0x47, 0xf0, /* leaq -0x10(%rdi),%eax */ + 0xeb, 0x0f, /* jmp +0xf (freebuf) */ + 0x3d, 0x00, 0xc0, 0x10, 0x3a, /* cmpl $MALLOC_MAGIC, %eax */ + 0x0f, 0x85, 0x00, 0x00, 0x00, 0x00, /* jmp +JFREE (goto torfree) */ + 0x48, 0x8d, 0x47, 0xf8, /* leaq -0x8(%rdi),%rax */ + 0x48, 0x81, 0xfe, + 0x00, 0x00, 0x00, 0x00, /* cmpq sizeof ($CACHE), %rsi */ + 0x0f, 0x87, 0x00, 0x00, 0x00, 0x00, /* ja +errout */ + 0x64, 0x48, 0x8b, 0x0c, 0x25, + 0x00, 0x00, 0x00, 0x00, /* movq %fs:0x0,%rcx */ + 0x48, 0x81, 0xc1, + 0x00, 0x00, 0x00, 0x00, /* addq $SOFF, %rcx */ + 0x48, 0x8d, 0x51, 0x08, /* leaq 0x8(%rcx),%rdx */ +}; + +/* + * if (size <= 
$CACHE_SIZE) { + * csize = $CACHE_SIZE; + * } else ... ! goto next cache + */ +#define PTC_INICACHE_CMP 0x03 +#define PTC_INICACHE_SIZE 0x0c +#define PTC_INICACHE_JMP 0x11 +static const uint8_t inicache[] = { + 0x48, 0x81, 0xfe, + 0x00, 0x00, 0x00, 0x00, /* cmpq sizeof ($CACHE), %rsi */ + 0x77, 0x0c, /* ja +0xc (next cache) */ + 0x49, 0xc7, 0xc0, + 0x00, 0x00, 0x00, 0x00, /* movq sizeof ($CACHE), %r8 */ + 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp $JMP (allocbuf) */ +}; + +/* + * if (size <= $CACHE_SIZE) { + * csize = $CACHE_SIZE; + * roots += $CACHE_NUM; + * } else ... ! goto next cache + */ +#define PTC_GENCACHE_CMP 0x03 +#define PTC_GENCACHE_SIZE 0x0c +#define PTC_GENCACHE_NUM 0x13 +#define PTC_GENCACHE_JMP 0x18 +static const uint8_t gencache[] = { + 0x48, 0x81, 0xfe, + 0x00, 0x00, 0x00, 0x00, /* cmpq sizeof ($CACHE), %rsi */ + 0x77, 0x14, /* ja +0xc (next cache) */ + 0x49, 0xc7, 0xc0, + 0x00, 0x00, 0x00, 0x00, /* movq sizeof ($CACHE), %r8 */ + 0x48, 0x81, 0xc2, + 0x00, 0x00, 0x00, 0x00, /* addq $8*ii, %rdx */ + 0xe9, 0x00, 0x00, 0x00, 0x00 /* jmp +$JMP (allocbuf ) */ +}; + +/* + * else if (size <= $CACHE_SIZE) { + * csize = $CACHE_SIZE; + * roots += $CACHE_NUM; + * } else { + * goto tofunc; ! goto tomalloc if ptcmalloc. + * } ! goto tofree if ptcfree. + */ +#define PTC_FINCACHE_CMP 0x03 +#define PTC_FINCACHE_JMP 0x08 +#define PTC_FINCACHE_SIZE 0x0c +#define PTC_FINCACHE_NUM 0x13 +static const uint8_t fincache[] = { + 0x48, 0x81, 0xfe, + 0x00, 0x00, 0x00, 0x00, /* cmpq sizeof ($CACHE), %rsi */ + 0x77, 0x00, /* ja +JMP (to real malloc) */ + 0x49, 0xc7, 0xc0, + 0x00, 0x00, 0x00, 0x00, /* movq sizeof ($CACHE), %r8 */ + 0x48, 0x81, 0xc2, + 0x00, 0x00, 0x00, 0x00, /* addq $8*ii, %rdx */ + +}; + +/* + * if (*root == NULL) + * goto tomalloc; + * + * malloc_data_t *ret = *root; + * *root = *(void **)ret; + * t->tm_size += csize; + * ret->malloc_size = size; + * + * if (size > UMEM_SECOND_ALIGN) { + * ret->malloc_data = UMEM_MALLOC_ENCODE(MALLOC_SECOND_MAGIC, size); + * ret += 2; + * } else { + * ret->malloc_data = UMEM_MALLOC_ENCODE(MALLOC_SECOND_MAGIC, size); + * ret += 1; + * } + * + * return ((void *)ret); + * tomalloc: + * return (malloc(orig_size)); + */ +#define PTC_MALFINI_ALLABEL 0x00 +#define PTC_MALFINI_JMLABEL 0x40 +#define PTC_MALFINI_JMADDR 0x41 +static const uint8_t malfini[] = { + 0x48, 0x8b, 0x02, /* movl (%rdx),%rax */ + 0x48, 0x85, 0xc0, /* testq %rax,%rax */ + 0x74, 0x38, /* je +0x38 (errout) */ + 0x4c, 0x8b, 0x08, /* movq (%rax),%r9 */ + 0x4c, 0x89, 0x0a, /* movq %r9,(%rdx) */ + 0x4c, 0x29, 0x01, /* subq %rsi,(%rcx) */ + 0x48, 0x83, 0xfe, 0x10, /* cmpq $0x10,%rsi */ + 0x76, 0x15, /* jbe +0x15 */ + 0x41, 0xb9, 0x00, 0x70, 0xba, 0x16, /* movl $MALLOC_MAGIC_2, %r9d */ + 0x89, 0x70, 0x08, /* movl %r9d,0x8(%rax) */ + 0x41, 0x29, 0xf1, /* subl %esi, %r9d */ + 0x44, 0x89, 0x48, 0x0c, /* movl %r9d, 0xc(%rax) */ + 0x48, 0x83, 0xc0, 0x10, /* addq $0x10, %rax */ + 0xc3, /* ret */ + 0x41, 0xb9, 0x00, 0xc0, 0x10, 0x3a, /* movl %MALLOC_MAGIC, %r9d */ + 0x89, 0x30, /* movl %esi,(%rax) */ + 0x41, 0x29, 0xf1, /* subl %esi,%r9d */ + 0x44, 0x89, 0x48, 0x04, /* movl %r9d,0x4(%rax) */ + 0x48, 0x83, 0xc0, 0x08, /* addq $0x8,%rax */ + 0xc3, /* ret */ + 0xe9, 0x00, 0x00, 0x00, 0x00 /* jmp $MALLOC */ +}; + +/* + * if (t->tm_size + csize > umem_ptc_size) + * goto tofree; + * + * t->tm_size += csize + * *(void **)tag = *root; + * *root = tag; + * return; + * tofree: + * free(buf); + * return; + */ +#define PTC_FRFINI_RBUFLABEL 0x00 +#define PTC_FRFINI_CACHEMAX 0x09 +#define PTC_FRFINI_DONELABEL 0x1b 
+#define PTC_FRFINI_JFLABEL 0x1c +#define PTC_FRFINI_JFADDR 0x1d +static const uint8_t freefini[] = { + 0x4c, 0x8b, 0x09, /* movq (%rcx),%r9 */ + 0x4d, 0x01, 0xc1, /* addq %r8, %r9 */ + 0x49, 0x81, 0xf9, + 0x00, 0x00, 0x00, 0x00, /* cmpl $THR_CACHE_MAX, %r9 */ + 0x77, 0x0d, /* jae +0xd (torfree) */ + 0x4c, 0x01, 0x01, /* addq %r8,(%rcx) */ + 0x4c, 0x8b, 0x0a, /* movq (%rdx),%r9 */ + 0x4c, 0x89, 0x08, /* movq %r9,(%rax) */ + 0x48, 0x89, 0x02, /* movq %rax,(%rdx) */ + 0xc3, /* ret */ + 0xe9, 0x00, 0x00, 0x00, 0x00 /* jmp free */ +}; + +/* + * Construct the initial part of malloc. off contains the offset from curthread + * to the root of the tmem structure. ep is the address of the label to error + * and jump to free. csize is the size of the largest umem_cache in ptcumem. + */ +static int +genasm_malinit(uint8_t *bp, uint32_t off, uint32_t ep, uint32_t csize) +{ + uint32_t addr; + + bcopy(malinit, bp, sizeof (malinit)); + addr = PTC_JMPADDR(ep, PTC_MALINIT_JOUT); + bcopy(&addr, bp + PTC_MALINIT_JOUT, sizeof (addr)); + bcopy(&csize, bp + PTC_MALINIT_MCS, sizeof (csize)); + addr = PTC_JMPADDR(ep, PTC_MALINIT_JOV); + bcopy(&addr, bp + PTC_MALINIT_JOV, sizeof (addr)); + bcopy(&off, bp + PTC_MALINIT_SOFF, sizeof (off)); + + return (sizeof (malinit)); +} + +static int +genasm_frinit(uint8_t *bp, uint32_t off, uint32_t dp, uint32_t ep, uint32_t mcs) +{ + uint32_t addr; + + bcopy(freeinit, bp, sizeof (freeinit)); + addr = PTC_JMPADDR(dp, PTC_FRINI_JDONE); + bcopy(&addr, bp + PTC_FRINI_JDONE, sizeof (addr)); + addr = PTC_JMPADDR(ep, PTC_FRINI_JFREE); + bcopy(&addr, bp + PTC_FRINI_JFREE, sizeof (addr)); + bcopy(&mcs, bp + PTC_FRINI_MCS, sizeof (mcs)); + addr = PTC_JMPADDR(ep, PTC_FRINI_JOV); + bcopy(&addr, bp + PTC_FRINI_JOV, sizeof (addr)); + bcopy(&off, bp + PTC_FRINI_SOFF, sizeof (off)); + return (sizeof (freeinit)); +} + + +/* + * Create the initial cache entry of the specified size. The value of ap tells + * us what the address of the label to try and allocate a buffer. This value is + * an offset from the current base to that value. 
+ */ +static int +genasm_firstcache(uint8_t *bp, uint32_t csize, uint32_t ap) +{ + uint32_t addr; + + bcopy(inicache, bp, sizeof (inicache)); + bcopy(&csize, bp + PTC_INICACHE_CMP, sizeof (csize)); + bcopy(&csize, bp + PTC_INICACHE_SIZE, sizeof (csize)); + addr = PTC_JMPADDR(ap, PTC_INICACHE_JMP); + ASSERT(addr != 0); + bcopy(&addr, bp + PTC_INICACHE_JMP, sizeof (addr)); + + return (sizeof (inicache)); +} + +static int +genasm_gencache(uint8_t *bp, int num, uint32_t csize, uint32_t ap) +{ + uint32_t addr; + uint32_t coff; + + ASSERT(UINT32_MAX / PTC_ROOT_SIZE > num); + ASSERT(num != 0); + bcopy(gencache, bp, sizeof (gencache)); + bcopy(&csize, bp + PTC_GENCACHE_CMP, sizeof (csize)); + bcopy(&csize, bp + PTC_GENCACHE_SIZE, sizeof (csize)); + coff = num * PTC_ROOT_SIZE; + bcopy(&coff, bp + PTC_GENCACHE_NUM, sizeof (coff)); + addr = PTC_JMPADDR(ap, PTC_GENCACHE_JMP); + bcopy(&addr, bp + PTC_GENCACHE_JMP, sizeof (addr)); + + return (sizeof (gencache)); +} + +static int +genasm_lastcache(uint8_t *bp, int num, uint32_t csize, uint32_t ep) +{ + uint8_t eap; + uint32_t coff; + + ASSERT(ep <= 0xff && ep > 7); + ASSERT(UINT32_MAX / PTC_ROOT_SIZE > num); + bcopy(fincache, bp, sizeof (fincache)); + bcopy(&csize, bp + PTC_FINCACHE_CMP, sizeof (csize)); + bcopy(&csize, bp + PTC_FINCACHE_SIZE, sizeof (csize)); + coff = num * PTC_ROOT_SIZE; + bcopy(&coff, bp + PTC_FINCACHE_NUM, sizeof (coff)); + eap = ep - PTC_FINCACHE_JMP - 1; + bcopy(&eap, bp + PTC_FINCACHE_JMP, sizeof (eap)); + + return (sizeof (fincache)); +} + +static int +genasm_malfini(uint8_t *bp, uintptr_t mptr) +{ + uint32_t addr; + + bcopy(malfini, bp, sizeof (malfini)); + addr = PTC_JMPADDR(mptr, ((uintptr_t)bp + PTC_MALFINI_JMADDR)); + bcopy(&addr, bp + PTC_MALFINI_JMADDR, sizeof (addr)); + + return (sizeof (malfini)); +} + +static int +genasm_frfini(uint8_t *bp, uint32_t maxthr, uintptr_t fptr) +{ + uint32_t addr; + + bcopy(freefini, bp, sizeof (freefini)); + bcopy(&maxthr, bp + PTC_FRFINI_CACHEMAX, sizeof (maxthr)); + addr = PTC_JMPADDR(fptr, ((uintptr_t)bp + PTC_FRFINI_JFADDR)); + bcopy(&addr, bp + PTC_FRFINI_JFADDR, sizeof (addr)); + + return (sizeof (freefini)); +} + +/* + * The malloc inline assembly is constructed as follows: + * + * o Malloc prologue assembly + * o Generic first-cache check + * o n Generic cache checks (where n = _tmem_get_entries() - 2) + * o Generic last-cache check + * o Malloc epilogue assembly + * + * Generally there are at least three caches. When there is only one cache we + * only use the generic last-cache. In the case where there are two caches, we + * just leave out the middle ones. 
+ */ +static int +genasm_malloc(void *base, size_t len, int nents, int *umem_alloc_sizes) +{ + int ii, off; + uint8_t *bp; + size_t total; + uint32_t allocoff, erroff; + + total = sizeof (malinit) + sizeof (malfini) + sizeof (fincache); + + if (nents >= 2) + total += sizeof (inicache) + sizeof (gencache) * (nents - 2); + + if (total > len) + return (1); + + erroff = total - sizeof (malfini) + PTC_MALFINI_JMLABEL; + allocoff = total - sizeof (malfini) + PTC_MALFINI_ALLABEL; + + bp = base; + + off = genasm_malinit(bp, umem_tmem_off, erroff, + umem_alloc_sizes[nents-1]); + bp += off; + allocoff -= off; + erroff -= off; + + if (nents > 1) { + off = genasm_firstcache(bp, umem_alloc_sizes[0], allocoff); + bp += off; + allocoff -= off; + erroff -= off; + } + + for (ii = 1; ii < nents - 1; ii++) { + off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], allocoff); + bp += off; + allocoff -= off; + erroff -= off; + } + + bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1], + erroff); + bp += genasm_malfini(bp, umem_genasm_omptr); + ASSERT(((uintptr_t)bp - total) == (uintptr_t)base); + + return (0); +} + +static int +genasm_free(void *base, size_t len, int nents, int *umem_alloc_sizes) +{ + uint8_t *bp; + int ii, off; + size_t total; + uint32_t rbufoff, retoff, erroff; + + /* Assume that nents has already been audited for us */ + total = sizeof (freeinit) + sizeof (freefini) + sizeof (fincache); + if (nents >= 2) + total += sizeof (inicache) + sizeof (gencache) * (nents - 2); + + if (total > len) + return (1); + + erroff = total - (sizeof (freefini) - PTC_FRFINI_JFLABEL); + rbufoff = total - (sizeof (freefini) - PTC_FRFINI_RBUFLABEL); + retoff = total - (sizeof (freefini) - PTC_FRFINI_DONELABEL); + + bp = base; + + off = genasm_frinit(bp, umem_tmem_off, retoff, erroff, + umem_alloc_sizes[nents - 1]); + bp += off; + erroff -= off; + rbufoff -= off; + + if (nents > 1) { + off = genasm_firstcache(bp, umem_alloc_sizes[0], rbufoff); + bp += off; + erroff -= off; + rbufoff -= off; + } + + for (ii = 1; ii < nents - 1; ii++) { + off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], rbufoff); + bp += off; + rbufoff -= off; + erroff -= off; + } + + bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1], + erroff); + bp += genasm_frfini(bp, umem_ptc_size, umem_genasm_ofptr); + ASSERT(((uintptr_t)bp - total) == (uintptr_t)base); + + return (0); +} + +/*ARGSUSED*/ +int +umem_genasm(int *cp, umem_cache_t **caches, int nc) +{ + int nents, i; + uint8_t *mptr; + uint8_t *fptr; + uint64_t v, *vptr; + + mptr = (void *)((uintptr_t)umem_genasm_mptr + 5); + fptr = (void *)((uintptr_t)umem_genasm_fptr + 5); + if (umem_genasm_mptr == 0 || umem_genasm_msize == 0 || + umem_genasm_fptr == 0 || umem_genasm_fsize == 0) + return (1); + + /* + * The total number of caches that we can service is the minimum of: + * o the amount supported by libc + * o the total number of umem caches + * o we use a single byte addl, so it's MAX_UINT32 / sizeof (uintptr_t) + * For 64-bit, this is MAX_UINT32 >> 3, a lot. 
+ */ + nents = _tmem_get_nentries(); + + if (UMEM_GENASM_MAX64 < nents) + nents = UMEM_GENASM_MAX64; + + if (nc < nents) + nents = nc; + + /* Based on our constraints, this is not an error */ + if (nents == 0 || umem_ptc_size == 0) + return (0); + + /* Take into account the jump */ + if (genasm_malloc(mptr, umem_genasm_msize, nents, cp) != 0) + return (1); + + if (genasm_free(fptr, umem_genasm_fsize, nents, cp) != 0) + return (1); + + + /* nop out the jump with a multibyte jump */ + vptr = (void *)umem_genasm_mptr; + v = MULTINOP; + v |= *vptr & (0xffffffULL << 40); + (void) atomic_swap_64(vptr, v); + vptr = (void *)umem_genasm_fptr; + v = MULTINOP; + v |= *vptr & (0xffffffULL << 40); + (void) atomic_swap_64(vptr, v); + + for (i = 0; i < nents; i++) + caches[i]->cache_flags |= UMF_PTC; + + return (0); +} diff --git a/usr/src/lib/libumem/common/envvar.c b/usr/src/lib/libumem/common/envvar.c index fc3d490a01..0c4d872814 100644 --- a/usr/src/lib/libumem/common/envvar.c +++ b/usr/src/lib/libumem/common/envvar.c @@ -22,7 +22,10 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2012 Joyent, Inc. All rights reserved. + */ + +/* + * Copyright (c) 2012 Joyent, Inc. All rights reserved. */ #include <ctype.h> @@ -151,7 +154,10 @@ static umem_env_item_t umem_options_items[] = { NULL, 0, NULL, &vmem_sbrk_pagesize }, #endif - + { "perthread_cache", "Evolving", ITEM_SIZE, + "Size (in bytes) of per-thread allocation cache", + NULL, 0, NULL, &umem_ptc_size + }, { NULL, "-- end of UMEM_OPTIONS --", ITEM_INVALID } }; diff --git a/usr/src/lib/libumem/common/linktest_stand.c b/usr/src/lib/libumem/common/linktest_stand.c index 8ae9fdbec8..dd8333828b 100644 --- a/usr/src/lib/libumem/common/linktest_stand.c +++ b/usr/src/lib/libumem/common/linktest_stand.c @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file is used to verify that the standalone's external dependencies * haven't changed in a way that'll break things that use it. @@ -34,6 +32,7 @@ void __umem_assert_failed(void) {} void atomic_add_64(void) {} void atomic_add_32_nv(void) {} +void atomic_swap_64(void) {} void dladdr1(void) {} void bcopy(void) {} void bzero(void) {} diff --git a/usr/src/lib/libumem/common/malloc.c b/usr/src/lib/libumem/common/malloc.c index 906f369d29..3d19e5b320 100644 --- a/usr/src/lib/libumem/common/malloc.c +++ b/usr/src/lib/libumem/common/malloc.c @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <unistd.h> #include <errno.h> #include <string.h> @@ -50,8 +48,17 @@ typedef struct malloc_data { uint32_t malloc_stat; /* = UMEM_MALLOC_ENCODE(state, malloc_size) */ } malloc_data_t; +/* + * Because we do not support ptcumem on non-x86 today, we have to create these + * weak aliases. + */ +#ifndef _x86 +#pragma weak malloc = umem_malloc +#pragma weak free = umem_malloc_free +#endif /* !_x86 */ + void * -malloc(size_t size_arg) +umem_malloc(size_t size_arg) { #ifdef _LP64 uint32_t high_size = 0; @@ -369,7 +376,7 @@ process_memalign: } void -free(void *buf) +umem_malloc_free(void *buf) { if (buf == NULL) return; diff --git a/usr/src/lib/libumem/common/mapfile-vers b/usr/src/lib/libumem/common/mapfile-vers index 102bd989f7..888a1570f2 100644 --- a/usr/src/lib/libumem/common/mapfile-vers +++ b/usr/src/lib/libumem/common/mapfile-vers @@ -20,6 +20,7 @@ # # # Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2012, Joyent, Inc. 
All rights reserved.
 #
 #
@@ -38,6 +39,17 @@
 $mapfile_version 2
 
+$if _x86
+LOAD_SEGMENT umem {
+	FLAGS = READ WRITE EXECUTE;
+	ASSIGN_SECTION {
+		IS_NAME = .text;
+		FILE_BASENAME = asm_subr.o
+	};
+};
+$endif
+
+
 SYMBOL_VERSION SUNW_1.1 {
     global:
 	calloc		{ FLAGS = NODIRECT };
diff --git a/usr/src/lib/libumem/common/stub_stand.c b/usr/src/lib/libumem/common/stub_stand.c
index 54635558c3..2c82364ef1 100644
--- a/usr/src/lib/libumem/common/stub_stand.c
+++ b/usr/src/lib/libumem/common/stub_stand.c
@@ -23,6 +23,9 @@
  * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
 
 /*
  * Stubs for the standalone to reduce the dependence on external libraries
@@ -125,3 +128,21 @@ issetugid(void)
 {
 	return (1);
 }
+
+int
+_tmem_get_nentries(void)
+{
+	return (0);
+}
+
+uintptr_t
+_tmem_get_base(void)
+{
+	return (0);
+}
+
+/*ARGSUSED*/
+void
+_tmem_set_cleanup(void (*f)(int, void *))
+{
+}
diff --git a/usr/src/lib/libumem/common/umem.c b/usr/src/lib/libumem/common/umem.c
index 9ee030dd47..00028e5f80 100644
--- a/usr/src/lib/libumem/common/umem.c
+++ b/usr/src/lib/libumem/common/umem.c
@@ -21,11 +21,14 @@
 
 /*
  * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Copyright 2012 Joyent, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
  * based on usr/src/uts/common/os/kmem.c r1.64 from 2001/12/18
  *
  * The slab allocator, as described in the following two papers:
@@ -43,7 +46,7 @@
  *
  * 1. Overview
  * -----------
- * umem is very close to kmem in implementation. There are four major
+ * umem is very close to kmem in implementation. There are seven major
  * areas of divergence:
  *
  *	* Initialization
@@ -56,6 +59,10 @@
  *
  *	* lock ordering
  *
+ *	* changing UMEM_MAXBUF
+ *
+ *	* Per-thread caching for malloc/free
+ *
  * 2. Initialization
  * -----------------
  * kmem is initialized early on in boot, and knows that no one will call
@@ -365,6 +372,232 @@
  *
  * The second place to update, which is not required, is the umem_alloc_sizes.
  * These determine the default cache sizes that we're going to support.
+ *
+ * 8. Per-thread caching for malloc/free
+ * -------------------------------------
+ *
+ * "Time is an illusion. Lunchtime doubly so." -- Douglas Adams
+ *
+ * Time may be an illusion, but CPU cycles aren't. While libumem is designed
+ * to be a highly scalable allocator, that scalability comes with a fixed
+ * cycle penalty even in the absence of contention: libumem must acquire (and
+ * release) a per-CPU lock for each allocation. When contention is low and
+ * malloc(3C) frequency is high, this overhead can dominate execution time.
+ * To alleviate this, we allow for per-thread caching, a lock-free means of
+ * caching recent deallocations on a per-thread basis for use in satisfying
+ * subsequent calls to malloc(3C).
+ *
+ * In addition to improving performance, we also want to:
+ *	* Minimize fragmentation
+ *	* Not add additional memory overhead (no larger malloc tags)
+ *
+ * In the ulwp_t of each thread there is a private data structure called a
+ * tmem_t that looks like:
+ *
+ *	typedef struct {
+ *		size_t	tm_size;
+ *		void	*tm_roots[NTMEMBASE];	(Currently 16)
+ *	} tmem_t;
+ *
+ * Each of the roots is treated as the head of a linked list. Each entry in
+ * the list can be thought of as a void ** which points to the next entry,
+ * until one of them points to NULL. If the head points to NULL, the list is
+ * empty.
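The list discipline just described can be sketched in a few lines of C. This is purely illustrative and not part of the commit (the helper names are hypothetical); it shows how each free buffer's first word doubles as the link to the next buffer, which is why the scheme adds no per-buffer memory overhead:

	static void
	tmem_push(void **root, void *buf)
	{
		*(void **)buf = *root;	/* buf's first word links to old head */
		*root = buf;		/* buf becomes the new head */
	}

	static void *
	tmem_pop(void **root)
	{
		void *buf = *root;

		if (buf != NULL)
			*root = *(void **)buf;	/* next entry becomes head */
		return (buf);
	}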
+ *
+ * Each head corresponds to a umem_cache. Currently there is a linear mapping
+ * where the first root corresponds to the first cache, second root to the
+ * second cache, etc. This works because every allocation that malloc makes
+ * to umem_alloc that can be satisfied by a umem_cache will actually return
+ * a number of bytes equal to the size of that cache. Because of this
+ * property and the one-to-one mapping between caches and roots, we can
+ * guarantee that every entry in a given root's list will be able to satisfy
+ * the same requests as the corresponding cache.
+ *
+ * The choice of sixteen roots is based on where we believe we get the
+ * biggest bang for our buck. The per-thread caches will cache allocations
+ * up to 256 bytes on ILP32 and up to 448 bytes on LP64. Generally,
+ * applications plan more carefully how they do larger allocations than
+ * smaller ones. Therefore sixteen roots is a reasonable compromise between
+ * the additional per-thread overhead and the likelihood that a program will
+ * benefit from it.
+ *
+ * The maximum amount of memory that can be cached in each thread is
+ * determined by the perthread_cache UMEM_OPTION. It corresponds to the
+ * umem_ptc_size value. The default value is currently 1 MB. Once
+ * umem_init() has finished, this cannot be tuned without directly modifying
+ * the instruction text. If, upon calling free(3C), the amount cached would
+ * exceed this maximum, the buffer is returned to the umem_cache instead of
+ * being held onto in the thread.
+ *
+ * When a thread calls malloc(3C) it first determines which umem_cache it
+ * would be serviced by. If the allocation is not covered by ptcumem, it
+ * goes to the normal malloc instead. Next, it checks whether the
+ * corresponding tm_roots list is empty. If it is empty, the memory is
+ * allocated from umem_alloc. If it is not empty, the head of the list is
+ * removed, the appropriate malloc tags are set, and that buffer is
+ * returned.
+ *
+ * When a thread calls free(3C) it first looks at the malloc tag; if the tag
+ * is invalid or the allocation exceeds the largest cache in ptcumem, the
+ * buffer is sent off to the original free() to handle and clean up
+ * appropriately. Next, it checks whether the allocation size is covered by
+ * one of the per-thread roots; if it isn't, the buffer is passed off to the
+ * original free() to be released. Finally, before inserting this buffer as
+ * the head, it checks whether adding this buffer would put the thread over
+ * its maximum cache size. If it would, the buffer is freed back to the
+ * umem_cache. Otherwise the thread's total cached amount is incremented and
+ * the buffer becomes the new head of the appropriate tm_roots entry.
+ *
+ * When a thread exits, all of the buffers that it has in its per-thread
+ * cache will be passed to umem_free() and returned to the appropriate
+ * umem_cache.
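The linear cache-to-root mapping is easy to make concrete. The sketch below mirrors the scan that umem_init_walkers() performs earlier in this diff over a zero-terminated copy of umem_alloc_sizes[]; the function name and table values are illustrative, not the commit's code:

	/* Map a cache's buffer size to its tm_roots[] slot; -1 if uncached. */
	static int
	tmem_root_index(const int *sizes, int bufsize)
	{
		int i;

		for (i = 0; sizes[i] != 0; i++) {
			if (sizes[i] == bufsize)
				return (i);	/* cache i uses tm_roots[i] */
		}
		return (-1);
	}

With a size table beginning { 8, 16, 32, ... }, a 32-byte cache would map to tm_roots[2]; an index at or beyond NTMEMBASE (16) cannot be backed per-thread, which is exactly the bound umem_init_walkers() checks.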
+ * Because the set of caches can only be determined once umem_init() has been
+ * called, and because we have the additional goals of minimizing
+ * fragmentation and metadata space overhead in the malloc tags, our hand is
+ * forced down a slightly different path: the one trod by fasttrap and
+ * trapstat.
+ *
+ * During umem_init() we dynamically construct new versions of malloc(3C) and
+ * free(3C) that use the known cache sizes, and then ensure that ptcmalloc
+ * and ptcfree replace malloc and free as entries in the PLT. If ptcmalloc
+ * and ptcfree cannot handle a request, they simply jump to the original
+ * libumem implementations.
+ *
+ * After creating all of the umem_caches, but before making them visible,
+ * umem_cache_init checks that umem_genasm_supported is non-zero. This value
+ * is set by each architecture in $ARCH/umem_genasm.c to indicate whether or
+ * not it supports this. If the value is zero, this process is skipped.
+ * Similarly, if the per-thread cache size has been tuned to zero via
+ * UMEM_OPTIONS, this is also skipped.
+ *
+ * In umem_genasm.c, each architecture implements a single function called
+ * umem_genasm() that is responsible for generating the appropriate versions
+ * of ptcmalloc() and ptcfree(), placing them in the appropriate memory
+ * location, and finally performing the switch from malloc() and free() to
+ * ptcmalloc() and ptcfree(). Once the change has been made, there is no way
+ * to switch back, short of restarting the program or modifying program text
+ * with mdb.
+ *
+ * 8.2 Modifying the Procedure Linkage Table (PLT)
+ * -----------------------------------------------
+ *
+ * The last piece of this puzzle is how we actually jam ptcmalloc() into the
+ * PLT. To handle this, we have defined two functions, _malloc and _free, and
+ * used a special mapfile directive to place them into a readable, writeable,
+ * and executable segment. Next we use a standard #pragma weak for malloc and
+ * free and direct them to those symbols. By default, the text of each of
+ * those symbols is a jump to the default malloc or free function, followed
+ * by nops that reserve space for the functions we will generate.
+ *
+ * When umem_genasm() is called, it goes through and generates new malloc()
+ * and free() functions in the text provided by _malloc and _free, just after
+ * the jump. Once both have been successfully generated, umem_genasm() nops
+ * over the original jump so that calls now fall through into the generated
+ * versions of these functions.
+ *
+ * 8.3 umem_genasm()
+ * -----------------
+ *
+ * umem_genasm() is currently implemented for i386 and amd64. This section
+ * describes the theory behind the construction. For the specific byte code
+ * to assembly instructions and niceish C and asm versions of ptcmalloc and
+ * ptcfree, see the individual umem_genasm.c files. The layout consists of
+ * the following sections:
+ *
+ *	o. function-specific prologue
+ *	o. function-generic cache-selecting elements
+ *	o. function-specific epilogue
+ *
+ * There are three different generic cache elements that exist:
+ *
+ *	o. the last or only cache
+ *	o. the intermediary caches, if there are more than two
+ *	o. the first one, if there is more than one cache
+ *
+ * The malloc and free prologues and epilogues mimic the necessary portions
+ * of libumem's malloc and free. This includes things like checking for size
+ * overflow and setting and verifying the malloc tags; a C-level sketch of
+ * the generated logic follows.
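+ *
+ * This sketch is hand-written for illustration only (it is not the generated
+ * code; tag_and_return() is a stand-in for the tag-setting epilogue, and
+ * tmem_pop() is the helper sketched in the introduction to section 8):
+ *
+ *	void *
+ *	ptcmalloc(size_t orig_size)
+ *	{
+ *		size_t size = orig_size + 8;	(room for the malloc tag)
+ *		if (size < orig_size)		(overflow: punt)
+ *			return (malloc(orig_size));
+ *		tmem_t *t = (tmem_t *)((uintptr_t)curthread() +
+ *		    umem_tmem_off);
+ *		for (int i = 0; i < ncaches; i++) {
+ *			if (size <= cache_size[i]) {
+ *				void *buf = tmem_pop(t, i, cache_size[i]);
+ *				if (buf == NULL)	(root empty: punt)
+ *					break;
+ *				return (tag_and_return(buf, size));
+ *			}
+ *		}
+ *		return (malloc(orig_size));
+ *	}
+ *
+ * ptcfree is symmetric: validate the malloc tag, find the matching root, and
+ * either push the buffer onto it or, if the thread is at its cache cap, hand
+ * the buffer to the original free().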
+ *
+ * It is an important constraint that these functions do not make use of the
+ * call instruction. The only jmp outside of the individual functions is to
+ * the original libumem malloc and free, respectively. Because doing things
+ * like setting errno or raising an internal umem error on improper malloc
+ * tags would require using calls into the PLT, whenever we encounter one of
+ * those cases we simply jump to the original malloc and free functions,
+ * reusing the same stack frame.
+ *
+ * Each of the above sections, the three caches, and the malloc and free
+ * prologue and epilogue are implemented as blocks of machine code with the
+ * corresponding assembly in comments. There are known offsets into each
+ * block that correspond to the locations of data and addresses that we only
+ * know at run time. These blocks are copied as necessary and the blanks
+ * filled in appropriately.
+ *
+ * As mentioned in section 8.2, the trampoline code uses specifically named
+ * variables to communicate the buffers and sizes to use. These variables
+ * are:
+ *
+ *	o. umem_genasm_mptr: The buffer for ptcmalloc
+ *	o. umem_genasm_msize: The size in bytes of the above buffer
+ *	o. umem_genasm_fptr: The buffer for ptcfree
+ *	o. umem_genasm_fsize: The size in bytes of the above buffer
+ *
+ * Finally, to enable the generated assembly we need to remove the previous
+ * jump to the actual malloc that exists at the start of these buffers. On
+ * x86, this is a five byte region. We could zero out the jump offset so that
+ * it becomes a jmp +0, but a single five-byte nop is faster, so that is what
+ * we use on x86. When porting ptcumem to other architectures, the various
+ * opcode changes and options should be analyzed.
+ *
+ * 8.4 Interface with libc.so
+ * --------------------------
+ *
+ * The tmem_t structure, described at the beginning of section 8, is part of
+ * a private interface with libc. Three functions exist to cover this. They
+ * are not documented in man pages or header files; they are in the
+ * SUNWprivate part of libc's mapfile.
+ *
+ *	o. _tmem_get_base(void)
+ *
+ *	Returns the offset from the ulwp_t (curthread) to the tmem_t
+ *	structure. This is a constant for all threads and is effectively a
+ *	way to do ::offsetof ulwp_t ul_tmem without having to know the
+ *	specifics of the structure outside of libc.
+ *
+ *	o. _tmem_get_nentries(void)
+ *
+ *	Returns the number of roots that exist in the tmem_t. This is one
+ *	part of the cap on the number of umem_caches that we can back with
+ *	tmem.
+ *
+ *	o. _tmem_set_cleanup(void (*)(void *, int))
+ *
+ *	This sets a cleanup handler that gets called back when a thread
+ *	exits. There is one call per buffer: the void * is a pointer to the
+ *	buffer on the list, and the int is the index into the roots array for
+ *	this buffer.
+ *
+ * 8.5 Tuning and disabling per-thread caching
+ * -------------------------------------------
+ *
+ * There is only one tunable for per-thread caching: the amount of memory
+ * each thread should be able to cache. This is specified via the
+ * perthread_cache UMEM_OPTION. No attempt is made to sanity check the
+ * specified value; the limit is simply the maximum value of a size_t.
+ *
+ * If the perthread_cache UMEM_OPTION is set to zero, nomagazines was
+ * requested, or UMEM_DEBUG has been turned on, we will never call into
+ * umem_genasm; however, the trampoline and its jump will still be in place.
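+ *
+ * For example (sizes here are purely illustrative), the cap can be raised,
+ * or per-thread caching disabled outright, for a single invocation:
+ *
+ *	UMEM_OPTIONS=perthread_cache=16m ./app	(raise the cap to 16 MB)
+ *	UMEM_OPTIONS=perthread_cache=0 ./app	(disable ptcumem entirely)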
+ *
+ * 8.6 Observing efficacy of per-thread caching
+ * --------------------------------------------
+ *
+ * To understand the efficacy of per-thread caching, use the ::umastat dcmd
+ * to see the percentage of capacity consumed on a per-thread basis, the
+ * degree to which each umem cache contributes to per-thread cache
+ * consumption, and the number of buffers in per-thread caches on a per-umem
+ * cache basis. If more detail is required, the specific buffers in a
+ * per-thread cache can be iterated over with the umem_ptc_* walkers. (These
+ * walkers allow an optional ulwp_t to be specified to iterate only over a
+ * particular thread's cache.)
 */
 
 #include <umem_impl.h>
@@ -473,8 +706,10 @@
 size_t umem_lite_minsize = 0;	/* minimum buffer size for UMF_LITE */
 size_t umem_lite_maxalign = 1024; /* maximum buffer alignment for UMF_LITE */
 size_t umem_maxverify;	/* maximum bytes to inspect in debug routines */
 size_t umem_minfirewall;	/* hardware-enforced redzone threshold */
+size_t umem_ptc_size = 1048576;	/* size of per-thread cache (in bytes) */
 
 uint_t umem_flags = 0;
+uintptr_t umem_tmem_off;
 
 mutex_t		umem_init_lock;		/* locks initialization */
 cond_t		umem_init_cv;		/* initialization CV */
@@ -482,6 +717,8 @@ thread_t	umem_init_thr;		/* thread initializing */
 int		umem_init_env_ready;	/* environ pre-initted */
 int		umem_ready = UMEM_READY_STARTUP;
 
+int		umem_ptc_enabled;	/* per-thread caching enabled */
+
 static umem_nofail_callback_t *nofail_callback;
 static mutex_t	umem_nofail_exit_lock;
 static thread_t	umem_nofail_exit_thr;
@@ -2838,6 +3075,24 @@ umem_alloc_sizes_remove(size_t size)
 	umem_alloc_sizes[i] = 0;
 }
 
+/*
+ * We've been called back from libc to indicate that a thread is terminating
+ * and that it needs to release the per-thread memory that it holds. We are
+ * told which entry in the thread's tmem array the allocation came from.
+ * Currently this refers to the first n umem_caches, which makes this a
+ * pretty simple indexing job.
+ */
+static void
+umem_cache_tmem_cleanup(void *buf, int entry)
+{
+	size_t size;
+	umem_cache_t *cp;
+
+	size = umem_alloc_sizes[entry];
+	cp = umem_alloc_table[(size - 1) >> UMEM_ALIGN_SHIFT];
+	_umem_cache_free(cp, buf);
+}
+
 static int
 umem_cache_init(void)
 {
@@ -2953,6 +3208,16 @@ umem_cache_init(void)
 		umem_alloc_caches[i] = cp;
 	}
 
+	umem_tmem_off = _tmem_get_base();
+	_tmem_set_cleanup(umem_cache_tmem_cleanup);
+
+	if (umem_genasm_supported && !(umem_flags & UMF_DEBUG) &&
+	    !(umem_flags & UMF_NOMAGAZINE) &&
+	    umem_ptc_size > 0) {
+		umem_ptc_enabled = umem_genasm(umem_alloc_sizes,
+		    umem_alloc_caches, i) == 0 ? 1 : 0;
+	}
+
 	/*
 	 * Initialization cannot fail at this point. Make the caches
 	 * visible to umem_alloc() and friends.
diff --git a/usr/src/lib/libumem/common/umem_base.h b/usr/src/lib/libumem/common/umem_base.h
index e78bebfb58..c845331fbc 100644
--- a/usr/src/lib/libumem/common/umem_base.h
+++ b/usr/src/lib/libumem/common/umem_base.h
@@ -22,12 +22,13 @@
  * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
 
 #ifndef _UMEM_BASE_H
 #define	_UMEM_BASE_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <umem_impl.h>
 
 #ifdef	__cplusplus
@@ -75,6 +76,8 @@ extern volatile uint32_t umem_reaping;
 #define	UMEM_REAP_ADDING	0x00000001	/* umem_reap() is active */
 #define	UMEM_REAP_ACTIVE	0x00000002	/* update thread is reaping */
 
+extern uintptr_t umem_tmem_off;
+
 /*
  * umem.c: tunables
  */
@@ -97,6 +100,7 @@ extern size_t umem_lite_minsize;
 extern size_t umem_lite_maxalign;
 extern size_t umem_maxverify;
 extern size_t umem_minfirewall;
+extern size_t umem_ptc_size;
 
 extern uint32_t umem_flags;
 
@@ -139,6 +143,20 @@ extern int umem_create_update_thread(void);
 void umem_setup_envvars(int);
 void umem_process_envvars(void);
 
+/*
+ * umem_genasm.c: private interfaces
+ */
+extern const int umem_genasm_supported;
+extern int umem_genasm(int *, umem_cache_t **, int);
+
+/*
+ * malloc.c: traditional malloc/free interface for genasm
+ */
+extern void *umem_malloc(size_t);
+extern void umem_malloc_free(void *);
+extern void *_malloc(size_t);
+extern void _free(void *);
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/usr/src/lib/libumem/common/umem_impl.h b/usr/src/lib/libumem/common/umem_impl.h
index 84313c32ed..f63246e166 100644
--- a/usr/src/lib/libumem/common/umem_impl.h
+++ b/usr/src/lib/libumem/common/umem_impl.h
@@ -21,10 +21,13 @@
  */
 /*
  * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Copyright 2012 Joyent, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
+ */
+
 #ifndef _UMEM_IMPL_H
 #define	_UMEM_IMPL_H
 
@@ -63,6 +66,7 @@ extern "C" {
 
 #define	UMF_HASH	0x00000200	/* cache has hash table */
 #define	UMF_RANDOMIZE	0x00000400	/* randomize other umem_flags */
+#define	UMF_PTC		0x00000800	/* cache has per-thread caching */
 
 #define	UMF_BUFTAG	(UMF_DEADBEEF | UMF_REDZONE)
 #define	UMF_TOUCH	(UMF_BUFTAG | UMF_LITE | UMF_CONTENTS)
@@ -395,6 +399,13 @@ extern void umem_startup(caddr_t, size_t, size_t, caddr_t, caddr_t);
 extern int umem_add(caddr_t, size_t);
 #endif
 
+/*
+ * Private interface with libc for ptcumem.
+ */
+extern uintptr_t _tmem_get_base(void);
+extern int _tmem_get_nentries(void);
+extern void _tmem_set_cleanup(void (*)(void *, int));
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/usr/src/lib/libumem/i386/asm_subr.s b/usr/src/lib/libumem/i386/asm_subr.s
index 2edb2b49b5..5ad5345c6d 100644
--- a/usr/src/lib/libumem/i386/asm_subr.s
+++ b/usr/src/lib/libumem/i386/asm_subr.s
@@ -24,10 +24,32 @@
  * Use is subject to license terms.
 */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/asm_linkage.h>
 
+#define	NOP4	\
+	nop;	\
+	nop;	\
+	nop;	\
+	nop;
+
+#define	NOP16	\
+	NOP4	\
+	NOP4	\
+	NOP4	\
+	NOP4
+
+#define	NOP64	\
+	NOP16	\
+	NOP16	\
+	NOP16	\
+	NOP16
+
+#define	NOP256	\
+	NOP64	\
+	NOP64	\
+	NOP64	\
+	NOP64
+
 #if defined(lint)
 
 void *
@@ -69,4 +91,25 @@ _breakpoint(void)
 	SET_SIZE(_breakpoint)
 
 #endif
+	ENTRY(_malloc)
+	jmp	umem_malloc;
+	NOP256
+	NOP256
+#if defined(__amd64)
+	NOP64
+#endif
+	SET_SIZE(_malloc)
+
+	ENTRY(_free)
+	jmp	umem_malloc_free;
+	NOP256
+	NOP256
+#if defined(__amd64)
+	NOP64
+#endif
+	SET_SIZE(_free)
+
+	ANSI_PRAGMA_WEAK2(malloc,_malloc,function)
+	ANSI_PRAGMA_WEAK2(free,_free,function)
+
 #endif /* lint */
diff --git a/usr/src/lib/libumem/i386/umem_genasm.c b/usr/src/lib/libumem/i386/umem_genasm.c
new file mode 100644
index 0000000000..530a83e486
--- /dev/null
+++ b/usr/src/lib/libumem/i386/umem_genasm.c
@@ -0,0 +1,595 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Don't Panic! If you find the blocks of assembly that follow confusing and
+ * you're questioning why they exist, please go read section 8 of the umem.c
+ * big theory statement. Next familiarize yourself with the malloc and free
+ * implementations in libumem's malloc.c.
+ *
+ * What follows is the i386 implementation of the thread caching automatic
+ * assembly generation. On i386, a function has only three registers it is
+ * allowed to change without restoring them: eax, ecx, and edx. All others
+ * have to be preserved. Since the set of registers we have available is so
+ * small, we have to make use of esi, ebx, and edi and save their original
+ * values to the stack.
+ *
+ * Malloc register usage:
+ *	o. esi: Size of the malloc (passed into us and modified)
+ *	o. edi: Size of the cache
+ *	o. eax: Buffer to return
+ *	o. ebx: Scratch space and temporary values
+ *	o. ecx: Pointer to the tmem_t in the ulwp_t.
+ *	o. edx: Pointer to the tmem_t's array of roots
+ *
+ * Free register usage:
+ *	o. esi: Size of the allocation (loaded from the buffer's malloc tag)
+ *	o. edi: Size of the cache
+ *	o. eax: Buffer to free
+ *	o. ebx: Scratch space and temporary values
+ *	o. ecx: Pointer to the tmem_t in the ulwp_t.
+ *	o. edx: Pointer to the tmem_t's array of roots
+ *
+ * Once we determine which cache we are using, we increment %edx to the
+ * appropriate offset and set %edi with the size of the cache. This means
+ * that when we break out to the normal buffer allocation point, %edx points
+ * to the head of the linked list and %edi is the amount by which we have to
+ * adjust the total amount cached by the thread.
+ *
+ * Each block of assembly has pseudocode that describes its purpose.
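+ *
+ * For illustration only, with two hypothetical caches of 16 and 32 bytes
+ * (the real blocks below get their immediates and jump offsets filled in at
+ * run time), the cache-selection cascade amounts to:
+ *
+ *	cmpl	$16, %esi		(size <= 16?)
+ *	ja	1f			(no: try the next cache)
+ *	movl	$16, %edi		(csize = 16; roots += 0)
+ *	jmp	allocbuf
+ * 1:	cmpl	$32, %esi		(size <= 32?)
+ *	ja	errout			(no: fall back to the original code)
+ *	addl	$4, %edx		(roots += 1)
+ *	movl	$32, %edi		(csize = 32; fall into allocbuf)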
+ */
+
+#include <inttypes.h>
+#include <strings.h>
+#include <umem_impl.h>
+#include "umem_base.h"
+
+#include <atomic.h>
+
+const int umem_genasm_supported = 1;
+static uintptr_t umem_genasm_mptr = (uintptr_t)&_malloc;
+static size_t umem_genasm_msize = 512;
+static uintptr_t umem_genasm_fptr = (uintptr_t)&_free;
+static size_t umem_genasm_fsize = 512;
+static uintptr_t umem_genasm_omptr = (uintptr_t)umem_malloc;
+static uintptr_t umem_genasm_ofptr = (uintptr_t)umem_malloc_free;
+/*
+ * The maximum number of caches we can support. We use a single byte addl, so
+ * this is 255 (UINT8_MAX) / sizeof (uintptr_t). In this case, 63.
+ */
+#define	UMEM_GENASM_MAX32	63
+
+#define	PTC_JMPADDR(dest, src)	(dest - (src + 4))
+#define	PTC_ROOT_SIZE	sizeof (uintptr_t)
+#define	MULTINOP	0x0000441f0f
+
+/*
+ * void *ptcmalloc(size_t orig_size);
+ *
+ * size_t size = orig_size + 8;
+ *
+ * if (size < orig_size)
+ *	goto tomalloc;		! This is overflow
+ *
+ * if (size > cache_size)
+ *	goto tomalloc;
+ *
+ * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset;
+ * void **roots = t->tm_roots;
+ */
+#define	PTC_MALINIT_JOUT	0x0e
+#define	PTC_MALINIT_MCS	0x14
+#define	PTC_MALINIT_JOV	0x1a
+#define	PTC_MALINIT_SOFF	0x27
+static const uint8_t malinit[] = {
+	0x55,					/* pushl %ebp */
+	0x89, 0xe5,				/* movl %esp, %ebp */
+	0x57,					/* pushl %edi */
+	0x56,					/* pushl %esi */
+	0x53,					/* pushl %ebx */
+	0x8b, 0x75, 0x08,			/* movl 0x8(%ebp), %esi */
+	0x83, 0xc6, 0x08,			/* addl $0x8,%esi */
+	0x0f, 0x82, 0x00, 0x00, 0x00, 0x00,	/* jc +$JMP (errout) */
+	0x81, 0xfe, 0x00, 0x00, 0x00, 0x00,	/* cmpl sizeof ($C0), %esi */
+	0x0f, 0x87, 0x00, 0x00, 0x00, 0x00,	/* ja +$JMP (errout) */
+	0x65, 0x8b, 0x0d, 0x00, 0x00, 0x00, 0x00, /* movl %gs:0x0,%ecx */
+	0x81, 0xc1, 0x00, 0x00, 0x00, 0x00,	/* addl $OFF, %ecx */
+	0x8d, 0x51, 0x04			/* leal 0x4(%ecx), %edx */
+};
+
+/*
+ * void ptcfree(void *buf);
+ *
+ * if (buf == NULL)
+ *	return;
+ *
+ * malloc_data_t *tag = buf;
+ * tag--;
+ * int size = tag->malloc_size;
+ * int tagval = UMEM_MALLOC_DECODE(tag->malloc_tag, size);
+ *
+ * if (tagval != MALLOC_MAGIC)
+ *	goto tofree;
+ *
+ * if (size > cache_max)
+ *	goto tofree;
+ *
+ * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset;
+ * void **roots = t->tm_roots;
+ */
+#define	PTC_FRINI_JDONE	0x0d
+#define	PTC_FRINI_JFREE	0x23
+#define	PTC_FRINI_MCS	0x29
+#define	PTC_FRINI_JOV	0x2f
+#define	PTC_FRINI_SOFF	0x3c
+static const uint8_t freeinit[] = {
+	0x55,					/* pushl %ebp */
+	0x89, 0xe5,				/* movl %esp, %ebp */
+	0x57,					/* pushl %edi */
+	0x56,					/* pushl %esi */
+	0x53,					/* pushl %ebx */
+	0x8b, 0x45, 0x08,			/* movl 0x8(%ebp), %eax */
+	0x85, 0xc0,				/* testl %eax, %eax */
+	0x0f, 0x84, 0x00, 0x00, 0x00, 0x00,	/* je $JDONE (done) */
+	0x83, 0xe8, 0x08,			/* subl $0x8,%eax */
+	0x8b, 0x30,				/* movl (%eax),%esi */
+	0x8b, 0x50, 0x04,			/* movl 0x4(%eax),%edx */
+	0x01, 0xf2,				/* addl %esi,%edx */
+	0x81, 0xfa, 0x00, 0xc0, 0x10, 0x3a,	/* cmpl MAGIC32, %edx */
+	0x0f, 0x85, 0x00, 0x00, 0x00, 0x00,	/* jne +JFREE (goto freebuf) */
+
+	0x81, 0xfe, 0x00, 0x00, 0x00, 0x00,	/* cmpl sizeof ($C0), %esi */
+	0x0f, 0x87, 0x00, 0x00, 0x00, 0x00,	/* ja +$JMP (errout) */
+	0x65, 0x8b, 0x0d, 0x00, 0x00, 0x00, 0x00, /* movl %gs:0x0,%ecx */
+	0x81, 0xc1, 0x00, 0x00, 0x00, 0x00,	/* addl $OFF, %ecx */
+	0x8d, 0x51, 0x04			/* leal 0x4(%ecx),%edx */
+};
+
+/*
+ * if (size <= $CACHE_SIZE) {
+ *	csize = $CACHE_SIZE;
+ * } else ...			! goto next cache
+ */
+#define	PTC_INICACHE_CMP	0x02
+#define	PTC_INICACHE_SIZE	0x09
+#define	PTC_INICACHE_JMP	0x0e
+static const uint8_t inicache[] = {
+	0x81, 0xfe, 0xff, 0x00, 0x00, 0x00,	/* cmpl sizeof ($C0), %esi */
+	0x77, 0x0a,				/* ja +0xa */
+	0xbf, 0xff, 0x00, 0x00, 0x00,		/* movl sizeof ($C0), %edi */
+	0xe9, 0x00, 0x00, 0x00, 0x00		/* jmp +$JMP (allocbuf) */
+};
+
+/*
+ * if (size <= $CACHE_SIZE) {
+ *	csize = $CACHE_SIZE;
+ *	roots += $CACHE_NUM;
+ * } else ...			! goto next cache
+ */
+#define	PTC_GENCACHE_CMP	0x02
+#define	PTC_GENCACHE_NUM	0x0a
+#define	PTC_GENCACHE_SIZE	0x0c
+#define	PTC_GENCACHE_JMP	0x11
+static const uint8_t gencache[] = {
+	0x81, 0xfe, 0x00, 0x00, 0x00, 0x00,	/* cmpl sizeof ($CACHE), %esi */
+	0x77, 0x0d,				/* ja +0xd (next cache) */
+	0x83, 0xc2, 0x00,			/* addl $4*$ii, %edx */
+	0xbf, 0x00, 0x00, 0x00, 0x00,		/* movl sizeof ($CACHE), %edi */
+	0xe9, 0x00, 0x00, 0x00, 0x00		/* jmp +$JMP (allocbuf) */
+};
+
+/*
+ * else if (size <= $CACHE_SIZE) {
+ *	csize = $CACHE_SIZE;
+ *	roots += $CACHE_NUM;
+ * } else {
+ *	goto tofunc;		! goto tomalloc if ptcmalloc.
+ * }				! goto tofree if ptcfree.
+ */
+#define	PTC_FINCACHE_CMP	0x02
+#define	PTC_FINCACHE_JMP	0x07
+#define	PTC_FINCACHE_NUM	0x0a
+#define	PTC_FINCACHE_SIZE	0x0c
+static const uint8_t fincache[] = {
+	0x81, 0xfe, 0xff, 0x00, 0x00, 0x00,	/* cmpl sizeof ($CLAST), %esi */
+	0x77, 0x00,				/* ja +$JMP (to errout) */
+	0x83, 0xc2, 0x00,			/* addl $4*($NCACHES-1), %edx */
+	0xbf, 0x00, 0x00, 0x00, 0x00,		/* movl sizeof ($CLAST), %edi */
+};
+
+/*
+ * if (*root == NULL)
+ *	goto tomalloc;
+ *
+ * malloc_data_t *ret = *root;
+ * *root = *(void **)ret;
+ * t->tm_size -= csize;
+ * ret->malloc_size = size;
+ *
+ * ret->malloc_data = UMEM_MALLOC_ENCODE(MALLOC_SECOND_MAGIC, size);
+ * ret++;
+ *
+ * return ((void *)ret);
+ * tomalloc:
+ *	return (malloc(orig_size));
+ */
+#define	PTC_MALFINI_ALLABEL	0x00
+#define	PTC_MALFINI_JMLABEL	0x20
+#define	PTC_MALFINI_JMADDR	0x25
+static const uint8_t malfini[] = {
+	/* allocbuf: */
+	0x8b, 0x02,				/* movl (%edx), %eax */
+	0x85, 0xc0,				/* testl %eax, %eax */
+	0x74, 0x1a,				/* je +0x1a (errout) */
+	0x8b, 0x18,				/* movl (%eax), %ebx */
+	0x89, 0x1a,				/* movl %ebx, (%edx) */
+	0x29, 0x39,				/* subl %edi, (%ecx) */
+	0x89, 0x30,				/* movl %esi, (%eax) */
+	0xba, 0x00, 0xc0, 0x10, 0x3a,		/* movl $0x3a10c000,%edx */
+	0x29, 0xf2,				/* subl %esi, %edx */
+	0x89, 0x50, 0x04,			/* movl %edx, 0x4(%eax) */
+	0x83, 0xc0, 0x08,			/* addl $0x8, %eax */
+	0x5b,					/* popl %ebx */
+	0x5e,					/* popl %esi */
+	0x5f,					/* popl %edi */
+	0xc9,					/* leave */
+	0xc3,					/* ret */
+	/* errout: */
+	0x5b,					/* popl %ebx */
+	0x5e,					/* popl %esi */
+	0x5f,					/* popl %edi */
+	0xc9,					/* leave */
+	0xe9, 0x00, 0x00, 0x00, 0x00		/* jmp $malloc */
+};
+
+/*
+ * if (t->tm_size + csize > umem_ptc_size)
+ *	goto tofree;
+ *
+ * t->tm_size += csize;
+ * *(void **)tag = *root;
+ * *root = tag;
+ * return;
+ * tofree:
+ *	free(buf);
+ *	return;
+ */
+#define	PTC_FRFINI_RBUFLABEL	0x00
+#define	PTC_FRFINI_CACHEMAX	0x06
+#define	PTC_FRFINI_DONELABEL	0x14
+#define	PTC_FRFINI_JFLABEL	0x19
+#define	PTC_FRFINI_JFADDR	0x1e
+static const uint8_t freefini[] = {
+	/* freebuf: */
+	0x8b, 0x19,				/* movl (%ecx),%ebx */
+	0x01, 0xfb,				/* addl %edi,%ebx */
+	0x81, 0xfb, 0x00, 0x00, 0x00, 0x00,	/* cmpl maxsize, %ebx */
+	0x73, 0x0d,				/* jae +0xd <tofree> */
+	0x01, 0x39,				/* addl %edi,(%ecx) */
+	0x8b, 0x3a,				/* movl (%edx),%edi */
+	0x89, 0x38,				/* movl %edi,(%eax) */
+	0x89, 0x02,				/* movl %eax,(%edx) */
+	/* done: */
+	0x5b,					/* popl %ebx */
+	0x5e,					/* popl %esi */
+	0x5f,					/* popl %edi */
+	0xc9,					/* leave */
+	0xc3,					/* ret */
+	/* realfree: */
+	0x5b,					/* popl %ebx */
+	0x5e,					/* popl %esi */
+	0x5f,					/* popl %edi */
+	0xc9,					/* leave */
+	0xe9, 0x00, 0x00, 0x00, 0x00		/* jmp free */
+};
+
+/*
+ * Construct the initial part of malloc. off contains the offset from
+ * curthread to the root of the tmem structure. ep is the offset of the error
+ * label, which bails out to the original malloc. csize is the size of the
+ * largest umem_cache in ptcumem.
+ */
+static int
+genasm_malinit(uint8_t *bp, uint32_t off, uint32_t ep, uint32_t csize)
+{
+	uint32_t addr;
+
+	bcopy(malinit, bp, sizeof (malinit));
+	addr = PTC_JMPADDR(ep, PTC_MALINIT_JOUT);
+	bcopy(&addr, bp + PTC_MALINIT_JOUT, sizeof (addr));
+	bcopy(&csize, bp + PTC_MALINIT_MCS, sizeof (csize));
+	addr = PTC_JMPADDR(ep, PTC_MALINIT_JOV);
+	bcopy(&addr, bp + PTC_MALINIT_JOV, sizeof (addr));
+	bcopy(&off, bp + PTC_MALINIT_SOFF, sizeof (off));
+
+	return (sizeof (malinit));
+}
+
+static int
+genasm_frinit(uint8_t *bp, uint32_t off, uint32_t dp, uint32_t ep, uint32_t mc)
+{
+	uint32_t addr;
+
+	bcopy(freeinit, bp, sizeof (freeinit));
+	addr = PTC_JMPADDR(dp, PTC_FRINI_JDONE);
+	bcopy(&addr, bp + PTC_FRINI_JDONE, sizeof (addr));
+	addr = PTC_JMPADDR(ep, PTC_FRINI_JFREE);
+	bcopy(&addr, bp + PTC_FRINI_JFREE, sizeof (addr));
+	bcopy(&mc, bp + PTC_FRINI_MCS, sizeof (mc));
+	addr = PTC_JMPADDR(ep, PTC_FRINI_JOV);
+	bcopy(&addr, bp + PTC_FRINI_JOV, sizeof (addr));
+	bcopy(&off, bp + PTC_FRINI_SOFF, sizeof (off));
+	return (sizeof (freeinit));
+}
+
+/*
+ * Create the initial cache entry of the specified size. The value of ap
+ * tells us the address of the label at which we try to allocate a buffer; it
+ * is expressed as an offset from the current base.
+ */
+static int
+genasm_firstcache(uint8_t *bp, uint32_t csize, uint32_t ap)
+{
+	uint32_t addr;
+
+	bcopy(inicache, bp, sizeof (inicache));
+	bcopy(&csize, bp + PTC_INICACHE_CMP, sizeof (csize));
+	bcopy(&csize, bp + PTC_INICACHE_SIZE, sizeof (csize));
+	addr = PTC_JMPADDR(ap, PTC_INICACHE_JMP);
+	ASSERT(addr != 0);
+	bcopy(&addr, bp + PTC_INICACHE_JMP, sizeof (addr));
+
+	return (sizeof (inicache));
+}
+
+static int
+genasm_gencache(uint8_t *bp, int num, uint32_t csize, uint32_t ap)
+{
+	uint32_t addr;
+	uint8_t coff;
+
+	ASSERT(256 / PTC_ROOT_SIZE > num);
+	ASSERT(num != 0);
+	bcopy(gencache, bp, sizeof (gencache));
+	bcopy(&csize, bp + PTC_GENCACHE_CMP, sizeof (csize));
+	bcopy(&csize, bp + PTC_GENCACHE_SIZE, sizeof (csize));
+	coff = num * PTC_ROOT_SIZE;
+	bcopy(&coff, bp + PTC_GENCACHE_NUM, sizeof (coff));
+	addr = PTC_JMPADDR(ap, PTC_GENCACHE_JMP);
+	bcopy(&addr, bp + PTC_GENCACHE_JMP, sizeof (addr));
+
+	return (sizeof (gencache));
+}
+
+static int
+genasm_lastcache(uint8_t *bp, int num, uint32_t csize, uint32_t ep)
+{
+	uint8_t addr;
+
+	ASSERT(ep <= 0xff && ep > 7);
+	ASSERT(256 / PTC_ROOT_SIZE > num);
+	bcopy(fincache, bp, sizeof (fincache));
+	bcopy(&csize, bp + PTC_FINCACHE_CMP, sizeof (csize));
+	bcopy(&csize, bp + PTC_FINCACHE_SIZE, sizeof (csize));
+	addr = num * PTC_ROOT_SIZE;
+	bcopy(&addr, bp + PTC_FINCACHE_NUM, sizeof (addr));
+	addr = ep - PTC_FINCACHE_JMP - 1;
+	bcopy(&addr, bp + PTC_FINCACHE_JMP, sizeof (addr));
+
+	return (sizeof (fincache));
+}
+
+static int
+genasm_malfini(uint8_t *bp, uintptr_t mptr)
+{
+	uint32_t addr;
+
+	bcopy(malfini, bp, sizeof (malfini));
+	addr = PTC_JMPADDR(mptr, ((uintptr_t)bp + PTC_MALFINI_JMADDR));
+	bcopy(&addr, bp + PTC_MALFINI_JMADDR, sizeof (addr));
+
+	return (sizeof (malfini));
+}
+
+static int
+genasm_frfini(uint8_t *bp, uint32_t maxthr, uintptr_t fptr)
+{
+	uint32_t addr;
+
+	bcopy(freefini, bp, sizeof (freefini));
+	bcopy(&maxthr, bp + PTC_FRFINI_CACHEMAX, sizeof (maxthr));
+	addr = PTC_JMPADDR(fptr, ((uintptr_t)bp + PTC_FRFINI_JFADDR));
+	bcopy(&addr, bp + PTC_FRFINI_JFADDR, sizeof (addr));
+
+	return (sizeof (freefini));
+}
+
+/*
+ * The malloc inline assembly is constructed as follows:
+ *
+ *	o Malloc prologue assembly
+ *	o Generic first-cache check
+ *	o n Generic cache checks (where n = _tmem_get_nentries() - 2)
+ *	o Generic last-cache check
+ *	o Malloc epilogue assembly
+ *
+ * Generally there are at least three caches. When there is only one cache,
+ * we use only the generic last-cache. When there are two caches, we simply
+ * leave out the middle checks.
+ */
+static int
+genasm_malloc(void *base, size_t len, int nents, int *umem_alloc_sizes)
+{
+	int ii, off;
+	uint8_t *bp;
+	size_t total;
+	uint32_t allocoff, erroff;
+
+	total = sizeof (malinit) + sizeof (malfini) + sizeof (fincache);
+
+	if (nents >= 2)
+		total += sizeof (inicache) + sizeof (gencache) * (nents - 2);
+
+	if (total > len)
+		return (1);
+
+	erroff = total - sizeof (malfini) + PTC_MALFINI_JMLABEL;
+	allocoff = total - sizeof (malfini) + PTC_MALFINI_ALLABEL;
+
+	bp = base;
+
+	off = genasm_malinit(bp, umem_tmem_off, erroff,
+	    umem_alloc_sizes[nents-1]);
+	bp += off;
+	allocoff -= off;
+	erroff -= off;
+
+	if (nents > 1) {
+		off = genasm_firstcache(bp, umem_alloc_sizes[0], allocoff);
+		bp += off;
+		allocoff -= off;
+		erroff -= off;
+	}
+
+	for (ii = 1; ii < nents - 1; ii++) {
+		off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], allocoff);
+		bp += off;
+		allocoff -= off;
+		erroff -= off;
+	}
+
+	bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1],
+	    erroff);
+	bp += genasm_malfini(bp, umem_genasm_omptr);
+	ASSERT(((uintptr_t)bp - total) == (uintptr_t)base);
+
+	return (0);
+}
+
+static int
+genasm_free(void *base, size_t len, int nents, int *umem_alloc_sizes)
+{
+	uint8_t *bp;
+	int ii, off;
+	size_t total;
+	uint32_t rbufoff, retoff, erroff;
+
+	/* Assume that nents has already been audited for us */
+	total = sizeof (freeinit) + sizeof (freefini) + sizeof (fincache);
+	if (nents >= 2)
+		total += sizeof (inicache) + sizeof (gencache) * (nents - 2);
+
+	if (total > len)
+		return (1);
+
+	erroff = total - (sizeof (freefini) - PTC_FRFINI_JFLABEL);
+	rbufoff = total - (sizeof (freefini) - PTC_FRFINI_RBUFLABEL);
+	retoff = total - (sizeof (freefini) - PTC_FRFINI_DONELABEL);
+
+	bp = base;
+
+	off = genasm_frinit(bp, umem_tmem_off, retoff, erroff,
+	    umem_alloc_sizes[nents - 1]);
+	bp += off;
+	erroff -= off;
+	rbufoff -= off;
+
+	if (nents > 1) {
+		off = genasm_firstcache(bp, umem_alloc_sizes[0], rbufoff);
+		bp += off;
+		erroff -= off;
+		rbufoff -= off;
+	}
+
+	for (ii = 1; ii < nents - 1; ii++) {
+		off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], rbufoff);
+		bp += off;
+		rbufoff -= off;
+		erroff -= off;
+	}
+
+	bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1],
+	    erroff);
+	bp += genasm_frfini(bp, umem_ptc_size, umem_genasm_ofptr);
+	ASSERT(((uintptr_t)bp - total) == (uintptr_t)base);
+
+	return (0);
+}
+
+int
+umem_genasm(int *alloc_sizes, umem_cache_t **caches, int ncaches)
+{
+	int nents, i;
+	uint8_t *mptr;
+	uint8_t *fptr;
+	uint64_t v, *vptr;
+
+	mptr = (void *)((uintptr_t)umem_genasm_mptr + 5);
+	fptr = (void *)((uintptr_t)umem_genasm_fptr + 5);
+	if (umem_genasm_mptr == 0 || umem_genasm_msize == 0 ||
+	    umem_genasm_fptr == 0 || umem_genasm_fsize == 0)
+		return (1);
+
+	/*
+	 * The total number of caches that we can service is the minimum of:
+	 *  o the amount supported by libc
+	 *  o the total number of umem caches
+	 *  o the number we can encode with a single-byte addl: 255 /
+	 *    sizeof (uintptr_t), which is 63 for 32-bit.
+	 */
+	nents = _tmem_get_nentries();
+
+	if (UMEM_GENASM_MAX32 < nents)
+		nents = UMEM_GENASM_MAX32;
+
+	if (ncaches < nents)
+		nents = ncaches;
+
+	/* Based on our constraints, this is not an error */
+	if (nents == 0 || umem_ptc_size == 0)
+		return (0);
+
+	/* Take into account the jump */
+	if (genasm_malloc(mptr, umem_genasm_msize, nents,
+	    alloc_sizes) != 0)
+		return (1);
+
+	if (genasm_free(fptr, umem_genasm_fsize, nents,
+	    alloc_sizes) != 0)
+		return (1);
+
+	/* nop out the jump with a multibyte nop */
+	vptr = (void *)umem_genasm_mptr;
+	v = MULTINOP;
+	v |= *vptr & (0xffffffULL << 40);
+	(void) atomic_swap_64(vptr, v);
+	vptr = (void *)umem_genasm_fptr;
+	v = MULTINOP;
+	v |= *vptr & (0xffffffULL << 40);
+	(void) atomic_swap_64(vptr, v);
+
+	for (i = 0; i < nents; i++)
+		caches[i]->cache_flags |= UMF_PTC;
+
+	return (0);
+}
diff --git a/usr/src/lib/libumem/sparc/umem_genasm.c b/usr/src/lib/libumem/sparc/umem_genasm.c
new file mode 100644
index 0000000000..4bdea8122d
--- /dev/null
+++ b/usr/src/lib/libumem/sparc/umem_genasm.c
@@ -0,0 +1,38 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Don't Panic! If you wonder why this seemingly empty file exists, it's
+ * because there is no sparc implementation for ptcumem. Go read libumem's
+ * big theory statement in lib/libumem/common/umem.c, particularly section
+ * eight.
+ */
+
+#include <umem_impl.h>
+#include "umem_base.h"
+
+const int umem_genasm_supported = 0;
+
+/*ARGSUSED*/
+int
+umem_genasm(int *alloc_sizes, umem_cache_t **caches, int ncaches)
+{
+	return (1);
+}
diff --git a/usr/src/man/man3malloc/umem_alloc.3malloc b/usr/src/man/man3malloc/umem_alloc.3malloc
index cc8e3df369..d8680ca083 100644
--- a/usr/src/man/man3malloc/umem_alloc.3malloc
+++ b/usr/src/man/man3malloc/umem_alloc.3malloc
@@ -174,6 +174,19 @@ Set the underlying function used to allocate memory. This option can be set to
 \fBmmap\fR(2)-based source. If set to a value that is not supported, \fBsbrk\fR
 will be used.
 .RE
+.sp
+.ne 2
+.na
+\fB\fBperthread_cache\fR=\fBsize\fR\fR
+.ad
+.RS 16n
+libumem allows each thread to cache recently freed small allocations for
+future allocations. The size argument, which accepts the k, m, g, and t
+suffixes, denotes the maximum amount of memory each thread can use for this
+purpose. The default amount used is 1 MB. Any buffers in the per-thread cache
+are freed when the thread exits. The efficacy of the per-thread cache can be
+determined with the \fB::umastat\fR dcmd of the \fBmdb\fR(1) debugger.
+.RE
 .ne 2
 .na