author	Robert Mustacchi <rm@joyent.com>	2013-09-29 13:06:51 -0700
committer	Robert Mustacchi <rm@joyent.com>	2014-01-21 18:20:39 -0800
commit	4f364e7c95ee7fd9d5bbeddc1940e92405bb0e72
tree	8f95ebd8dfeb9ab49e53704d900b2d0f0f217b37
parent	38849194df07385a46363bb46861688fde59a98a
download	illumos-joyent-4f364e7c95ee7fd9d5bbeddc1940e92405bb0e72.tar.gz
4489 need ptcumem
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
-rw-r--r--exception_lists/check_rtime2
-rw-r--r--usr/src/cmd/mdb/common/kmdb/kmdb_umemglue.c10
-rw-r--r--usr/src/cmd/mdb/common/modules/libc/libc.c9
-rw-r--r--usr/src/cmd/mdb/common/modules/libumem/libumem.c197
-rw-r--r--usr/src/cmd/mdb/common/modules/libumem/umem.c214
-rw-r--r--usr/src/cmd/mdb/intel/amd64/libumem/Makefile1
-rw-r--r--usr/src/cmd/mdb/intel/ia32/libumem/Makefile1
-rw-r--r--usr/src/cmd/mdb/sparc/v7/libumem/Makefile1
-rw-r--r--usr/src/cmd/mdb/sparc/v9/libumem/Makefile1
-rw-r--r--usr/src/lib/libc/amd64/Makefile3
-rw-r--r--usr/src/lib/libc/i386/Makefile.com3
-rw-r--r--usr/src/lib/libc/inc/thr_uberdata.h29
-rw-r--r--usr/src/lib/libc/port/mapfile-vers4
-rw-r--r--usr/src/lib/libc/port/threads/thr.c4
-rw-r--r--usr/src/lib/libc/port/threads/tmem.c85
-rw-r--r--usr/src/lib/libc/sparc/Makefile.com2
-rw-r--r--usr/src/lib/libc/sparcv9/Makefile.com2
-rw-r--r--usr/src/lib/libumem/Makefile.com9
-rw-r--r--usr/src/lib/libumem/amd64/umem_genasm.c604
-rw-r--r--usr/src/lib/libumem/common/envvar.c10
-rw-r--r--usr/src/lib/libumem/common/linktest_stand.c3
-rw-r--r--usr/src/lib/libumem/common/malloc.c15
-rw-r--r--usr/src/lib/libumem/common/mapfile-vers12
-rw-r--r--usr/src/lib/libumem/common/stub_stand.c21
-rw-r--r--usr/src/lib/libumem/common/umem.c269
-rw-r--r--usr/src/lib/libumem/common/umem_base.h22
-rw-r--r--usr/src/lib/libumem/common/umem_impl.h13
-rw-r--r--usr/src/lib/libumem/i386/asm_subr.s47
-rw-r--r--usr/src/lib/libumem/i386/umem_genasm.c595
-rw-r--r--usr/src/lib/libumem/sparc/umem_genasm.c38
-rw-r--r--usr/src/man/man3malloc/umem_alloc.3malloc13
31 files changed, 2184 insertions, 55 deletions
diff --git a/exception_lists/check_rtime b/exception_lists/check_rtime
index ce606dc0ea..fce897b09b 100644
--- a/exception_lists/check_rtime
+++ b/exception_lists/check_rtime
@@ -63,6 +63,8 @@ SKIP ^usr/lib/sysevent/modules/picl_slm.so$
# Objects that are allowed to have executable data segments
EXEC_DATA ^MACH(lib)/ld\.so\.1$
EXEC_DATA ^lib/libc\.so\.1$ # 6524709, 32-bit, needed for x86 only
+EXEC_DATA ^lib/amd64/libumem\.so\.1$ # ptcumem
+EXEC_DATA ^lib/libumem\.so\.1$ # ptcumem
EXEC_DATA ^opt/SUNWdtrt/tst/.*/ustack/tst\.helper\.exe$
EXEC_DATA ^platform/.*/MACH(kernel)/unix$
EXEC_DATA ^platform/.*/multiboot$
diff --git a/usr/src/cmd/mdb/common/kmdb/kmdb_umemglue.c b/usr/src/cmd/mdb/common/kmdb/kmdb_umemglue.c
index 4193b0dcd5..c2289ec7f7 100644
--- a/usr/src/cmd/mdb/common/kmdb/kmdb_umemglue.c
+++ b/usr/src/cmd/mdb/common/kmdb/kmdb_umemglue.c
@@ -24,8 +24,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <mdb/mdb_debug.h>
#include <mdb/mdb_err.h>
#include <mdb/mdb_io.h>
@@ -101,6 +99,14 @@ umem_atomic_add_64(uint64_t *target, int64_t delta)
*target = *target + delta;
}
+uint64_t
+umem_atomic_swap_64(volatile uint64_t *t, uint64_t v)
+{
+ uint64_t old = *t;
+ *t = v;
+ return (old);
+}
+
/*
* Standalone umem must be manually initialized
*/
diff --git a/usr/src/cmd/mdb/common/modules/libc/libc.c b/usr/src/cmd/mdb/common/modules/libc/libc.c
index 27dcade228..44e4f49b87 100644
--- a/usr/src/cmd/mdb/common/modules/libc/libc.c
+++ b/usr/src/cmd/mdb/common/modules/libc/libc.c
@@ -23,6 +23,9 @@
* Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
#include <sys/mdb_modapi.h>
#include <mdb/mdb_whatis.h>
@@ -681,6 +684,12 @@ d_ulwp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
prt_addr((void *)(addr + OFFSET(ul_spinlock)), 1),
prt_addr((void *)(addr + OFFSET(ul_fpuenv)), 0));
+ HD("tmem.size &tmem.roots");
+ mdb_printf(OFFSTR "%-21H %s\n",
+ OFFSET(ul_tmem),
+ ulwp.ul_tmem.tm_size,
+ prt_addr((void *)(addr + OFFSET(ul_tmem) + sizeof (size_t)), 0));
+
return (DCMD_OK);
}
diff --git a/usr/src/cmd/mdb/common/modules/libumem/libumem.c b/usr/src/cmd/mdb/common/modules/libumem/libumem.c
index 4a77c5aa82..0984edbdf0 100644
--- a/usr/src/cmd/mdb/common/modules/libumem/libumem.c
+++ b/usr/src/cmd/mdb/common/modules/libumem/libumem.c
@@ -23,6 +23,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
#include "umem.h"
#include <libproc.h>
#include <mdb/mdb_modapi.h>
@@ -34,6 +38,8 @@
#include <umem_impl.h>
#include <sys/vmem_impl_user.h>
+#include <thr_uberdata.h>
+#include <stdio.h>
#include "umem_pagesize.h"
@@ -44,24 +50,33 @@ typedef struct datafmt {
char *fmt;
} datafmt_t;
+static datafmt_t ptcfmt[] = {
+ { " ", "tid", "---", "%3u " },
+ { " memory", " cached", "-------", "%7lH " },
+ { " %", "cap", "---", "%3u " },
+ { " %", NULL, "---", "%3u " },
+ { NULL, NULL, NULL, NULL }
+};
+
static datafmt_t umemfmt[] = {
{ "cache ", "name ",
"-------------------------", "%-25s " },
{ " buf", " size", "------", "%6u " },
- { " buf", "in use", "------", "%6u " },
- { " buf", " total", "------", "%6u " },
- { " memory", " in use", "---------", "%9u " },
+ { " buf", " in use", "-------", "%7u " },
+ { " buf", " in ptc", "-------", "%7s " },
+ { " buf", " total", "-------", "%7u " },
+ { " memory", " in use", "-------", "%7H " },
{ " alloc", " succeed", "---------", "%9u " },
- { "alloc", " fail", "-----", "%5llu " },
+ { "alloc", " fail", "-----", "%5llu" },
{ NULL, NULL, NULL, NULL }
};
static datafmt_t vmemfmt[] = {
{ "vmem ", "name ",
"-------------------------", "%-*s " },
- { " memory", " in use", "---------", "%9llu " },
- { " memory", " total", "----------", "%10llu " },
- { " memory", " import", "---------", "%9llu " },
+ { " memory", " in use", "---------", "%9H " },
+ { " memory", " total", "----------", "%10H " },
+ { " memory", " import", "---------", "%9H " },
{ " alloc", " succeed", "---------", "%9llu " },
{ "alloc", " fail", "-----", "%5llu " },
{ NULL, NULL, NULL, NULL }
@@ -105,14 +120,105 @@ typedef struct umastat_vmem {
int kv_fail;
} umastat_vmem_t;
+/*ARGSUSED*/
+static int
+umastat_cache_nptc(uintptr_t addr, const umem_cache_t *cp, int *nptc)
+{
+ if (!(cp->cache_flags & UMF_PTC))
+ return (WALK_NEXT);
+
+ (*nptc)++;
+ return (WALK_NEXT);
+}
+
+/*ARGSUSED*/
+static int
+umastat_cache_hdr(uintptr_t addr, const umem_cache_t *cp, void *ignored)
+{
+ if (!(cp->cache_flags & UMF_PTC))
+ return (WALK_NEXT);
+
+ mdb_printf("%3d ", cp->cache_bufsize);
+ return (WALK_NEXT);
+}
+
+/*ARGSUSED*/
+static int
+umastat_lwp_ptc(uintptr_t addr, void *buf, int *nbufs)
+{
+ (*nbufs)++;
+ return (WALK_NEXT);
+}
+
+/*ARGSUSED*/
+static int
+umastat_lwp_cache(uintptr_t addr, const umem_cache_t *cp, ulwp_t *ulwp)
+{
+ char walk[60];
+ int nbufs = 0;
+
+ if (!(cp->cache_flags & UMF_PTC))
+ return (WALK_NEXT);
+
+ (void) snprintf(walk, sizeof (walk), "umem_ptc_%d", cp->cache_bufsize);
+
+ if (mdb_pwalk(walk, (mdb_walk_cb_t)umastat_lwp_ptc,
+ &nbufs, (uintptr_t)ulwp->ul_self) == -1) {
+ mdb_warn("unable to walk '%s'", walk);
+ return (WALK_ERR);
+ }
+
+ mdb_printf("%3d ", ulwp->ul_tmem.tm_size ?
+ (nbufs * cp->cache_bufsize * 100) / ulwp->ul_tmem.tm_size : 0);
+
+ return (WALK_NEXT);
+}
+
+/*ARGSUSED*/
+static int
+umastat_lwp(uintptr_t addr, const ulwp_t *ulwp, void *ignored)
+{
+ size_t size;
+ datafmt_t *dfp = ptcfmt;
+
+ mdb_printf((dfp++)->fmt, ulwp->ul_lwpid);
+ mdb_printf((dfp++)->fmt, ulwp->ul_tmem.tm_size);
+
+ if (umem_readvar(&size, "umem_ptc_size") == -1) {
+ mdb_warn("unable to read 'umem_ptc_size'");
+ return (WALK_ERR);
+ }
+
+ mdb_printf((dfp++)->fmt, (ulwp->ul_tmem.tm_size * 100) / size);
+
+ if (mdb_walk("umem_cache",
+ (mdb_walk_cb_t)umastat_lwp_cache, (void *)ulwp) == -1) {
+ mdb_warn("can't walk 'umem_cache'");
+ return (WALK_ERR);
+ }
+
+ mdb_printf("\n");
+
+ return (WALK_NEXT);
+}
+
+/*ARGSUSED*/
+static int
+umastat_cache_ptc(uintptr_t addr, const void *ignored, int *nptc)
+{
+ (*nptc)++;
+ return (WALK_NEXT);
+}
+
static int
umastat_cache(uintptr_t addr, const umem_cache_t *cp, umastat_vmem_t **kvp)
{
umastat_vmem_t *kv;
datafmt_t *dfp = umemfmt;
+ char buf[10];
int magsize;
- int avail, alloc, total;
+ int avail, alloc, total, nptc = 0;
size_t meminuse = (cp->cache_slab_create - cp->cache_slab_destroy) *
cp->cache_slabsize;
@@ -130,6 +236,21 @@ umastat_cache(uintptr_t addr, const umem_cache_t *cp, umastat_vmem_t **kvp)
(void) mdb_pwalk("umem_cpu_cache", cpu_avail, &avail, addr);
(void) mdb_pwalk("umem_slab_partial", slab_avail, &avail, addr);
+ if (cp->cache_flags & UMF_PTC) {
+ char walk[60];
+
+ (void) snprintf(walk, sizeof (walk),
+ "umem_ptc_%d", cp->cache_bufsize);
+
+ if (mdb_walk(walk,
+ (mdb_walk_cb_t)umastat_cache_ptc, &nptc) == -1) {
+ mdb_warn("unable to walk '%s'", walk);
+ return (WALK_ERR);
+ }
+
+ (void) snprintf(buf, sizeof (buf), "%d", nptc);
+ }
+
for (kv = *kvp; kv != NULL; kv = kv->kv_next) {
if (kv->kv_addr == (uintptr_t)cp->cache_arena)
goto out;
@@ -147,6 +268,7 @@ out:
mdb_printf((dfp++)->fmt, cp->cache_name);
mdb_printf((dfp++)->fmt, cp->cache_bufsize);
mdb_printf((dfp++)->fmt, total - avail);
+ mdb_printf((dfp++)->fmt, cp->cache_flags & UMF_PTC ? buf : "-");
mdb_printf((dfp++)->fmt, total);
mdb_printf((dfp++)->fmt, meminuse);
mdb_printf((dfp++)->fmt, alloc);
@@ -165,8 +287,8 @@ umastat_vmem_totals(uintptr_t addr, const vmem_t *v, umastat_vmem_t *kv)
if (kv == NULL || kv->kv_alloc == 0)
return (WALK_NEXT);
- mdb_printf("Total [%s]%*s %6s %6s %6s %9u %9u %5u\n", v->vm_name,
- 17 - strlen(v->vm_name), "", "", "", "",
+ mdb_printf("Total [%s]%*s %6s %7s %7s %7s %7H %9u %5u\n", v->vm_name,
+ 17 - strlen(v->vm_name), "", "", "", "", "",
kv->kv_meminuse, kv->kv_alloc, kv->kv_fail);
return (WALK_NEXT);
@@ -209,20 +331,67 @@ umastat(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
{
umastat_vmem_t *kv = NULL;
datafmt_t *dfp;
+ int nptc = 0, i;
if (argc != 0)
return (DCMD_USAGE);
+ /*
+ * We need to determine if we have any caches that have per-thread
+ * caching enabled.
+ */
+ if (mdb_walk("umem_cache",
+ (mdb_walk_cb_t)umastat_cache_nptc, &nptc) == -1) {
+ mdb_warn("can't walk 'umem_cache'");
+ return (DCMD_ERR);
+ }
+
+ if (nptc) {
+ for (dfp = ptcfmt; dfp->hdr2 != NULL; dfp++)
+ mdb_printf("%s ", dfp->hdr1);
+
+ for (i = 0; i < nptc; i++)
+ mdb_printf("%s ", dfp->hdr1);
+
+ mdb_printf("\n");
+
+ for (dfp = ptcfmt; dfp->hdr2 != NULL; dfp++)
+ mdb_printf("%s ", dfp->hdr2);
+
+ if (mdb_walk("umem_cache",
+ (mdb_walk_cb_t)umastat_cache_hdr, NULL) == -1) {
+ mdb_warn("can't walk 'umem_cache'");
+ return (DCMD_ERR);
+ }
+
+ mdb_printf("\n");
+
+ for (dfp = ptcfmt; dfp->hdr2 != NULL; dfp++)
+ mdb_printf("%s ", dfp->dashes);
+
+ for (i = 0; i < nptc; i++)
+ mdb_printf("%s ", dfp->dashes);
+
+ mdb_printf("\n");
+
+ if (mdb_walk("ulwp", (mdb_walk_cb_t)umastat_lwp, NULL) == -1) {
+ mdb_warn("can't walk 'ulwp'");
+ return (DCMD_ERR);
+ }
+
+ mdb_printf("\n");
+ }
+
for (dfp = umemfmt; dfp->hdr1 != NULL; dfp++)
- mdb_printf("%s ", dfp->hdr1);
+ mdb_printf("%s%s", dfp == umemfmt ? "" : " ", dfp->hdr1);
mdb_printf("\n");
for (dfp = umemfmt; dfp->hdr1 != NULL; dfp++)
- mdb_printf("%s ", dfp->hdr2);
+ mdb_printf("%s%s", dfp == umemfmt ? "" : " ", dfp->hdr2);
mdb_printf("\n");
for (dfp = umemfmt; dfp->hdr1 != NULL; dfp++)
- mdb_printf("%s ", dfp->dashes);
+ mdb_printf("%s%s", dfp == umemfmt ? "" : " ", dfp->dashes);
mdb_printf("\n");
if (mdb_walk("umem_cache", (mdb_walk_cb_t)umastat_cache, &kv) == -1) {
@@ -231,7 +400,7 @@ umastat(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
}
for (dfp = umemfmt; dfp->hdr1 != NULL; dfp++)
- mdb_printf("%s ", dfp->dashes);
+ mdb_printf("%s%s", dfp == umemfmt ? "" : " ", dfp->dashes);
mdb_printf("\n");
if (mdb_walk("vmem", (mdb_walk_cb_t)umastat_vmem_totals, kv) == -1) {
diff --git a/usr/src/cmd/mdb/common/modules/libumem/umem.c b/usr/src/cmd/mdb/common/modules/libumem/umem.c
index 26a62c7b52..73dd4d6e89 100644
--- a/usr/src/cmd/mdb/common/modules/libumem/umem.c
+++ b/usr/src/cmd/mdb/common/modules/libumem/umem.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
@@ -36,6 +36,8 @@
#include <alloca.h>
#include <limits.h>
#include <mdb/mdb_whatis.h>
+#include <thr_uberdata.h>
+#include <stdio.h>
#include "misc.h"
#include "leaky.h"
@@ -104,12 +106,58 @@ umem_update_variables(void)
return (0);
}
+static int
+umem_ptc_walk_init(mdb_walk_state_t *wsp)
+{
+ if (wsp->walk_addr == NULL) {
+ if (mdb_layered_walk("ulwp", wsp) == -1) {
+ mdb_warn("couldn't walk 'ulwp'");
+ return (WALK_ERR);
+ }
+ }
+
+ return (WALK_NEXT);
+}
+
+static int
+umem_ptc_walk_step(mdb_walk_state_t *wsp)
+{
+ uintptr_t this;
+ int rval;
+
+ if (wsp->walk_layer != NULL) {
+ this = (uintptr_t)((ulwp_t *)wsp->walk_layer)->ul_self +
+ (uintptr_t)wsp->walk_arg;
+ } else {
+ this = wsp->walk_addr + (uintptr_t)wsp->walk_arg;
+ }
+
+ for (;;) {
+ if (mdb_vread(&this, sizeof (void *), this) == -1) {
+ mdb_warn("couldn't read ptc buffer at %p", this);
+ return (WALK_ERR);
+ }
+
+ if (this == NULL)
+ break;
+
+ rval = wsp->walk_callback(this, &this, wsp->walk_cbdata);
+
+ if (rval != WALK_NEXT)
+ return (rval);
+ }
+
+ return (wsp->walk_layer != NULL ? WALK_NEXT : WALK_DONE);
+}
+
/*ARGSUSED*/
static int
-umem_init_walkers(uintptr_t addr, const umem_cache_t *c, void *ignored)
+umem_init_walkers(uintptr_t addr, const umem_cache_t *c, int *sizes)
{
mdb_walker_t w;
char descr[64];
+ char name[64];
+ int i;
(void) mdb_snprintf(descr, sizeof (descr),
"walk the %s cache", c->cache_name);
@@ -124,6 +172,45 @@ umem_init_walkers(uintptr_t addr, const umem_cache_t *c, void *ignored)
if (mdb_add_walker(&w) == -1)
mdb_warn("failed to add %s walker", c->cache_name);
+ if (!(c->cache_flags & UMF_PTC))
+ return (WALK_NEXT);
+
+ /*
+ * For the per-thread cache walker, the address is the offset in the
+ * tm_roots[] array of the ulwp_t.
+ */
+ for (i = 0; sizes[i] != 0; i++) {
+ if (sizes[i] == c->cache_bufsize)
+ break;
+ }
+
+ if (sizes[i] == 0) {
+ mdb_warn("cache %s is cached per-thread, but could not find "
+ "size in umem_alloc_sizes\n", c->cache_name);
+ return (WALK_NEXT);
+ }
+
+ if (i >= NTMEMBASE) {
+ mdb_warn("index for %s (%d) exceeds root slots (%d)\n",
+ c->cache_name, i, NTMEMBASE);
+ return (WALK_NEXT);
+ }
+
+ (void) mdb_snprintf(name, sizeof (name),
+ "umem_ptc_%d", c->cache_bufsize);
+ (void) mdb_snprintf(descr, sizeof (descr),
+ "walk the per-thread cache for %s", c->cache_name);
+
+ w.walk_name = name;
+ w.walk_descr = descr;
+ w.walk_init = umem_ptc_walk_init;
+ w.walk_step = umem_ptc_walk_step;
+ w.walk_fini = NULL;
+ w.walk_init_arg = (void *)offsetof(ulwp_t, ul_tmem.tm_roots[i]);
+
+ if (mdb_add_walker(&w) == -1)
+ mdb_warn("failed to add %s walker", w.walk_name);
+
return (WALK_NEXT);
}
@@ -132,6 +219,8 @@ static void
umem_statechange_cb(void *arg)
{
static int been_ready = 0;
+ GElf_Sym sym;
+ int *sizes;
#ifndef _KMDB
leaky_cleanup(1); /* state changes invalidate leaky state */
@@ -147,7 +236,25 @@ umem_statechange_cb(void *arg)
return;
been_ready = 1;
- (void) mdb_walk("umem_cache", (mdb_walk_cb_t)umem_init_walkers, NULL);
+
+ /*
+ * In order to determine the tm_roots offset of any cache that is
+ * cached per-thread, we need to have the umem_alloc_sizes array.
+ * Read this, assuring that it is zero-terminated.
+ */
+ if (umem_lookup_by_name("umem_alloc_sizes", &sym) == -1) {
+ mdb_warn("unable to lookup 'umem_alloc_sizes'");
+ return;
+ }
+
+ sizes = mdb_zalloc(sym.st_size + sizeof (int), UM_SLEEP | UM_GC);
+
+ if (mdb_vread(sizes, sym.st_size, (uintptr_t)sym.st_value) == -1) {
+ mdb_warn("couldn't read 'umem_alloc_sizes'");
+ return;
+ }
+
+ (void) mdb_walk("umem_cache", (mdb_walk_cb_t)umem_init_walkers, sizes);
}
int
@@ -788,9 +895,9 @@ umem_estimate_allocated(uintptr_t addr, const umem_cache_t *cp)
} \
}
-int
+static int
umem_read_magazines(umem_cache_t *cp, uintptr_t addr,
- void ***maglistp, size_t *magcntp, size_t *magmaxp, int alloc_flags)
+ void ***maglistp, size_t *magcntp, size_t *magmaxp)
{
umem_magazine_t *ump, *mp;
void **maglist = NULL;
@@ -807,7 +914,7 @@ umem_read_magazines(umem_cache_t *cp, uintptr_t addr,
*maglistp = NULL;
*magcntp = 0;
*magmaxp = 0;
- return (WALK_NEXT);
+ return (0);
}
/*
@@ -828,11 +935,11 @@ umem_read_magazines(umem_cache_t *cp, uintptr_t addr,
if (magbsize >= PAGESIZE / 2) {
mdb_warn("magazine size for cache %p unreasonable (%x)\n",
addr, magbsize);
- return (WALK_ERR);
+ return (-1);
}
- maglist = mdb_alloc(magmax * sizeof (void *), alloc_flags);
- mp = mdb_alloc(magbsize, alloc_flags);
+ maglist = mdb_alloc(magmax * sizeof (void *), UM_SLEEP);
+ mp = mdb_alloc(magbsize, UM_SLEEP);
if (mp == NULL || maglist == NULL)
goto fail;
@@ -875,23 +982,80 @@ umem_read_magazines(umem_cache_t *cp, uintptr_t addr,
dprintf(("magazine layer: %d buffers\n", magcnt));
- if (!(alloc_flags & UM_GC))
- mdb_free(mp, magbsize);
+ mdb_free(mp, magbsize);
*maglistp = maglist;
*magcntp = magcnt;
*magmaxp = magmax;
- return (WALK_NEXT);
+ return (0);
fail:
- if (!(alloc_flags & UM_GC)) {
- if (mp)
- mdb_free(mp, magbsize);
- if (maglist)
- mdb_free(maglist, magmax * sizeof (void *));
+ if (mp)
+ mdb_free(mp, magbsize);
+ if (maglist)
+ mdb_free(maglist, magmax * sizeof (void *));
+
+ return (-1);
+}
+
+typedef struct umem_read_ptc_walk {
+ void **urpw_buf;
+ size_t urpw_cnt;
+ size_t urpw_max;
+} umem_read_ptc_walk_t;
+
+/*ARGSUSED*/
+static int
+umem_read_ptc_walk_buf(uintptr_t addr,
+ const void *ignored, umem_read_ptc_walk_t *urpw)
+{
+ if (urpw->urpw_cnt == urpw->urpw_max) {
+ size_t nmax = urpw->urpw_max ? (urpw->urpw_max << 1) : 1;
+ void **new = mdb_zalloc(nmax * sizeof (void *), UM_SLEEP);
+
+ if (nmax > 1) {
+ size_t osize = urpw->urpw_max * sizeof (void *);
+ bcopy(urpw->urpw_buf, new, osize);
+ mdb_free(urpw->urpw_buf, osize);
+ }
+
+ urpw->urpw_buf = new;
+ urpw->urpw_max = nmax;
}
- return (WALK_ERR);
+
+ urpw->urpw_buf[urpw->urpw_cnt++] = (void *)addr;
+
+ return (WALK_NEXT);
+}
+
+static int
+umem_read_ptc(umem_cache_t *cp,
+ void ***buflistp, size_t *bufcntp, size_t *bufmaxp)
+{
+ umem_read_ptc_walk_t urpw;
+ char walk[60];
+ int rval;
+
+ if (!(cp->cache_flags & UMF_PTC))
+ return (0);
+
+ (void) snprintf(walk, sizeof (walk), "umem_ptc_%d", cp->cache_bufsize);
+
+ urpw.urpw_buf = *buflistp;
+ urpw.urpw_cnt = *bufcntp;
+ urpw.urpw_max = *bufmaxp;
+
+ if ((rval = mdb_walk(walk,
+ (mdb_walk_cb_t)umem_read_ptc_walk_buf, &urpw)) == -1) {
+ mdb_warn("couldn't walk %s", walk);
+ }
+
+ *buflistp = urpw.urpw_buf;
+ *bufcntp = urpw.urpw_cnt;
+ *bufmaxp = urpw.urpw_max;
+
+ return (rval);
}
static int
@@ -1022,13 +1186,19 @@ umem_walk_init_common(mdb_walk_state_t *wsp, int type)
/*
* Read in the contents of the magazine layer
*/
- if (umem_read_magazines(cp, addr, &maglist, &magcnt, &magmax,
- UM_SLEEP) == WALK_ERR)
+ if (umem_read_magazines(cp, addr, &maglist, &magcnt, &magmax) != 0)
+ goto out2;
+
+ /*
+ * Read in the contents of the per-thread caches, if any
+ */
+ if (umem_read_ptc(cp, &maglist, &magcnt, &magmax) != 0)
goto out2;
/*
- * We have all of the buffers from the magazines; if we are walking
- * allocated buffers, sort them so we can bsearch them later.
+ * We have all of the buffers from the magazines and from the
+ * per-thread cache (if any); if we are walking allocated buffers,
+ * sort them so we can bsearch them later.
*/
if (type & UM_ALLOCATED)
qsort(maglist, magcnt, sizeof (void *), addrcmp);
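
With the umem_ptc_%d walkers registered by this module, the per-thread caches
become visible from mdb. As an illustrative sketch (the walker name is derived
from the cache's buffer size, here 32 bytes, and the ulwp address is
hypothetical), one can walk every thread's cached 32-byte buffers, or restrict
the walk to a single thread:

	> ::walk umem_ptc_32
	> 0xfec82a40::walk umem_ptc_32
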
diff --git a/usr/src/cmd/mdb/intel/amd64/libumem/Makefile b/usr/src/cmd/mdb/intel/amd64/libumem/Makefile
index a8352ff865..704ff65873 100644
--- a/usr/src/cmd/mdb/intel/amd64/libumem/Makefile
+++ b/usr/src/cmd/mdb/intel/amd64/libumem/Makefile
@@ -42,6 +42,7 @@ include ../../../../Makefile.cmd
include ../../../../Makefile.cmd.64
CPPFLAGS += -I$(SRC)/lib/libumem/common
+CPPFLAGS += -I$(SRC)/lib/libc/inc
CPPFLAGS += -I$(MODSRCS_DIR)
include ../../Makefile.amd64
diff --git a/usr/src/cmd/mdb/intel/ia32/libumem/Makefile b/usr/src/cmd/mdb/intel/ia32/libumem/Makefile
index 2399f51d31..a1ab338f40 100644
--- a/usr/src/cmd/mdb/intel/ia32/libumem/Makefile
+++ b/usr/src/cmd/mdb/intel/ia32/libumem/Makefile
@@ -40,6 +40,7 @@ MODSRCS = \
include ../../../../Makefile.cmd
+CPPFLAGS += -I$(SRC)/lib/libc/inc
CPPFLAGS += -I$(SRC)/lib/libumem/common
CPPFLAGS += -I$(MODSRCS_DIR)
diff --git a/usr/src/cmd/mdb/sparc/v7/libumem/Makefile b/usr/src/cmd/mdb/sparc/v7/libumem/Makefile
index 4553b15eba..906d05d5ea 100644
--- a/usr/src/cmd/mdb/sparc/v7/libumem/Makefile
+++ b/usr/src/cmd/mdb/sparc/v7/libumem/Makefile
@@ -41,6 +41,7 @@ MODSRCS = \
include ../../../../Makefile.cmd
CPPFLAGS += -I$(SRC)/lib/libumem/common
+CPPFLAGS += -I$(SRC)/lib/libc/inc
CPPFLAGS += -I$(MODSRCS_DIR)
include ../../Makefile.sparcv7
diff --git a/usr/src/cmd/mdb/sparc/v9/libumem/Makefile b/usr/src/cmd/mdb/sparc/v9/libumem/Makefile
index 2cbeb25f5e..09ea0473c6 100644
--- a/usr/src/cmd/mdb/sparc/v9/libumem/Makefile
+++ b/usr/src/cmd/mdb/sparc/v9/libumem/Makefile
@@ -54,6 +54,7 @@ KMOD_SOURCES_DIFFERENT=$(POUND_SIGN)
include ../../../../Makefile.cmd
CPPFLAGS += -I$(SRC)/lib/libumem/common
+CPPFLAGS += -I$(SRC)/lib/libc/inc
CPPFLAGS += -I$(MODSRCS_DIR)
include ../../../../Makefile.cmd.64
diff --git a/usr/src/lib/libc/amd64/Makefile b/usr/src/lib/libc/amd64/Makefile
index 9ddd748eb6..873c2ded87 100644
--- a/usr/src/lib/libc/amd64/Makefile
+++ b/usr/src/lib/libc/amd64/Makefile
@@ -20,6 +20,7 @@
#
#
# Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2012, Joyent, Inc. All rights reserved.
#
# Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
# Copyright 2011 Nexenta Systems, Inc. All rights reserved.
@@ -798,6 +799,7 @@ THREADSOBJS= \
assfail.o \
cancel.o \
door_calls.o \
+ tmem.o \
pthr_attr.o \
pthr_barrier.o \
pthr_cond.o \
@@ -1119,6 +1121,7 @@ TIL= \
thread_pool.o \
thrp_unwind.o \
tls.o \
+ tmem.o \
tsd.o
$(TIL:%=pics/%) := CFLAGS64 += $(LIBCBASE)/threads/amd64.il
diff --git a/usr/src/lib/libc/i386/Makefile.com b/usr/src/lib/libc/i386/Makefile.com
index 31a7bc945f..b21f87a0d6 100644
--- a/usr/src/lib/libc/i386/Makefile.com
+++ b/usr/src/lib/libc/i386/Makefile.com
@@ -20,6 +20,7 @@
#
#
# Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2013, Joyent, Inc. All rights reserved.
# Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
#
# Copyright 2011 Nexenta Systems, Inc. All rights reserved.
@@ -840,6 +841,7 @@ THREADSOBJS= \
assfail.o \
cancel.o \
door_calls.o \
+ tmem.o \
pthr_attr.o \
pthr_barrier.o \
pthr_cond.o \
@@ -1177,6 +1179,7 @@ TIL= \
thread_pool.o \
tls.o \
tsd.o \
+ tmem.o \
unwind.o
THREADS_INLINES = $(LIBCBASE)/threads/i386.il
diff --git a/usr/src/lib/libc/inc/thr_uberdata.h b/usr/src/lib/libc/inc/thr_uberdata.h
index 42c08049b2..de0d4a6b05 100644
--- a/usr/src/lib/libc/inc/thr_uberdata.h
+++ b/usr/src/lib/libc/inc/thr_uberdata.h
@@ -22,6 +22,9 @@
/*
* Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
#ifndef _THR_UBERDATA_H
#define _THR_UBERDATA_H
@@ -488,6 +491,29 @@ typedef struct {
#endif /* _SYSCALL32 */
/*
+ * To support per-thread caching in libumem (ptcumem), we add a small amount of
+ * data to the thread's uberdata. The tm_roots are the roots of the linked
+ * lists which are used by libumem to chain together allocations. tm_size is used
+ * to track the total amount of data stored across those linked lists. For more
+ * information, see libumem's big theory statement.
+ */
+#define NTMEMBASE 16
+
+typedef struct {
+ size_t tm_size;
+ void *tm_roots[NTMEMBASE];
+} tumem_t;
+
+#ifdef _SYSCALL32
+typedef struct {
+ uint32_t tm_size;
+ caddr32_t tm_roots[NTMEMBASE];
+} tumem32_t;
+#endif
+
+typedef void (*tmem_func_t)(void *, int);
+
+/*
* Maximum number of read locks allowed for one thread on one rwlock.
* This could be as large as INT_MAX, but the SUSV3 test suite would
* take an inordinately long time to complete. This is big enough.
@@ -653,6 +679,7 @@ typedef struct ulwp {
#if defined(sparc)
void *ul_unwind_ret; /* used only by _ex_clnup_handler() */
#endif
+ tumem_t ul_tmem; /* used only by umem */
} ulwp_t;
#define ul_cursig ul_cp.s.cursig /* deferred signal number */
@@ -1083,6 +1110,7 @@ typedef struct ulwp32 {
#if defined(sparc)
caddr32_t ul_unwind_ret; /* used only by _ex_clnup_handler() */
#endif
+ tumem32_t ul_tmem; /* used only by umem */
} ulwp32_t;
#define REPLACEMENT_SIZE32 ((size_t)&((ulwp32_t *)NULL)->ul_sigmask)
@@ -1205,6 +1233,7 @@ extern ulwp_t *find_lwp(thread_t);
extern void finish_init(void);
extern void update_sched(ulwp_t *);
extern void queue_alloc(void);
+extern void tmem_exit(void);
extern void tsd_exit(void);
extern void tsd_free(ulwp_t *);
extern void tls_setup(void);
diff --git a/usr/src/lib/libc/port/mapfile-vers b/usr/src/lib/libc/port/mapfile-vers
index 1cdc95be16..1882a337d5 100644
--- a/usr/src/lib/libc/port/mapfile-vers
+++ b/usr/src/lib/libc/port/mapfile-vers
@@ -25,6 +25,7 @@
# Use is subject to license terms.
#
# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2012, Joyent, Inc. All rights reserved.
# Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
# Copyright (c) 2013 Gary Mills
#
@@ -2903,6 +2904,9 @@ $endif
thr_wait_mutator;
_thr_wait_mutator;
__tls_get_addr;
+ _tmem_get_base;
+ _tmem_get_nentries;
+ _tmem_set_cleanup;
tpool_create;
tpool_dispatch;
tpool_destroy;
diff --git a/usr/src/lib/libc/port/threads/thr.c b/usr/src/lib/libc/port/threads/thr.c
index ae55fbddf5..b5d848449d 100644
--- a/usr/src/lib/libc/port/threads/thr.c
+++ b/usr/src/lib/libc/port/threads/thr.c
@@ -22,6 +22,9 @@
/*
* Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
#include "lint.h"
#include "thr_uberdata.h"
@@ -771,6 +774,7 @@ _thrp_exit()
}
lmutex_unlock(&udp->link_lock);
+ tmem_exit(); /* deallocate tmem allocations */
tsd_exit(); /* deallocate thread-specific data */
tls_exit(); /* deallocate thread-local storage */
heldlock_exit(); /* deal with left-over held locks */
diff --git a/usr/src/lib/libc/port/threads/tmem.c b/usr/src/lib/libc/port/threads/tmem.c
new file mode 100644
index 0000000000..00203de593
--- /dev/null
+++ b/usr/src/lib/libc/port/threads/tmem.c
@@ -0,0 +1,85 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include "lint.h"
+#include "thr_uberdata.h"
+
+/*
+ * This file implements the private interface with libumem for per-thread
+ * caching umem (ptcumem). For the full details on how tcumem works and how
+ * these functions work, see section 8.4 of the big theory statement in
+ * lib/libumem/common/umem.c.
+ */
+static tmem_func_t tmem_cleanup = NULL;
+
+uintptr_t
+_tmem_get_base(void)
+{
+ return ((uintptr_t)&curthread->ul_tmem - (uintptr_t)curthread);
+}
+
+int
+_tmem_get_nentries(void)
+{
+ return (NTMEMBASE);
+}
+
+void
+_tmem_set_cleanup(tmem_func_t f)
+{
+ tmem_cleanup = f;
+}
+
+/*
+ * This is called by _thrp_exit() to clean up any per-thread allocations that
+ * are still hanging around and haven't been cleaned up.
+ */
+void
+tmem_exit(void)
+{
+ int ii;
+ void *buf, *next;
+ tumem_t *tp = &curthread->ul_tmem;
+
+
+ if (tp->tm_size == 0)
+ return;
+
+ /*
+	 * Since we have something stored here, we need to ensure we have
+	 * declared a cleanup handler. If we haven't, that's broken and our
+	 * single private consumer should be shot.
+ */
+ if (tmem_cleanup == NULL)
+ abort();
+ for (ii = 0; ii < NTMEMBASE; ii++) {
+ buf = tp->tm_roots[ii];
+ while (buf != NULL) {
+ next = *(void **)buf;
+ tmem_cleanup(buf, ii);
+ buf = next;
+ }
+ }
+}
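
The three _tmem_* functions above constitute the entire private interface that
libumem consumes. A minimal sketch of the attachment from the libumem side
(ptc_cleanup and cache_for_root are hypothetical stand-ins; umem_tmem_off is
the variable consumed by the generators in umem_genasm.c):

	extern uintptr_t _tmem_get_base(void);
	extern int _tmem_get_nentries(void);
	extern void _tmem_set_cleanup(void (*)(void *, int));

	static uintptr_t umem_tmem_off;

	/* return buf, which was cached under root index idx, to its cache */
	static void
	ptc_cleanup(void *buf, int idx)
	{
		umem_cache_free(cache_for_root(idx), buf);
	}

	static void
	ptc_attach(int ncaches)
	{
		/* offset from the thread pointer to the thread's tmem_t */
		umem_tmem_off = _tmem_get_base();

		/* never use more roots than libc provides (NTMEMBASE) */
		if (ncaches > _tmem_get_nentries())
			ncaches = _tmem_get_nentries();

		/* invoked by tmem_exit() on each buffer left at thread exit */
		_tmem_set_cleanup(ptc_cleanup);
	}
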
diff --git a/usr/src/lib/libc/sparc/Makefile.com b/usr/src/lib/libc/sparc/Makefile.com
index cc6bae0df4..25482d7324 100644
--- a/usr/src/lib/libc/sparc/Makefile.com
+++ b/usr/src/lib/libc/sparc/Makefile.com
@@ -20,6 +20,7 @@
#
#
# Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2012, Joyent, Inc. All rights reserved.
# Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
#
# Copyright 2011 Nexenta Systems, Inc. All rights reserved.
@@ -875,6 +876,7 @@ THREADSOBJS= \
assfail.o \
cancel.o \
door_calls.o \
+ tmem.o \
pthr_attr.o \
pthr_barrier.o \
pthr_cond.o \
diff --git a/usr/src/lib/libc/sparcv9/Makefile.com b/usr/src/lib/libc/sparcv9/Makefile.com
index 54b3258fe9..fe6844273e 100644
--- a/usr/src/lib/libc/sparcv9/Makefile.com
+++ b/usr/src/lib/libc/sparcv9/Makefile.com
@@ -20,6 +20,7 @@
#
#
# Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2012, Joyent, Inc. All rights reserved.
# Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
#
# Copyright 2011 Nexenta Systems, Inc. All rights reserved.
@@ -822,6 +823,7 @@ THREADSOBJS= \
assfail.o \
cancel.o \
door_calls.o \
+ tmem.o \
pthr_attr.o \
pthr_barrier.o \
pthr_cond.o \
diff --git a/usr/src/lib/libumem/Makefile.com b/usr/src/lib/libumem/Makefile.com
index 0e726c5646..61f7e9503d 100644
--- a/usr/src/lib/libumem/Makefile.com
+++ b/usr/src/lib/libumem/Makefile.com
@@ -22,6 +22,8 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
+# Copyright (c) 2012, Joyent, Inc. All rights reserved.
+#
#
# The build process for libumem is sightly different from that used by other
@@ -65,10 +67,12 @@ SRCS_standalone = $(OBJECTS_standalone:%.o=../common/%.c)
# Architecture-dependent files common to both versions of libumem
OBJECTS_common_isadep = \
- asm_subr.o
+ asm_subr.o \
+ umem_genasm.o
SRCS_common_isadep = \
- $(ISASRCDIR)/asm_subr.s
+ $(ISASRCDIR)/asm_subr.s \
+ $(ISASRCDIR)/umem_genasm.c
# Architecture-independent files common to both versions of libumem
OBJECTS_common_common = \
@@ -140,6 +144,7 @@ DTS_ERRNO=
STAND_RENAMED_FUNCS= \
atomic_add_64 \
atomic_add_32_nv \
+ atomic_swap_64 \
snprintf \
vsnprintf
diff --git a/usr/src/lib/libumem/amd64/umem_genasm.c b/usr/src/lib/libumem/amd64/umem_genasm.c
new file mode 100644
index 0000000000..00cc18ab67
--- /dev/null
+++ b/usr/src/lib/libumem/amd64/umem_genasm.c
@@ -0,0 +1,604 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Don't Panic! If you find the blocks of assembly that follow confusing and
+ * you're questioning why they exist, please go read section 8 of the umem.c big
+ * theory statement. Next familiarize yourself with the malloc and free
+ * implementations in libumem's malloc.c.
+ *
+ * What follows is the amd64 implementation of the thread caching automatic
+ * assembly generation. The amd64 calling conventions are documented in the
+ * 64-bit System V ABI. For our purposes what matters is that our first argument
+ * will come in rdi. Our functions have to preserve rbp, rbx, and r12->r15. We
+ * are free to do whatever we want with rax, rcx, rdx, rsi, rdi, and r8->r11.
+ *
+ * For both our implementation of malloc and free we only use the registers we
+ * don't have to preserve.
+ *
+ * Malloc register usage:
+ * o. rdi: Original size to malloc. This never changes and is preserved.
+ * o. rsi: Adjusted malloc size for malloc_data_tag(s).
+ * o. rcx: Pointer to the tmem_t in the ulwp_t.
+ * o. rdx: Pointer to the tmem_t array of roots
+ * o. r8: Size of the cache
+ * o. r9: Scratch register
+ *
+ * Free register usage:
+ * o. rdi: Original buffer to free. This never changes and is preserved.
+ * o. rax: The actual buffer, adjusted for the hidden malloc_data_t(s).
+ * o. rcx: Pointer to the tmem_t in the ulwp_t.
+ * o. rdx: Pointer to the tmem_t array of roots
+ * o. r8: Size of the cache
+ * o. r9: Scratch register
+ *
+ * Once we determine what cache we are using, we increment %rdx to the
+ * appropriate offset and set %r8 with the size of the cache. This means that
+ * when we break out to the normal buffer allocation point %rdx contains the
+ * head of the linked list and %r8 is the amount that we have to adjust the
+ * thread's cached amount by.
+ *
+ * Each block of assembly has pseudocode that describes its purpose.
+ */
+
+#include <atomic.h>
+#include <inttypes.h>
+#include <sys/types.h>
+#include <strings.h>
+#include <umem_impl.h>
+#include "umem_base.h"
+
+#include <stdio.h>
+
+const int umem_genasm_supported = 1;
+static uintptr_t umem_genasm_mptr = (uintptr_t)&_malloc;
+static size_t umem_genasm_msize = 576;
+static uintptr_t umem_genasm_fptr = (uintptr_t)&_free;
+static size_t umem_genasm_fsize = 576;
+static uintptr_t umem_genasm_omptr = (uintptr_t)umem_malloc;
+static uintptr_t umem_genasm_ofptr = (uintptr_t)umem_malloc_free;
+
+#define UMEM_GENASM_MAX64 (UINT32_MAX / sizeof (uintptr_t))
+#define PTC_JMPADDR(dest, src) (dest - (src + 4))
+#define PTC_ROOT_SIZE sizeof (uintptr_t)
+#define MULTINOP 0x0000441f0f
+
+/*
+ * void *ptcmalloc(size_t orig_size);
+ *
+ * size_t size = orig_size + 8;
+ * if (size > UMEM_SECOND_ALIGN)
+ * size += 8;
+ *
+ * if (size < orig_size)
+ * goto tomalloc; ! This is overflow
+ *
+ * if (size > cache_max)
+ * goto tomalloc
+ *
+ * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset;
+ * void **roots = t->tm_roots;
+ */
+#define PTC_MALINIT_JOUT 0x13
+#define PTC_MALINIT_MCS 0x1a
+#define PTC_MALINIT_JOV 0x20
+#define PTC_MALINIT_SOFF 0x30
+static const uint8_t malinit[] = {
+ 0x48, 0x8d, 0x77, 0x08, /* leaq 0x8(%rdi),%rsi */
+ 0x48, 0x83, 0xfe, 0x10, /* cmpq $0x10, %rsi */
+ 0x76, 0x04, /* jbe +0x4 */
+ 0x48, 0x8d, 0x77, 0x10, /* leaq 0x10(%rdi),%rsi */
+ 0x48, 0x39, 0xfe, /* cmpq %rdi,%rsi */
+ 0x0f, 0x82, 0x00, 0x00, 0x00, 0x00, /* jb +errout */
+ 0x48, 0x81, 0xfe,
+ 0x00, 0x00, 0x00, 0x00, /* cmpq sizeof ($CACHE), %rsi */
+ 0x0f, 0x87, 0x00, 0x00, 0x00, 0x00, /* ja +errout */
+ 0x64, 0x48, 0x8b, 0x0c, 0x25,
+ 0x00, 0x00, 0x00, 0x00, /* movq %fs:0x0,%rcx */
+ 0x48, 0x81, 0xc1,
+ 0x00, 0x00, 0x00, 0x00, /* addq $SOFF, %rcx */
+ 0x48, 0x8d, 0x51, 0x08, /* leaq 0x8(%rcx),%rdx */
+};
+
+/*
+ * void ptcfree(void *buf);
+ *
+ * if (buf == NULL)
+ * return;
+ *
+ * malloc_data_t *tag = buf;
+ * tag--;
+ * int size = tag->malloc_size;
+ * int tagval = UMEM_MALLOC_DECODE(tag->malloc_tag, size);
+ * if (tagval == MALLOC_SECOND_MAGIC) {
+ * tag--;
+ * } else if (tagval != MALLOC_MAGIC) {
+ * goto tofree;
+ * }
+ *
+ * if (size > cache_max)
+ * goto tofree;
+ *
+ * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset;
+ * void **roots = t->tm_roots;
+ */
+#define PTC_FRINI_JDONE 0x05
+#define PTC_FRINI_JFREE 0x25
+#define PTC_FRINI_MCS 0x30
+#define PTC_FRINI_JOV 0x36
+#define PTC_FRINI_SOFF 0x46
+static const uint8_t freeinit[] = {
+ 0x48, 0x85, 0xff, /* testq %rdi,%rdi */
+	0x0f, 0x84, 0x00, 0x00, 0x00, 0x00,	/* je $JDONE (done) */
+ 0x8b, 0x77, 0xf8, /* movl -0x8(%rdi),%esi */
+ 0x8b, 0x47, 0xfc, /* movl -0x4(%rdi),%eax */
+ 0x01, 0xf0, /* addl %esi,%eax */
+ 0x3d, 0x00, 0x70, 0xba, 0x16, /* cmpl $MALLOC_2_MAGIC, %eax */
+ 0x75, 0x06, /* jne +0x6 (checkover) */
+	0x48, 0x8d, 0x47, 0xf0,			/* leaq -0x10(%rdi),%rax */
+ 0xeb, 0x0f, /* jmp +0xf (freebuf) */
+ 0x3d, 0x00, 0xc0, 0x10, 0x3a, /* cmpl $MALLOC_MAGIC, %eax */
+	0x0f, 0x85, 0x00, 0x00, 0x00, 0x00,	/* jne +JFREE (goto tofree) */
+ 0x48, 0x8d, 0x47, 0xf8, /* leaq -0x8(%rdi),%rax */
+ 0x48, 0x81, 0xfe,
+ 0x00, 0x00, 0x00, 0x00, /* cmpq sizeof ($CACHE), %rsi */
+ 0x0f, 0x87, 0x00, 0x00, 0x00, 0x00, /* ja +errout */
+ 0x64, 0x48, 0x8b, 0x0c, 0x25,
+ 0x00, 0x00, 0x00, 0x00, /* movq %fs:0x0,%rcx */
+ 0x48, 0x81, 0xc1,
+ 0x00, 0x00, 0x00, 0x00, /* addq $SOFF, %rcx */
+ 0x48, 0x8d, 0x51, 0x08, /* leaq 0x8(%rcx),%rdx */
+};
+
+/*
+ * if (size <= $CACHE_SIZE) {
+ * csize = $CACHE_SIZE;
+ * } else ... ! goto next cache
+ */
+#define PTC_INICACHE_CMP 0x03
+#define PTC_INICACHE_SIZE 0x0c
+#define PTC_INICACHE_JMP 0x11
+static const uint8_t inicache[] = {
+ 0x48, 0x81, 0xfe,
+ 0x00, 0x00, 0x00, 0x00, /* cmpq sizeof ($CACHE), %rsi */
+ 0x77, 0x0c, /* ja +0xc (next cache) */
+ 0x49, 0xc7, 0xc0,
+ 0x00, 0x00, 0x00, 0x00, /* movq sizeof ($CACHE), %r8 */
+ 0xe9, 0x00, 0x00, 0x00, 0x00, /* jmp $JMP (allocbuf) */
+};
+
+/*
+ * if (size <= $CACHE_SIZE) {
+ * csize = $CACHE_SIZE;
+ * roots += $CACHE_NUM;
+ * } else ... ! goto next cache
+ */
+#define PTC_GENCACHE_CMP 0x03
+#define PTC_GENCACHE_SIZE 0x0c
+#define PTC_GENCACHE_NUM 0x13
+#define PTC_GENCACHE_JMP 0x18
+static const uint8_t gencache[] = {
+ 0x48, 0x81, 0xfe,
+ 0x00, 0x00, 0x00, 0x00, /* cmpq sizeof ($CACHE), %rsi */
+	0x77, 0x14,			/* ja +0x14 (next cache) */
+ 0x49, 0xc7, 0xc0,
+ 0x00, 0x00, 0x00, 0x00, /* movq sizeof ($CACHE), %r8 */
+ 0x48, 0x81, 0xc2,
+ 0x00, 0x00, 0x00, 0x00, /* addq $8*ii, %rdx */
+	0xe9, 0x00, 0x00, 0x00, 0x00	/* jmp +$JMP (allocbuf) */
+};
+
+/*
+ * else if (size <= $CACHE_SIZE) {
+ * csize = $CACHE_SIZE;
+ * roots += $CACHE_NUM;
+ * } else {
+ * goto tofunc; ! goto tomalloc if ptcmalloc.
+ * } ! goto tofree if ptcfree.
+ */
+#define PTC_FINCACHE_CMP 0x03
+#define PTC_FINCACHE_JMP 0x08
+#define PTC_FINCACHE_SIZE 0x0c
+#define PTC_FINCACHE_NUM 0x13
+static const uint8_t fincache[] = {
+ 0x48, 0x81, 0xfe,
+ 0x00, 0x00, 0x00, 0x00, /* cmpq sizeof ($CACHE), %rsi */
+ 0x77, 0x00, /* ja +JMP (to real malloc) */
+ 0x49, 0xc7, 0xc0,
+ 0x00, 0x00, 0x00, 0x00, /* movq sizeof ($CACHE), %r8 */
+ 0x48, 0x81, 0xc2,
+ 0x00, 0x00, 0x00, 0x00, /* addq $8*ii, %rdx */
+
+};
+
+/*
+ * if (*root == NULL)
+ * goto tomalloc;
+ *
+ * malloc_data_t *ret = *root;
+ * *root = *(void **)ret;
+ *	t->tm_size -= csize;
+ * ret->malloc_size = size;
+ *
+ * if (size > UMEM_SECOND_ALIGN) {
+ * ret->malloc_data = UMEM_MALLOC_ENCODE(MALLOC_SECOND_MAGIC, size);
+ * ret += 2;
+ * } else {
+ *	ret->malloc_data = UMEM_MALLOC_ENCODE(MALLOC_MAGIC, size);
+ * ret += 1;
+ * }
+ *
+ * return ((void *)ret);
+ * tomalloc:
+ * return (malloc(orig_size));
+ */
+#define PTC_MALFINI_ALLABEL 0x00
+#define PTC_MALFINI_JMLABEL 0x40
+#define PTC_MALFINI_JMADDR 0x41
+static const uint8_t malfini[] = {
+ 0x48, 0x8b, 0x02, /* movl (%rdx),%rax */
+ 0x48, 0x85, 0xc0, /* testq %rax,%rax */
+ 0x74, 0x38, /* je +0x38 (errout) */
+ 0x4c, 0x8b, 0x08, /* movq (%rax),%r9 */
+ 0x4c, 0x89, 0x0a, /* movq %r9,(%rdx) */
+ 0x4c, 0x29, 0x01, /* subq %rsi,(%rcx) */
+ 0x48, 0x83, 0xfe, 0x10, /* cmpq $0x10,%rsi */
+ 0x76, 0x15, /* jbe +0x15 */
+	0x41, 0xb9, 0x00, 0x70, 0xba, 0x16,	/* movl $MALLOC_2_MAGIC, %r9d */
+ 0x89, 0x70, 0x08, /* movl %r9d,0x8(%rax) */
+ 0x41, 0x29, 0xf1, /* subl %esi, %r9d */
+ 0x44, 0x89, 0x48, 0x0c, /* movl %r9d, 0xc(%rax) */
+ 0x48, 0x83, 0xc0, 0x10, /* addq $0x10, %rax */
+ 0xc3, /* ret */
+	0x41, 0xb9, 0x00, 0xc0, 0x10, 0x3a,	/* movl $MALLOC_MAGIC, %r9d */
+ 0x89, 0x30, /* movl %esi,(%rax) */
+ 0x41, 0x29, 0xf1, /* subl %esi,%r9d */
+ 0x44, 0x89, 0x48, 0x04, /* movl %r9d,0x4(%rax) */
+ 0x48, 0x83, 0xc0, 0x08, /* addq $0x8,%rax */
+ 0xc3, /* ret */
+ 0xe9, 0x00, 0x00, 0x00, 0x00 /* jmp $MALLOC */
+};
+
+/*
+ * if (t->tm_size + csize > umem_ptc_size)
+ * goto tofree;
+ *
+ * t->tm_size += csize
+ * *(void **)tag = *root;
+ * *root = tag;
+ * return;
+ * tofree:
+ * free(buf);
+ * return;
+ */
+#define PTC_FRFINI_RBUFLABEL 0x00
+#define PTC_FRFINI_CACHEMAX 0x09
+#define PTC_FRFINI_DONELABEL 0x1b
+#define PTC_FRFINI_JFLABEL 0x1c
+#define PTC_FRFINI_JFADDR 0x1d
+static const uint8_t freefini[] = {
+ 0x4c, 0x8b, 0x09, /* movq (%rcx),%r9 */
+ 0x4d, 0x01, 0xc1, /* addq %r8, %r9 */
+ 0x49, 0x81, 0xf9,
+ 0x00, 0x00, 0x00, 0x00, /* cmpl $THR_CACHE_MAX, %r9 */
+ 0x77, 0x0d, /* jae +0xd (torfree) */
+ 0x4c, 0x01, 0x01, /* addq %r8,(%rcx) */
+ 0x4c, 0x8b, 0x0a, /* movq (%rdx),%r9 */
+ 0x4c, 0x89, 0x08, /* movq %r9,(%rax) */
+ 0x48, 0x89, 0x02, /* movq %rax,(%rdx) */
+ 0xc3, /* ret */
+ 0xe9, 0x00, 0x00, 0x00, 0x00 /* jmp free */
+};
+
+/*
+ * Construct the initial part of malloc. off contains the offset from curthread
+ * to the root of the tmem structure. ep is the offset of the error label,
+ * which falls back to the original malloc. csize is the size of the largest
+ * umem_cache in ptcumem.
+ */
+static int
+genasm_malinit(uint8_t *bp, uint32_t off, uint32_t ep, uint32_t csize)
+{
+ uint32_t addr;
+
+ bcopy(malinit, bp, sizeof (malinit));
+ addr = PTC_JMPADDR(ep, PTC_MALINIT_JOUT);
+ bcopy(&addr, bp + PTC_MALINIT_JOUT, sizeof (addr));
+ bcopy(&csize, bp + PTC_MALINIT_MCS, sizeof (csize));
+ addr = PTC_JMPADDR(ep, PTC_MALINIT_JOV);
+ bcopy(&addr, bp + PTC_MALINIT_JOV, sizeof (addr));
+ bcopy(&off, bp + PTC_MALINIT_SOFF, sizeof (off));
+
+ return (sizeof (malinit));
+}
+
+static int
+genasm_frinit(uint8_t *bp, uint32_t off, uint32_t dp, uint32_t ep, uint32_t mcs)
+{
+ uint32_t addr;
+
+ bcopy(freeinit, bp, sizeof (freeinit));
+ addr = PTC_JMPADDR(dp, PTC_FRINI_JDONE);
+ bcopy(&addr, bp + PTC_FRINI_JDONE, sizeof (addr));
+ addr = PTC_JMPADDR(ep, PTC_FRINI_JFREE);
+ bcopy(&addr, bp + PTC_FRINI_JFREE, sizeof (addr));
+ bcopy(&mcs, bp + PTC_FRINI_MCS, sizeof (mcs));
+ addr = PTC_JMPADDR(ep, PTC_FRINI_JOV);
+ bcopy(&addr, bp + PTC_FRINI_JOV, sizeof (addr));
+ bcopy(&off, bp + PTC_FRINI_SOFF, sizeof (off));
+ return (sizeof (freeinit));
+}
+
+
+/*
+ * Create the initial cache entry of the specified size. The value of ap is the
+ * address of the label at which we try to allocate a buffer, expressed as an
+ * offset from the current base.
+ */
+static int
+genasm_firstcache(uint8_t *bp, uint32_t csize, uint32_t ap)
+{
+ uint32_t addr;
+
+ bcopy(inicache, bp, sizeof (inicache));
+ bcopy(&csize, bp + PTC_INICACHE_CMP, sizeof (csize));
+ bcopy(&csize, bp + PTC_INICACHE_SIZE, sizeof (csize));
+ addr = PTC_JMPADDR(ap, PTC_INICACHE_JMP);
+ ASSERT(addr != 0);
+ bcopy(&addr, bp + PTC_INICACHE_JMP, sizeof (addr));
+
+ return (sizeof (inicache));
+}
+
+static int
+genasm_gencache(uint8_t *bp, int num, uint32_t csize, uint32_t ap)
+{
+ uint32_t addr;
+ uint32_t coff;
+
+ ASSERT(UINT32_MAX / PTC_ROOT_SIZE > num);
+ ASSERT(num != 0);
+ bcopy(gencache, bp, sizeof (gencache));
+ bcopy(&csize, bp + PTC_GENCACHE_CMP, sizeof (csize));
+ bcopy(&csize, bp + PTC_GENCACHE_SIZE, sizeof (csize));
+ coff = num * PTC_ROOT_SIZE;
+ bcopy(&coff, bp + PTC_GENCACHE_NUM, sizeof (coff));
+ addr = PTC_JMPADDR(ap, PTC_GENCACHE_JMP);
+ bcopy(&addr, bp + PTC_GENCACHE_JMP, sizeof (addr));
+
+ return (sizeof (gencache));
+}
+
+static int
+genasm_lastcache(uint8_t *bp, int num, uint32_t csize, uint32_t ep)
+{
+ uint8_t eap;
+ uint32_t coff;
+
+ ASSERT(ep <= 0xff && ep > 7);
+ ASSERT(UINT32_MAX / PTC_ROOT_SIZE > num);
+ bcopy(fincache, bp, sizeof (fincache));
+ bcopy(&csize, bp + PTC_FINCACHE_CMP, sizeof (csize));
+ bcopy(&csize, bp + PTC_FINCACHE_SIZE, sizeof (csize));
+ coff = num * PTC_ROOT_SIZE;
+ bcopy(&coff, bp + PTC_FINCACHE_NUM, sizeof (coff));
+ eap = ep - PTC_FINCACHE_JMP - 1;
+ bcopy(&eap, bp + PTC_FINCACHE_JMP, sizeof (eap));
+
+ return (sizeof (fincache));
+}
+
+static int
+genasm_malfini(uint8_t *bp, uintptr_t mptr)
+{
+ uint32_t addr;
+
+ bcopy(malfini, bp, sizeof (malfini));
+ addr = PTC_JMPADDR(mptr, ((uintptr_t)bp + PTC_MALFINI_JMADDR));
+ bcopy(&addr, bp + PTC_MALFINI_JMADDR, sizeof (addr));
+
+ return (sizeof (malfini));
+}
+
+static int
+genasm_frfini(uint8_t *bp, uint32_t maxthr, uintptr_t fptr)
+{
+ uint32_t addr;
+
+ bcopy(freefini, bp, sizeof (freefini));
+ bcopy(&maxthr, bp + PTC_FRFINI_CACHEMAX, sizeof (maxthr));
+ addr = PTC_JMPADDR(fptr, ((uintptr_t)bp + PTC_FRFINI_JFADDR));
+ bcopy(&addr, bp + PTC_FRFINI_JFADDR, sizeof (addr));
+
+ return (sizeof (freefini));
+}
+
+/*
+ * The malloc inline assembly is constructed as follows:
+ *
+ * o Malloc prologue assembly
+ * o Generic first-cache check
+ * o n Generic cache checks (where n = _tmem_get_nentries() - 2)
+ * o Generic last-cache check
+ * o Malloc epilogue assembly
+ *
+ * Generally there are at least three caches. When there is only one cache we
+ * only use the generic last-cache. In the case where there are two caches, we
+ * just leave out the middle ones.
+ */
+static int
+genasm_malloc(void *base, size_t len, int nents, int *umem_alloc_sizes)
+{
+ int ii, off;
+ uint8_t *bp;
+ size_t total;
+ uint32_t allocoff, erroff;
+
+ total = sizeof (malinit) + sizeof (malfini) + sizeof (fincache);
+
+ if (nents >= 2)
+ total += sizeof (inicache) + sizeof (gencache) * (nents - 2);
+
+ if (total > len)
+ return (1);
+
+ erroff = total - sizeof (malfini) + PTC_MALFINI_JMLABEL;
+ allocoff = total - sizeof (malfini) + PTC_MALFINI_ALLABEL;
+
+ bp = base;
+
+ off = genasm_malinit(bp, umem_tmem_off, erroff,
+ umem_alloc_sizes[nents-1]);
+ bp += off;
+ allocoff -= off;
+ erroff -= off;
+
+ if (nents > 1) {
+ off = genasm_firstcache(bp, umem_alloc_sizes[0], allocoff);
+ bp += off;
+ allocoff -= off;
+ erroff -= off;
+ }
+
+ for (ii = 1; ii < nents - 1; ii++) {
+ off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], allocoff);
+ bp += off;
+ allocoff -= off;
+ erroff -= off;
+ }
+
+ bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1],
+ erroff);
+ bp += genasm_malfini(bp, umem_genasm_omptr);
+ ASSERT(((uintptr_t)bp - total) == (uintptr_t)base);
+
+ return (0);
+}
+
+static int
+genasm_free(void *base, size_t len, int nents, int *umem_alloc_sizes)
+{
+ uint8_t *bp;
+ int ii, off;
+ size_t total;
+ uint32_t rbufoff, retoff, erroff;
+
+ /* Assume that nents has already been audited for us */
+ total = sizeof (freeinit) + sizeof (freefini) + sizeof (fincache);
+ if (nents >= 2)
+ total += sizeof (inicache) + sizeof (gencache) * (nents - 2);
+
+ if (total > len)
+ return (1);
+
+ erroff = total - (sizeof (freefini) - PTC_FRFINI_JFLABEL);
+ rbufoff = total - (sizeof (freefini) - PTC_FRFINI_RBUFLABEL);
+ retoff = total - (sizeof (freefini) - PTC_FRFINI_DONELABEL);
+
+ bp = base;
+
+ off = genasm_frinit(bp, umem_tmem_off, retoff, erroff,
+ umem_alloc_sizes[nents - 1]);
+ bp += off;
+ erroff -= off;
+ rbufoff -= off;
+
+ if (nents > 1) {
+ off = genasm_firstcache(bp, umem_alloc_sizes[0], rbufoff);
+ bp += off;
+ erroff -= off;
+ rbufoff -= off;
+ }
+
+ for (ii = 1; ii < nents - 1; ii++) {
+ off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], rbufoff);
+ bp += off;
+ rbufoff -= off;
+ erroff -= off;
+ }
+
+ bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1],
+ erroff);
+ bp += genasm_frfini(bp, umem_ptc_size, umem_genasm_ofptr);
+ ASSERT(((uintptr_t)bp - total) == (uintptr_t)base);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+int
+umem_genasm(int *cp, umem_cache_t **caches, int nc)
+{
+ int nents, i;
+ uint8_t *mptr;
+ uint8_t *fptr;
+ uint64_t v, *vptr;
+
+ mptr = (void *)((uintptr_t)umem_genasm_mptr + 5);
+ fptr = (void *)((uintptr_t)umem_genasm_fptr + 5);
+ if (umem_genasm_mptr == 0 || umem_genasm_msize == 0 ||
+ umem_genasm_fptr == 0 || umem_genasm_fsize == 0)
+ return (1);
+
+ /*
+ * The total number of caches that we can service is the minimum of:
+ * o the amount supported by libc
+ * o the total number of umem caches
+	 *  o the 32-bit immediate used to index the roots, so it's
+	 *    UINT32_MAX / sizeof (uintptr_t)
+	 *  For 64-bit, this is UINT32_MAX >> 3, a lot.
+ */
+ nents = _tmem_get_nentries();
+
+ if (UMEM_GENASM_MAX64 < nents)
+ nents = UMEM_GENASM_MAX64;
+
+ if (nc < nents)
+ nents = nc;
+
+ /* Based on our constraints, this is not an error */
+ if (nents == 0 || umem_ptc_size == 0)
+ return (0);
+
+ /* Take into account the jump */
+ if (genasm_malloc(mptr, umem_genasm_msize, nents, cp) != 0)
+ return (1);
+
+ if (genasm_free(fptr, umem_genasm_fsize, nents, cp) != 0)
+ return (1);
+
+
+	/* nop out the jump with a multibyte nop */
+ vptr = (void *)umem_genasm_mptr;
+ v = MULTINOP;
+ v |= *vptr & (0xffffffULL << 40);
+ (void) atomic_swap_64(vptr, v);
+ vptr = (void *)umem_genasm_fptr;
+ v = MULTINOP;
+ v |= *vptr & (0xffffffULL << 40);
+ (void) atomic_swap_64(vptr, v);
+
+ for (i = 0; i < nents; i++)
+ caches[i]->cache_flags |= UMF_PTC;
+
+ return (0);
+}
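
For context, a reconstruction of the entry-point layout this generator assumes
(the label comes from asm_subr.s; the 576-byte budget is umem_genasm_msize
above, and the sketch uses the Sun assembler's "/" comment character):

	_malloc:
		jmp	umem_malloc	/ 5 bytes, patched once generation succeeds
		.skip	576		/ space filled in by genasm_malloc()

Once genasm_malloc() and genasm_free() both succeed, umem_genasm() atomically
overwrites each leading jmp with MULTINOP (the bytes 0f 1f 44 00 00, the
canonical five-byte nop), so the entry point falls through into the freshly
generated ptcmalloc or ptcfree body.
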
diff --git a/usr/src/lib/libumem/common/envvar.c b/usr/src/lib/libumem/common/envvar.c
index fc3d490a01..0c4d872814 100644
--- a/usr/src/lib/libumem/common/envvar.c
+++ b/usr/src/lib/libumem/common/envvar.c
@@ -22,7 +22,10 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2012 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
*/
#include <ctype.h>
@@ -151,7 +154,10 @@ static umem_env_item_t umem_options_items[] = {
NULL, 0, NULL, &vmem_sbrk_pagesize
},
#endif
-
+ { "perthread_cache", "Evolving", ITEM_SIZE,
+ "Size (in bytes) of per-thread allocation cache",
+ NULL, 0, NULL, &umem_ptc_size
+ },
{ NULL, "-- end of UMEM_OPTIONS --", ITEM_INVALID }
};
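
Like the other UMEM_OPTIONS items, the new option is read from the environment
during initialization. An illustrative invocation that raises the per-thread
cap from its 1 MB default to 16 MB:

	$ UMEM_OPTIONS=perthread_cache=16m LD_PRELOAD=libumem.so.1 ./app
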
diff --git a/usr/src/lib/libumem/common/linktest_stand.c b/usr/src/lib/libumem/common/linktest_stand.c
index 8ae9fdbec8..dd8333828b 100644
--- a/usr/src/lib/libumem/common/linktest_stand.c
+++ b/usr/src/lib/libumem/common/linktest_stand.c
@@ -24,8 +24,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This file is used to verify that the standalone's external dependencies
* haven't changed in a way that'll break things that use it.
@@ -34,6 +32,7 @@
void __umem_assert_failed(void) {}
void atomic_add_64(void) {}
void atomic_add_32_nv(void) {}
+void atomic_swap_64(void) {}
void dladdr1(void) {}
void bcopy(void) {}
void bzero(void) {}
diff --git a/usr/src/lib/libumem/common/malloc.c b/usr/src/lib/libumem/common/malloc.c
index 906f369d29..3d19e5b320 100644
--- a/usr/src/lib/libumem/common/malloc.c
+++ b/usr/src/lib/libumem/common/malloc.c
@@ -24,8 +24,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <unistd.h>
#include <errno.h>
#include <string.h>
@@ -50,8 +48,17 @@ typedef struct malloc_data {
uint32_t malloc_stat; /* = UMEM_MALLOC_ENCODE(state, malloc_size) */
} malloc_data_t;
+/*
+ * Because we do not support ptcumem on non-x86 today, we have to create these
+ * weak aliases.
+ */
+#ifndef _x86
+#pragma weak malloc = umem_malloc
+#pragma weak free = umem_malloc_free
+#endif /* !_x86 */
+
void *
-malloc(size_t size_arg)
+umem_malloc(size_t size_arg)
{
#ifdef _LP64
uint32_t high_size = 0;
@@ -369,7 +376,7 @@ process_memalign:
}
void
-free(void *buf)
+umem_malloc_free(void *buf)
{
if (buf == NULL)
return;
diff --git a/usr/src/lib/libumem/common/mapfile-vers b/usr/src/lib/libumem/common/mapfile-vers
index 102bd989f7..888a1570f2 100644
--- a/usr/src/lib/libumem/common/mapfile-vers
+++ b/usr/src/lib/libumem/common/mapfile-vers
@@ -20,6 +20,7 @@
#
#
# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2012, Joyent, Inc. All rights reserved.
#
#
@@ -38,6 +39,17 @@
$mapfile_version 2
+$if _x86
+LOAD_SEGMENT umem {
+ FLAGS = READ WRITE EXECUTE;
+ ASSIGN_SECTION {
+ IS_NAME = .text;
+ FILE_BASENAME = asm_subr.o
+ };
+};
+$endif
+
+
SYMBOL_VERSION SUNW_1.1 {
global:
calloc { FLAGS = NODIRECT };
diff --git a/usr/src/lib/libumem/common/stub_stand.c b/usr/src/lib/libumem/common/stub_stand.c
index 54635558c3..2c82364ef1 100644
--- a/usr/src/lib/libumem/common/stub_stand.c
+++ b/usr/src/lib/libumem/common/stub_stand.c
@@ -23,6 +23,9 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
/*
* Stubs for the standalone to reduce the dependence on external libraries
@@ -125,3 +128,21 @@ issetugid(void)
{
return (1);
}
+
+int
+_tmem_get_nentries(void)
+{
+ return (0);
+}
+
+uintptr_t
+_tmem_get_base(void)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+_tmem_set_cleanup(void (*f)(void *, int))
+{
+}
diff --git a/usr/src/lib/libumem/common/umem.c b/usr/src/lib/libumem/common/umem.c
index 9ee030dd47..00028e5f80 100644
--- a/usr/src/lib/libumem/common/umem.c
+++ b/usr/src/lib/libumem/common/umem.c
@@ -21,11 +21,14 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Copyright 2012 Joyent, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
* based on usr/src/uts/common/os/kmem.c r1.64 from 2001/12/18
*
* The slab allocator, as described in the following two papers:
@@ -43,7 +46,7 @@
*
* 1. Overview
* -----------
- * umem is very close to kmem in implementation. There are four major
+ * umem is very close to kmem in implementation. There are seven major
* areas of divergence:
*
* * Initialization
@@ -56,6 +59,10 @@
*
* * lock ordering
*
+ * * changing UMEM_MAXBUF
+ *
+ * * Per-thread caching for malloc/free
+ *
* 2. Initialization
* -----------------
* kmem is initialized early on in boot, and knows that no one will call
@@ -365,6 +372,232 @@
*
* The second place to update, which is not required, is the umem_alloc_sizes.
* These determine the default cache sizes that we're going to support.
+ *
+ * 8. Per-thread caching for malloc/free
+ * -------------------------------------
+ *
+ * "Time is an illusion. Lunchtime doubly so." -- Douglas Adams
+ *
+ * Time may be an illusion, but CPU cycles aren't. While libumem is designed
+ * to be a highly scalable allocator, that scalability comes with a fixed cycle
+ * penalty even in the absence of contention: libumem must acquire (and release)
+ * a per-CPU lock for each allocation. When contention is low and malloc(3C)
+ * frequency is high, this overhead can dominate execution time. To alleviate
+ * this, we allow for per-thread caching, a lock-free means of caching recent
+ * deallocations on a per-thread basis for use in satisfying subsequent calls.
+ *
+ * In addition to improving performance, we also want to:
+ * * Minimize fragmentation
+ * * Not add additional memory overhead (no larger malloc tags)
+ *
+ * In the ulwp_t of each thread there is a private data structure called a
+ * tmem_t that looks like:
+ *
+ * typedef struct {
+ * size_t tm_size;
+ * void *tm_roots[NTMEMBASE]; (Currently 16)
+ * } tmem_t;
+ *
+ * Each of the roots is treated as the head of a linked list. Each entry in the
+ * list can be thought of as a void ** which points to the next entry, until one
+ * of them points to NULL. If the head points to NULL, the list is empty.
+ *
+ * Each head corresponds to a umem_cache. Currently there is a linear mapping
+ * where the first root corresponds to the first cache, second root to the
+ * second cache, etc. This works because every allocation that malloc makes to
+ * umem_alloc that can be satisfied by a umem_cache will actually return a
+ * number of bytes equal to the size of that cache. Because of this property and
+ * a one-to-one mapping between caches and roots we can guarantee that every
+ * entry in a given root's list will be able to satisfy the same requests as the
+ * corresponding cache.
+ *
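+ * In C terms, taking a buffer from a root and returning one to it are just
+ * head-of-list pointer manipulations (sketch):
+ *
+ *	buf = tm_roots[i];		! pop: the pointer chained through
+ *	tm_roots[i] = *(void **)buf;	! the buffer becomes the new head
+ *
+ *	*(void **)buf = tm_roots[i];	! push: chain the old head through
+ *	tm_roots[i] = buf;		! the buffer being cached
+ *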
+ * The choice of sixteen roots is based on where we believe we get the biggest
+ * bang for our buck. The per-thread caches will cache up to 256 byte and 448
+ * byte allocations on ILP32 and LP64, respectively. Generally, applications
+ * plan their larger allocations more carefully than their smaller ones.
+ * Therefore sixteen roots is a reasonable compromise between the amount of
+ * additional overhead per thread and the likelihood of a program benefiting
+ * from it.
+ *
+ * The maximum amount of memory that can be cached in each thread is determined
+ * by the perthread_cache UMEM_OPTION. It corresponds to the umem_ptc_size
+ * value. The default value for this is currently 1 MB. Once umem_init() has
+ * finished, this value cannot be tuned without directly modifying the
+ * instruction text. If, upon calling free(3C), the amount cached would exceed
+ * this maximum, we return the buffer to the umem_cache instead of holding
+ * onto it in the thread.
+ *
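+ * For example (the program name is a stand-in), a 64K cap on each thread's
+ * cache can be requested at startup via the environment (see section 8.5 and
+ * umem_alloc(3MALLOC)):
+ *
+ *	$ UMEM_OPTIONS=perthread_cache=64k ./myprog
+ *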
+ * When a thread calls malloc(3C) it first determines which umem_cache it
+ * would be serviced by. If the allocation is not covered by ptcumem, it goes
+ * to the normal malloc instead. Next, it checks whether the appropriate
+ * tm_root's list is empty. If it is empty, we allocate the memory from
+ * umem_alloc. If it is not empty, we remove the head of the list, set the
+ * appropriate malloc tags, and return that buffer.
+ *
+ * When a thread calls free(3C) it first looks at the malloc tag; if the tag
+ * is invalid or the allocation exceeds the largest cache in ptcumem, it
+ * sends the buffer off to the original free() to handle and clean up
+ * appropriately. Next, it
+ * checks if the allocation size is covered by one of the per-thread roots and
+ * if it isn't, it passes it off to the original free() to be released. Finally,
+ * before it inserts this buffer as the head, it checks if adding this buffer
+ * would put the thread over its maximum cache size. If it would, it frees the
+ * buffer back to the umem_cache. Otherwise it increments the thread's total
+ * cached amount and makes the buffer the new head of the appropriate tm_root.
+ *
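+ * Pulling the above together, the fast paths behave roughly as follows. This
+ * is only a sketch: cache_index() and cache_max are stand-ins for the
+ * unrolled per-cache checks, and the real versions are generated assembly
+ * (see section 8.1):
+ *
+ *	void *
+ *	ptcmalloc(size_t orig_size)
+ *	{
+ *		size_t size = orig_size + 8;	! room for the malloc tags
+ *		if (size < orig_size || size > cache_max)
+ *			goto tomalloc;		! overflow or not covered
+ *		tmem_t *t = (tmem_t *)((uintptr_t)curthread + umem_tmem_off);
+ *		void **root = &t->tm_roots[cache_index(size)];
+ *		void *buf = *root;
+ *		if (buf == NULL)
+ *			goto tomalloc;
+ *		*root = *(void **)buf;		! unlink the head
+ *		t->tm_size -= csize;		! csize: this cache's size
+ *		set the malloc tags and return (buf + 8);
+ *	tomalloc:
+ *		return (malloc(orig_size));
+ *	}
+ *
+ * ptcfree() mirrors this: validate the tag, find the root, and either push
+ * the buffer (t->tm_size += csize) or, if the new total would exceed
+ * umem_ptc_size, hand the buffer to the original free().
+ *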
+ * When a thread exits, all of the buffers that it has in its per-thread cache
+ * will be passed to umem_free() and returned to the appropriate umem_cache.
+ *
+ * 8.1 Handling addition and removal of umem_caches
+ * ------------------------------------------------
+ *
+ * The set of umem_caches that are used to back calls to umem_alloc() and
+ * ultimately malloc() are determined at program execution time. The default set
+ * of caches is defined below in umem_alloc_sizes[]. Various umem_options exist
+ * that modify the set of caches: size_add, size_clear, and size_remove. Because
+ * the set of caches can only be determined once umem_init() has been called and
+ * we have the additional goals of minimizing additional fragmentation and
+ * metadata space overhead in the malloc tags, this forces our hand to go down a
+ * slightly different path: the one trodden by fasttrap and trapstat.
+ *
+ * During umem_init we're going to dynamically construct new versions of
+ * malloc(3C) and free(3C) that utilize the known cache sizes and then ensure
+ * that ptcmalloc and ptcfree replace malloc and free as entries in the PLT. If
+ * ptcmalloc and ptcfree cannot handle a request, they simply jump to the
+ * original libumem implementations.
+ *
+ * After creating all of the umem_caches, but before making them visible,
+ * umem_cache_init checks that umem_genasm_supported is non-zero. This value is
+ * set by each architecture in $ARCH/umem_genasm.c to indicate whether or not
+ * they support this. If the value is zero, then this process is skipped.
+ * Similarly, if the cache size has been tuned to zero by UMEM_OPTIONS, then
+ * this is also skipped.
+ *
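+ * In code form, this gate amounts to (a condensed copy of the check that
+ * appears later in this file, in umem_cache_init()):
+ *
+ *	if (umem_genasm_supported && !(umem_flags & UMF_DEBUG) &&
+ *	    !(umem_flags & UMF_NOMAGAZINE) && umem_ptc_size > 0) {
+ *		umem_ptc_enabled = umem_genasm(umem_alloc_sizes,
+ *		    umem_alloc_caches, i) == 0 ? 1 : 0;
+ *	}
+ *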
+ * In umem_genasm.c, each architecture implements a single
+ * function called umem_genasm() that is responsible for generating the
+ * appropriate versions of ptcmalloc() and ptcfree(), placing them in the
+ * appropriate memory location, and finally doing the switch from malloc() and
+ * free() to ptcmalloc() and ptcfree(). Once the change has been made, there is
+ * no way to switch back, short of restarting the program or modifying program
+ * text with mdb.
+ *
+ * 8.2 Modifying the Procedure Linkage Table (PLT)
+ * -----------------------------------------------
+ *
+ * The last piece of this puzzle is how we actually jam ptcmalloc() into the
+ * PLT. To handle this, we have defined two functions, _malloc and _free, and
+ * used a special mapfile directive to place them into a readable,
+ * writeable, and executable segment. Next we use a standard #pragma weak for
+ * malloc and free and direct them to those symbols. By default, those symbols
+ * have text defined as nops reserved for our generated functions and, when
+ * they're invoked, jump to the default malloc and free functions.
+ *
+ * When umem_genasm() is called, it goes through and generates new malloc() and
+ * free() functions in the text provided by _malloc and _free just after the
+ * jump. Once both have been successfully generated, umem_genasm() nops over the
+ * original jump so that we now call into the genasm versions of these
+ * functions.
+ *
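+ * Schematically, the trampolines are laid out as follows (see
+ * i386/asm_subr.s; the amd64 variant reserves additional nop space):
+ *
+ *	ENTRY(_malloc)
+ *	jmp	umem_malloc	! five bytes, nop'd over by umem_genasm()
+ *	NOP256			! nop padding that ptcmalloc is generated into
+ *	NOP256
+ *	SET_SIZE(_malloc)
+ *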
+ * 8.3 umem_genasm()
+ * -----------------
+ *
+ * umem_genasm() is currently implemented for i386 and amd64. This section
+ * describes the theory behind the construction. For specific byte code to
+ * assembly instructions and niceish C and asm versions of ptcmalloc and
+ * ptcfree, see the individual umem_genasm.c files. The layout consists of the
+ * following sections:
+ *
+ * o. function-specific prologue
+ * o. function-generic cache-selecting elements
+ * o. function-specific epilogue
+ *
+ * There are three different generic cache elements that exist:
+ *
+ * o. the last or only cache
+ * o. the intermediary caches if more than two
+ * o. the first one if more than one cache
+ *
+ * The malloc and free prologues and epilogues mimic the necessary portions of
+ * libumem's malloc and free. This includes things like checking for size
+ * overflow and setting and verifying the malloc tags.
+ *
+ * It is an important constraint that these functions do not make use of the
+ * call instruction. The only jmp outside of the individual functions is to the
+ * original libumem malloc and free respectively. Because doing things like
+ * setting errno or raising an internal umem error on improper malloc tags would
+ * require using calls into the PLT, whenever we encounter one of those cases we
+ * just jump to the original malloc and free functions reusing the same stack
+ * frame.
+ *
+ * Each of the above sections, the three caches, and the malloc and free
+ * prologue and epilogue are implemented as blocks of machine code with the
+ * corresponding assembly in comments. There are known offsets into each block
+ * that correspond to locations of data and addresses that we only know at run
+ * time. These blocks are copied as necessary and the blanks filled in
+ * appropriately.
+ *
+ * As mentioned in section 8.2, the trampoline code uses specifically named
+ * variables to communicate the buffers and sizes to use. These variables are:
+ *
+ * o. umem_genasm_mptr: The buffer for ptcmalloc
+ * o. umem_genasm_msize: The size in bytes of the above buffer
+ * o. umem_genasm_fptr: The buffer for ptcfree
+ * o. umem_genasm_fsize: The size in bytes of the above buffer
+ *
+ * Finally, to enable the generated assembly we need to remove the previous jump
+ * to the actual malloc that exists at the start of these buffers. On x86, this
+ * is a five byte region. We could zero out the jump offset to be a jmp +0, but
+ * using nops can be faster; we specifically use a single five byte nop on
+ * x86. When porting ptcumem to other architectures, the various
+ * opcode changes and options should be analyzed.
+ *
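+ * For reference, the five byte nop used on x86 is the multi-byte nop
+ * encoding:
+ *
+ *	0f 1f 44 00 00		! nopl 0x0(%eax,%eax,1)
+ *
+ * This is what the MULTINOP constant in the umem_genasm.c files encodes,
+ * stored little-endian in the low five bytes of a 64-bit word.
+ *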
+ * 8.4 Interface with libc.so
+ * --------------------------
+ *
+ * The tmem_t structure, as described at the beginning of section 8, is part of
+ * private interface with libc. There are three functions that exist to cover
+ * this. They are not documented in man pages or header files. They are in the
+ * SUNWprivate part of libc's mapfile.
+ *
+ * o. _tmem_get_base(void)
+ *
+ * Returns the offset from the ulwp_t (curthread) to the tmem_t structure.
+ * This is a constant for all threads and is effectively a way to do
+ * ::offsetof ulwp_t ul_tmem without having to know the specifics of the
+ * structure outside of libc.
+ *
+ * o. _tmem_get_nentries(void)
+ *
+ * Returns the number of roots that exist in the tmem_t. This is one part
+ * of the cap on the number of umem_caches that we can back with tmem.
+ *
+ * o. _tmem_set_cleanup(void (*)(void *, int))
+ *
+ * This sets a cleanup handler that gets called back when a thread exits.
+ * There is one call per buffer; the void * is a pointer to the buffer on
+ * the list, and the int is the index into the roots array for this buffer.
+ *
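+ * Taken together, a consumer can locate a thread's cache with a sketch along
+ * these lines (curthread denotes the current thread's ulwp_t pointer):
+ *
+ *	tmem_t *t = (tmem_t *)((uintptr_t)curthread + _tmem_get_base());
+ *	void **roots = t->tm_roots;	! _tmem_get_nentries() list heads
+ *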
+ * 8.5 Tuning and disabling per-thread caching
+ * -------------------------------------------
+ *
+ * There is only one tunable for per-thread caching: the amount of memory each
+ * thread should be able to cache. This is specified via the perthread_cache
+ * UMEM_OPTION option. No attempt is made to sanity check the specified
+ * value; the limit is simply the maximum value of a size_t.
+ *
+ * If the perthread_cache UMEM_OPTION is set to zero, nomagazines was requested,
+ * or UMEM_DEBUG has been turned on, then we will never call into umem_genasm;
+ * however, the trampoline text and jump will still be in place.
+ *
+ * 8.6 Observing efficacy of per-thread caching
+ * --------------------------------------------
+ *
+ * To understand the efficacy of per-thread caching, use the ::umastat dcmd
+ * to see the percentage of capacity consumed on a per-thread basis, the
+ * degree to which each umem cache contributes to per-thread cache consumption,
+ * and the number of buffers in per-thread caches on a per-umem cache basis.
+ * If more detail is required, the specific buffers in a per-thread cache can
+ * be iterated over with the umem_ptc_* walkers. (These walkers allow an
+ * optional ulwp_t to be specified to iterate only over a particular thread's
+ * cache.)
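+ *
+ * For example, from mdb(1) attached to a live process:
+ *
+ *	> ::umastat
+ *	> ::walk umem_ptc_64	! per-cache walker; see ::walkers for the
+ *				! exact names available in a given process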
*/
#include <umem_impl.h>
@@ -473,8 +706,10 @@ size_t umem_lite_minsize = 0; /* minimum buffer size for UMF_LITE */
size_t umem_lite_maxalign = 1024; /* maximum buffer alignment for UMF_LITE */
size_t umem_maxverify; /* maximum bytes to inspect in debug routines */
size_t umem_minfirewall; /* hardware-enforced redzone threshold */
+size_t umem_ptc_size = 1048576; /* size of per-thread cache (in bytes) */
uint_t umem_flags = 0;
+uintptr_t umem_tmem_off;
mutex_t umem_init_lock; /* locks initialization */
cond_t umem_init_cv; /* initialization CV */
@@ -482,6 +717,8 @@ thread_t umem_init_thr; /* thread initializing */
int umem_init_env_ready; /* environ pre-initted */
int umem_ready = UMEM_READY_STARTUP;
+int umem_ptc_enabled; /* per-thread caching enabled */
+
static umem_nofail_callback_t *nofail_callback;
static mutex_t umem_nofail_exit_lock;
static thread_t umem_nofail_exit_thr;
@@ -2838,6 +3075,24 @@ umem_alloc_sizes_remove(size_t size)
umem_alloc_sizes[i] = 0;
}
+/*
+ * We've been called back from libc to indicate that a thread is terminating
+ * and that it needs to release the per-thread memory that it has. We are told
+ * which entry in the thread's tmem array the allocation came from. Currently
+ * these entries map to the first n umem_caches, which makes this a pretty
+ * simple indexing job.
+ */
+static void
+umem_cache_tmem_cleanup(void *buf, int entry)
+{
+ size_t size;
+ umem_cache_t *cp;
+
+ size = umem_alloc_sizes[entry];
+ cp = umem_alloc_table[(size - 1) >> UMEM_ALIGN_SHIFT];
+ _umem_cache_free(cp, buf);
+}
+
static int
umem_cache_init(void)
{
@@ -2953,6 +3208,16 @@ umem_cache_init(void)
umem_alloc_caches[i] = cp;
}
+ umem_tmem_off = _tmem_get_base();
+ _tmem_set_cleanup(umem_cache_tmem_cleanup);
+
+ if (umem_genasm_supported && !(umem_flags & UMF_DEBUG) &&
+ !(umem_flags & UMF_NOMAGAZINE) &&
+ umem_ptc_size > 0) {
+ umem_ptc_enabled = umem_genasm(umem_alloc_sizes,
+ umem_alloc_caches, i) == 0 ? 1 : 0;
+ }
+
/*
* Initialization cannot fail at this point. Make the caches
* visible to umem_alloc() and friends.
diff --git a/usr/src/lib/libumem/common/umem_base.h b/usr/src/lib/libumem/common/umem_base.h
index e78bebfb58..c845331fbc 100644
--- a/usr/src/lib/libumem/common/umem_base.h
+++ b/usr/src/lib/libumem/common/umem_base.h
@@ -22,12 +22,13 @@
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
#ifndef _UMEM_BASE_H
#define _UMEM_BASE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <umem_impl.h>
#ifdef __cplusplus
@@ -75,6 +76,8 @@ extern volatile uint32_t umem_reaping;
#define UMEM_REAP_ADDING 0x00000001 /* umem_reap() is active */
#define UMEM_REAP_ACTIVE 0x00000002 /* update thread is reaping */
+extern uintptr_t umem_tmem_off;
+
/*
* umem.c: tunables
*/
@@ -97,6 +100,7 @@ extern size_t umem_lite_minsize;
extern size_t umem_lite_maxalign;
extern size_t umem_maxverify;
extern size_t umem_minfirewall;
+extern size_t umem_ptc_size;
extern uint32_t umem_flags;
@@ -139,6 +143,20 @@ extern int umem_create_update_thread(void);
void umem_setup_envvars(int);
void umem_process_envvars(void);
+/*
+ * umem_genasm.c: private interfaces
+ */
+extern const int umem_genasm_supported;
+extern int umem_genasm(int *, umem_cache_t **, int);
+
+/*
+ * malloc.c: traditional malloc/free interface for genasm
+ */
+extern void *umem_malloc(size_t);
+extern void umem_malloc_free(void *);
+extern void *_malloc(size_t);
+extern void _free(void *);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/lib/libumem/common/umem_impl.h b/usr/src/lib/libumem/common/umem_impl.h
index 84313c32ed..f63246e166 100644
--- a/usr/src/lib/libumem/common/umem_impl.h
+++ b/usr/src/lib/libumem/common/umem_impl.h
@@ -21,10 +21,13 @@
*/
/*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Copyright 2012 Joyent, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
+ */
+
#ifndef _UMEM_IMPL_H
#define _UMEM_IMPL_H
@@ -63,6 +66,7 @@ extern "C" {
#define UMF_HASH 0x00000200 /* cache has hash table */
#define UMF_RANDOMIZE 0x00000400 /* randomize other umem_flags */
+#define UMF_PTC 0x00000800 /* cache has per-thread caching */
#define UMF_BUFTAG (UMF_DEADBEEF | UMF_REDZONE)
#define UMF_TOUCH (UMF_BUFTAG | UMF_LITE | UMF_CONTENTS)
@@ -395,6 +399,13 @@ extern void umem_startup(caddr_t, size_t, size_t, caddr_t, caddr_t);
extern int umem_add(caddr_t, size_t);
#endif
+/*
+ * Private interface with libc for ptcumem.
+ */
+extern uintptr_t _tmem_get_base(void);
+extern int _tmem_get_nentries(void);
+extern void _tmem_set_cleanup(void(*)(void *, int));
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/lib/libumem/i386/asm_subr.s b/usr/src/lib/libumem/i386/asm_subr.s
index 2edb2b49b5..5ad5345c6d 100644
--- a/usr/src/lib/libumem/i386/asm_subr.s
+++ b/usr/src/lib/libumem/i386/asm_subr.s
@@ -24,10 +24,32 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/asm_linkage.h>
+#define NOP4 \
+ nop; \
+ nop; \
+ nop; \
+ nop;
+
+#define NOP16 \
+ NOP4 \
+ NOP4 \
+ NOP4 \
+ NOP4
+
+#define NOP64 \
+ NOP16 \
+ NOP16 \
+ NOP16 \
+ NOP16
+
+#define NOP256 \
+ NOP64 \
+ NOP64 \
+ NOP64 \
+ NOP64
+
#if defined(lint)
void *
@@ -69,4 +91,25 @@ _breakpoint(void)
SET_SIZE(_breakpoint)
#endif
+ ENTRY(_malloc)
+ jmp umem_malloc;
+ NOP256
+ NOP256
+#if defined(__amd64)
+ NOP64
+#endif
+ SET_SIZE(_malloc)
+
+ ENTRY(_free)
+ jmp umem_malloc_free;
+ NOP256
+ NOP256
+#if defined(__amd64)
+ NOP64
+#endif
+ SET_SIZE(_free)
+
+ ANSI_PRAGMA_WEAK2(malloc,_malloc,function)
+ ANSI_PRAGMA_WEAK2(free,_free,function)
+
#endif /* lint */
diff --git a/usr/src/lib/libumem/i386/umem_genasm.c b/usr/src/lib/libumem/i386/umem_genasm.c
new file mode 100644
index 0000000000..530a83e486
--- /dev/null
+++ b/usr/src/lib/libumem/i386/umem_genasm.c
@@ -0,0 +1,595 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Don't Panic! If you find the blocks of assembly that follow confusing and
+ * you're questioning why they exist, please go read section 8 of the umem.c big
+ * theory statement. Next familiarize yourself with the malloc and free
+ * implementations in libumem's malloc.c.
+ *
+ * What follows is the i386 implementation of the thread caching automatic
+ * assembly generation. With i386 a function only has three registers it's
+ * allowed to change without restoring them: eax, ecx, and edx. All others have
+ * to be preserved. Since the set of registers we have available is so small, we
+ * have to make use of esi, ebx, and edi and save their original values to the
+ * stack.
+ *
+ * Malloc register usage:
+ * o. esi: Size of the malloc (passed into us and modified)
+ * o. edi: Size of the cache
+ * o. eax: Buffer to return
+ * o. ebx: Scratch space and temporary values
+ * o. ecx: Pointer to the tmem_t in the ulwp_t.
+ * o. edx: Pointer to the tmem_t array of roots
+ *
+ * Free register usage:
+ * o. esi: Size of the allocation (read from the malloc tag)
+ * o. edi: Size of the cache
+ * o. eax: Buffer to free
+ * o. ebx: Scratch space and temporary values
+ * o. ecx: Pointer to the tmem_t in the ulwp_t.
+ * o. edx: Pointer to the tmem_t array of roots
+ *
+ * Once we determine what cache we are using, we increment %edx to the
+ * appropriate offset and set %edi with the size of the cache. This means that
+ * when we break out to the normal buffer allocation point %edx contains the
+ * head of the linked list and %edi is the amount that we have to adjust the
+ * total amount cached by the thread.
+ *
+ * Each block of assembly has pseudocode that describes its purpose.
+ */
+
+#include <inttypes.h>
+#include <strings.h>
+#include <umem_impl.h>
+#include "umem_base.h"
+
+#include <atomic.h>
+
+const int umem_genasm_supported = 1;
+static uintptr_t umem_genasm_mptr = (uintptr_t)&_malloc;
+static size_t umem_genasm_msize = 512;
+static uintptr_t umem_genasm_fptr = (uintptr_t)&_free;
+static size_t umem_genasm_fsize = 512;
+static uintptr_t umem_genasm_omptr = (uintptr_t)umem_malloc;
+static uintptr_t umem_genasm_ofptr = (uintptr_t)umem_malloc_free;
+/*
+ * The maximum number of caches we can support. We use a single byte addl so
+ * this is 255 (UINT8_MAX) / sizeof (uintptr_t). In this case, 63.
+ */
+#define UMEM_GENASM_MAX32 63
+
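+/*
+ * PTC_JMPADDR computes a rel32 jump displacement: the destination minus the
+ * address of the first byte after the 4-byte displacement field.
+ */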
+#define PTC_JMPADDR(dest, src) (dest - (src + 4))
+#define PTC_ROOT_SIZE sizeof (uintptr_t)
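+/*
+ * MULTINOP is the five byte nop 0f 1f 44 00 00 (nopl 0x0(%eax,%eax,1)),
+ * stored little-endian in the low five bytes of a 64-bit word.
+ */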
+#define MULTINOP 0x0000441f0f
+
+/*
+ * void *ptcmalloc(size_t orig_size);
+ *
+ * size_t size = orig_size + 8;
+ *
+ * if (size < orig_size)
+ * goto tomalloc; ! This is overflow
+ *
+ * if (size > cache_size)
+ * goto tomalloc;
+ *
+ * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset;
+ * void **roots = t->tm_roots;
+ */
+#define PTC_MALINIT_JOUT 0x0e
+#define PTC_MALINIT_MCS 0x14
+#define PTC_MALINIT_JOV 0x1a
+#define PTC_MALINIT_SOFF 0x27
+static const uint8_t malinit[] = {
+ 0x55, /* pushl %ebp */
+ 0x89, 0xe5, /* movl %esp, %ebp */
+ 0x57, /* pushl %edi */
+ 0x56, /* pushl %esi */
+ 0x53, /* pushl %ebx */
+ 0x8b, 0x75, 0x08, /* movl 0x8(%ebp), %esi */
+ 0x83, 0xc6, 0x08, /* addl $0x8,%esi */
+ 0x0f, 0x82, 0x00, 0x00, 0x00, 0x00, /* jc +$JMP (errout) */
+ 0x81, 0xfe, 0x00, 0x00, 0x00, 0x00, /* cmpl sizeof ($C0), %esi */
+ 0x0f, 0x87, 0x00, 0x00, 0x00, 0x00, /* ja +$JMP (errout) */
+ 0x65, 0x8b, 0x0d, 0x00, 0x00, 0x00, 0x00, /* movl %gs:0x0,%ecx */
+ 0x81, 0xc1, 0x00, 0x00, 0x00, 0x00, /* addl $OFF, %ecx */
+ 0x8d, 0x51, 0x04 /* leal 0x4(%ecx), %edx */
+};
+
+/*
+ * void ptcfree(void *buf);
+ *
+ * if (buf == NULL)
+ * return;
+ *
+ * malloc_data_t *tag = buf;
+ * tag--;
+ * int size = tag->malloc_size;
+ * int tagval = UMEM_MALLOC_DECODE(tag->malloc_tag, size);
+ *
+ * if (tagval != MALLOC_MAGIC)
+ * goto tofree;
+ *
+ * if (size > cache_max)
+ * goto tofree;
+ *
+ * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset;
+ * void **roots = t->tm_roots;
+ */
+#define PTC_FRINI_JDONE 0x0d
+#define PTC_FRINI_JFREE 0x23
+#define PTC_FRINI_MCS 0x29
+#define PTC_FRINI_JOV 0x2f
+#define PTC_FRINI_SOFF 0x3c
+static const uint8_t freeinit[] = {
+ 0x55, /* pushl %ebp */
+ 0x89, 0xe5, /* movl %esp, %ebp */
+ 0x57, /* pushl %edi */
+ 0x56, /* pushl %esi */
+ 0x53, /* pushl %ebx */
+ 0x8b, 0x45, 0x08, /* movl 0x8(%ebp), %eax */
+ 0x85, 0xc0, /* testl %eax, %eax */
+ 0x0f, 0x84, 0x00, 0x00, 0x00, 0x00, /* je $JDONE (done) */
+ 0x83, 0xe8, 0x08, /* subl $0x8,%eax */
+ 0x8b, 0x30, /* movl (%eax),%esi */
+ 0x8b, 0x50, 0x04, /* movl 0x4(%eax),%edx */
+ 0x01, 0xf2, /* addl %esi,%edx */
+ 0x81, 0xfa, 0x00, 0xc0, 0x10, 0x3a, /* cmpl MAGIC32, %edx */
+ 0x0f, 0x85, 0x00, 0x00, 0x00, 0x00, /* jne +JFREE (goto freebuf) */
+
+ 0x81, 0xfe, 0x00, 0x00, 0x00, 0x00, /* cmpl sizeof ($C0), %esi */
+ 0x0f, 0x87, 0x00, 0x00, 0x00, 0x00, /* ja +$JMP (errout) */
+ 0x65, 0x8b, 0x0d, 0x00, 0x00, 0x00, 0x00, /* movl %gs:0x0,%ecx */
+ 0x81, 0xc1, 0x00, 0x00, 0x00, 0x00, /* addl $OFF, %ecx */
+ 0x8d, 0x51, 0x04 /* leal 0x4(%ecx),%edx */
+};
+
+/*
+ * if (size <= $CACHE_SIZE) {
+ * csize = $CACHE_SIZE;
+ * } else ... ! goto next cache
+ */
+#define PTC_INICACHE_CMP 0x02
+#define PTC_INICACHE_SIZE 0x09
+#define PTC_INICACHE_JMP 0x0e
+static const uint8_t inicache[] = {
+ 0x81, 0xfe, 0xff, 0x00, 0x00, 0x00, /* cmpl sizeof ($C0), %esi */
+ 0x77, 0x0a, /* ja +0xa */
+ 0xbf, 0xff, 0x00, 0x00, 0x00, /* movl sizeof ($C0), %edi */
+ 0xe9, 0x00, 0x00, 0x00, 0x00 /* jmp +$JMP (allocbuf) */
+};
+
+/*
+ * if (size <= $CACHE_SIZE) {
+ * csize = $CACHE_SIZE;
+ * roots += $CACHE_NUM;
+ * } else ... ! goto next cache
+ */
+#define PTC_GENCACHE_CMP 0x02
+#define PTC_GENCACHE_NUM 0x0a
+#define PTC_GENCACHE_SIZE 0x0c
+#define PTC_GENCACHE_JMP 0x11
+static const uint8_t gencache[] = {
+ 0x81, 0xfe, 0x00, 0x00, 0x00, 0x00, /* cmpl sizeof ($CACHE), %esi */
+ 0x77, 0x0d, /* ja +0xd (next cache) */
+ 0x83, 0xc2, 0x00, /* addl $4*$ii, %edx */
+ 0xbf, 0x00, 0x00, 0x00, 0x00, /* movl sizeof ($CACHE), %edi */
+ 0xe9, 0x00, 0x00, 0x00, 0x00 /* jmp +$JMP (allocbuf) */
+};
+
+/*
+ * else if (size <= $CACHE_SIZE) {
+ * csize = $CACHE_SIZE;
+ * roots += $CACHE_NUM;
+ * } else {
+ * goto tofunc; ! goto tomalloc if ptcmalloc.
+ * } ! goto tofree if ptcfree.
+ */
+#define PTC_FINCACHE_CMP 0x02
+#define PTC_FINCACHE_JMP 0x07
+#define PTC_FINCACHE_NUM 0x0a
+#define PTC_FINCACHE_SIZE 0x0c
+static const uint8_t fincache[] = {
+ 0x81, 0xfe, 0xff, 0x00, 0x00, 0x00, /* cmpl sizeof ($CLAST), %esi */
+ 0x77, 0x00, /* ja +$JMP (to errout) */
+ 0x83, 0xc2, 0x00, /* addl $4*($NCACHES-1), %edx */
+ 0xbf, 0x00, 0x00, 0x00, 0x00, /* movl sizeof ($CLAST), %edi */
+};
+
+/*
+ * if (*root == NULL)
+ * goto tomalloc;
+ *
+ * malloc_data_t *ret = *root;
+ * *root = *(void **)ret;
+ * t->tm_size -= csize;
+ * ret->malloc_size = size;
+ *
+ * ret->malloc_data = UMEM_MALLOC_ENCODE(MALLOC_SECOND_MAGIC, size);
+ * ret++;
+ *
+ * return ((void *)ret);
+ * tomalloc:
+ * return (malloc(orig_size));
+ */
+#define PTC_MALFINI_ALLABEL 0x00
+#define PTC_MALFINI_JMLABEL 0x20
+#define PTC_MALFINI_JMADDR 0x25
+static const uint8_t malfini[] = {
+ /* allocbuf: */
+ 0x8b, 0x02, /* movl (%edx), %eax */
+ 0x85, 0xc0, /* testl %eax, %eax */
+ 0x74, 0x1a, /* je +0x1a (errout) */
+ 0x8b, 0x18, /* movl (%eax), %esi */
+ 0x89, 0x1a, /* movl %esi, (%edx) */
+ 0x29, 0x39, /* subl %edi, (%ecx) */
+ 0x89, 0x30, /* movl %esi, (%eax) */
+ 0xba, 0x00, 0xc0, 0x10, 0x3a, /* movl $0x3a10c000,%edx */
+ 0x29, 0xf2, /* subl %esi, %edx */
+ 0x89, 0x50, 0x04, /* movl %edx, 0x4(%eax) */
+ 0x83, 0xc0, 0x08, /* addl $0x8, %eax */
+ 0x5b, /* popl %ebx */
+ 0x5e, /* popl %esi */
+ 0x5f, /* popl %edi */
+ 0xc9, /* leave */
+ 0xc3, /* ret */
+ /* errout: */
+ 0x5b, /* popl %ebx */
+ 0x5e, /* popl %esi */
+ 0x5f, /* popl %edi */
+ 0xc9, /* leave */
+ 0xe9, 0x00, 0x00, 0x00, 0x00 /* jmp $malloc */
+};
+
+/*
+ * if (t->tm_size + csize > umem_ptc_size)
+ * goto tofree;
+ *
+ * t->tm_size += csize;
+ * *(void **)tag = *root;
+ * *root = tag;
+ * return;
+ * tofree:
+ * free(buf);
+ * return;
+ */
+#define PTC_FRFINI_RBUFLABEL 0x00
+#define PTC_FRFINI_CACHEMAX 0x06
+#define PTC_FRFINI_DONELABEL 0x14
+#define PTC_FRFINI_JFLABEL 0x19
+#define PTC_FRFINI_JFADDR 0x1e
+static const uint8_t freefini[] = {
+ /* freebuf: */
+ 0x8b, 0x19, /* movl (%ecx),%ebx */
+ 0x01, 0xfb, /* addl %edi,%ebx */
+ 0x81, 0xfb, 0x00, 0x00, 0x00, 0x00, /* cmpl maxsize, %ebx */
+ 0x73, 0x0d, /* jae +0xd <tofree> */
+ 0x01, 0x39, /* addl %edi,(%ecx) */
+ 0x8b, 0x3a, /* movl (%edx),%edi */
+ 0x89, 0x38, /* movl %edi,(%eax) */
+ 0x89, 0x02, /* movl %eax,(%edx) */
+ /* done: */
+ 0x5b, /* popl %ebx */
+ 0x5e, /* popl %esi */
+ 0x5f, /* popl %edi */
+ 0xc9, /* leave */
+ 0xc3, /* ret */
+ /* realfree: */
+ 0x5b, /* popl %ebx */
+ 0x5e, /* popl %esi */
+ 0x5f, /* popl %edi */
+ 0xc9, /* leave */
+ 0xe9, 0x00, 0x00, 0x00, 0x00 /* jmp free */
+};
+
+/*
+ * Construct the initial part of malloc. off contains the offset from curthread
+ * to the root of the tmem structure. ep is the offset of the error label,
+ * which falls back to the original malloc. csize is the size of the largest
+ * umem_cache in ptcumem.
+ */
+static int
+genasm_malinit(uint8_t *bp, uint32_t off, uint32_t ep, uint32_t csize)
+{
+ uint32_t addr;
+
+ bcopy(malinit, bp, sizeof (malinit));
+ addr = PTC_JMPADDR(ep, PTC_MALINIT_JOUT);
+ bcopy(&addr, bp + PTC_MALINIT_JOUT, sizeof (addr));
+ bcopy(&csize, bp + PTC_MALINIT_MCS, sizeof (csize));
+ addr = PTC_JMPADDR(ep, PTC_MALINIT_JOV);
+ bcopy(&addr, bp + PTC_MALINIT_JOV, sizeof (addr));
+ bcopy(&off, bp + PTC_MALINIT_SOFF, sizeof (off));
+
+ return (sizeof (malinit));
+}
+
+static int
+genasm_frinit(uint8_t *bp, uint32_t off, uint32_t dp, uint32_t ep, uint32_t mc)
+{
+ uint32_t addr;
+
+ bcopy(freeinit, bp, sizeof (freeinit));
+ addr = PTC_JMPADDR(dp, PTC_FRINI_JDONE);
+ bcopy(&addr, bp + PTC_FRINI_JDONE, sizeof (addr));
+ addr = PTC_JMPADDR(ep, PTC_FRINI_JFREE);
+ bcopy(&addr, bp + PTC_FRINI_JFREE, sizeof (addr));
+ bcopy(&mc, bp + PTC_FRINI_MCS, sizeof (mc));
+ addr = PTC_JMPADDR(ep, PTC_FRINI_JOV);
+ bcopy(&addr, bp + PTC_FRINI_JOV, sizeof (addr));
+ bcopy(&off, bp + PTC_FRINI_SOFF, sizeof (off));
+ return (sizeof (freeinit));
+}
+
+/*
+ * Create the initial cache entry of the specified size. The value of ap tells
+ * us the address of the label at which to try to allocate a buffer. This
+ * value is an offset from the current base.
+ */
+static int
+genasm_firstcache(uint8_t *bp, uint32_t csize, uint32_t ap)
+{
+ uint32_t addr;
+
+ bcopy(inicache, bp, sizeof (inicache));
+ bcopy(&csize, bp + PTC_INICACHE_CMP, sizeof (csize));
+ bcopy(&csize, bp + PTC_INICACHE_SIZE, sizeof (csize));
+ addr = PTC_JMPADDR(ap, PTC_INICACHE_JMP);
+ ASSERT(addr != 0);
+ bcopy(&addr, bp + PTC_INICACHE_JMP, sizeof (addr));
+
+ return (sizeof (inicache));
+}
+
+static int
+genasm_gencache(uint8_t *bp, int num, uint32_t csize, uint32_t ap)
+{
+ uint32_t addr;
+ uint8_t coff;
+
+ ASSERT(256 / PTC_ROOT_SIZE > num);
+ ASSERT(num != 0);
+ bcopy(gencache, bp, sizeof (gencache));
+ bcopy(&csize, bp + PTC_GENCACHE_CMP, sizeof (csize));
+ bcopy(&csize, bp + PTC_GENCACHE_SIZE, sizeof (csize));
+ coff = num * PTC_ROOT_SIZE;
+ bcopy(&coff, bp + PTC_GENCACHE_NUM, sizeof (coff));
+ addr = PTC_JMPADDR(ap, PTC_GENCACHE_JMP);
+ bcopy(&addr, bp + PTC_GENCACHE_JMP, sizeof (addr));
+
+ return (sizeof (gencache));
+}
+
+static int
+genasm_lastcache(uint8_t *bp, int num, uint32_t csize, uint32_t ep)
+{
+ uint8_t addr;
+
+ ASSERT(ep <= 0xff && ep > 7);
+ ASSERT(256 / PTC_ROOT_SIZE > num);
+ bcopy(fincache, bp, sizeof (fincache));
+ bcopy(&csize, bp + PTC_FINCACHE_CMP, sizeof (csize));
+ bcopy(&csize, bp + PTC_FINCACHE_SIZE, sizeof (csize));
+ addr = num * PTC_ROOT_SIZE;
+ bcopy(&addr, bp + PTC_FINCACHE_NUM, sizeof (addr));
+ addr = ep - PTC_FINCACHE_JMP - 1;
+ bcopy(&addr, bp + PTC_FINCACHE_JMP, sizeof (addr));
+
+ return (sizeof (fincache));
+}
+
+static int
+genasm_malfini(uint8_t *bp, uintptr_t mptr)
+{
+ uint32_t addr;
+
+ bcopy(malfini, bp, sizeof (malfini));
+ addr = PTC_JMPADDR(mptr, ((uintptr_t)bp + PTC_MALFINI_JMADDR));
+ bcopy(&addr, bp + PTC_MALFINI_JMADDR, sizeof (addr));
+
+ return (sizeof (malfini));
+}
+
+static int
+genasm_frfini(uint8_t *bp, uint32_t maxthr, uintptr_t fptr)
+{
+ uint32_t addr;
+
+ bcopy(freefini, bp, sizeof (freefini));
+ bcopy(&maxthr, bp + PTC_FRFINI_CACHEMAX, sizeof (maxthr));
+ addr = PTC_JMPADDR(fptr, ((uintptr_t)bp + PTC_FRFINI_JFADDR));
+ bcopy(&addr, bp + PTC_FRFINI_JFADDR, sizeof (addr));
+
+ return (sizeof (freefini));
+}
+
+/*
+ * The malloc inline assembly is constructed as follows:
+ *
+ * o Malloc prologue assembly
+ * o Generic first-cache check
+ * o n Generic cache checks (where n = _tmem_get_nentries() - 2)
+ * o Generic last-cache check
+ * o Malloc epilogue assembly
+ *
+ * Generally there are at least three caches. When there is only one cache we
+ * only use the generic last-cache. In the case where there are two caches, we
+ * just leave out the middle ones.
+ */
+static int
+genasm_malloc(void *base, size_t len, int nents, int *umem_alloc_sizes)
+{
+ int ii, off;
+ uint8_t *bp;
+ size_t total;
+ uint32_t allocoff, erroff;
+
+ total = sizeof (malinit) + sizeof (malfini) + sizeof (fincache);
+
+ if (nents >= 2)
+ total += sizeof (inicache) + sizeof (gencache) * (nents - 2);
+
+ if (total > len)
+ return (1);
+
+ erroff = total - sizeof (malfini) + PTC_MALFINI_JMLABEL;
+ allocoff = total - sizeof (malfini) + PTC_MALFINI_ALLABEL;
+
+ bp = base;
+
+ off = genasm_malinit(bp, umem_tmem_off, erroff,
+ umem_alloc_sizes[nents - 1]);
+ bp += off;
+ allocoff -= off;
+ erroff -= off;
+
+ if (nents > 1) {
+ off = genasm_firstcache(bp, umem_alloc_sizes[0], allocoff);
+ bp += off;
+ allocoff -= off;
+ erroff -= off;
+ }
+
+ for (ii = 1; ii < nents - 1; ii++) {
+ off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], allocoff);
+ bp += off;
+ allocoff -= off;
+ erroff -= off;
+ }
+
+ bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1],
+ erroff);
+ bp += genasm_malfini(bp, umem_genasm_omptr);
+ ASSERT(((uintptr_t)bp - total) == (uintptr_t)base);
+
+ return (0);
+}
+
+static int
+genasm_free(void *base, size_t len, int nents, int *umem_alloc_sizes)
+{
+ uint8_t *bp;
+ int ii, off;
+ size_t total;
+ uint32_t rbufoff, retoff, erroff;
+
+ /* Assume that nents has already been audited for us */
+ total = sizeof (freeinit) + sizeof (freefini) + sizeof (fincache);
+ if (nents >= 2)
+ total += sizeof (inicache) + sizeof (gencache) * (nents - 2);
+
+ if (total > len)
+ return (1);
+
+ erroff = total - (sizeof (freefini) - PTC_FRFINI_JFLABEL);
+ rbufoff = total - (sizeof (freefini) - PTC_FRFINI_RBUFLABEL);
+ retoff = total - (sizeof (freefini) - PTC_FRFINI_DONELABEL);
+
+ bp = base;
+
+ off = genasm_frinit(bp, umem_tmem_off, retoff, erroff,
+ umem_alloc_sizes[nents - 1]);
+ bp += off;
+ erroff -= off;
+ rbufoff -= off;
+
+ if (nents > 1) {
+ off = genasm_firstcache(bp, umem_alloc_sizes[0], rbufoff);
+ bp += off;
+ erroff -= off;
+ rbufoff -= off;
+ }
+
+ for (ii = 1; ii < nents - 1; ii++) {
+ off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], rbufoff);
+ bp += off;
+ rbufoff -= off;
+ erroff -= off;
+ }
+
+ bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1],
+ erroff);
+ bp += genasm_frfini(bp, umem_ptc_size, umem_genasm_ofptr);
+ ASSERT(((uintptr_t)bp - total) == (uintptr_t)base);
+
+ return (0);
+}
+
+int
+umem_genasm(int *alloc_sizes, umem_cache_t **caches, int ncaches)
+{
+ int nents, i;
+ uint8_t *mptr;
+ uint8_t *fptr;
+ uint64_t v, *vptr;
+
+ mptr = (void *)((uintptr_t)umem_genasm_mptr + 5);
+ fptr = (void *)((uintptr_t)umem_genasm_fptr + 5);
+ if (umem_genasm_mptr == 0 || umem_genasm_msize == 0 ||
+ umem_genasm_fptr == 0 || umem_genasm_fsize == 0)
+ return (1);
+
+ /*
+ * The total number of caches that we can service is the minimum of:
+ * o the amount supported by libc
+ * o the total number of umem caches
+ * o we use a single byte addl, so it's 255 / sizeof (uintptr_t). For
+ * 32-bit, this is 63.
+ */
+ nents = _tmem_get_nentries();
+
+ if (UMEM_GENASM_MAX32 < nents)
+ nents = UMEM_GENASM_MAX32;
+
+ if (ncaches < nents)
+ nents = ncaches;
+
+ /* Based on our constraints, this is not an error */
+ if (nents == 0 || umem_ptc_size == 0)
+ return (0);
+
+ /* Take into account the jump */
+ if (genasm_malloc(mptr, umem_genasm_msize, nents,
+ alloc_sizes) != 0)
+ return (1);
+
+ if (genasm_free(fptr, umem_genasm_fsize, nents,
+ alloc_sizes) != 0)
+ return (1);
+
+ /* nop out the jump with a single multibyte nop */
+ vptr = (void *)umem_genasm_mptr;
+ v = MULTINOP;
+ v |= *vptr & (0xffffffULL << 40);
+ (void) atomic_swap_64(vptr, v);
+ vptr = (void *)umem_genasm_fptr;
+ v = MULTINOP;
+ v |= *vptr & (0xffffffULL << 40);
+ (void) atomic_swap_64(vptr, v);
+
+ for (i = 0; i < nents; i++)
+ caches[i]->cache_flags |= UMF_PTC;
+
+ return (0);
+}
diff --git a/usr/src/lib/libumem/sparc/umem_genasm.c b/usr/src/lib/libumem/sparc/umem_genasm.c
new file mode 100644
index 0000000000..4bdea8122d
--- /dev/null
+++ b/usr/src/lib/libumem/sparc/umem_genasm.c
@@ -0,0 +1,38 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Don't Panic! If you wonder why this seemingly empty file exists, it's because
+ * there is no sparc implementation for ptcumem. Go read libumem's big theory
+ * statement in lib/libumem/common/umem.c, particularly section eight.
+ */
+
+#include <umem_impl.h>
+#include "umem_base.h"
+
+const int umem_genasm_supported = 0;
+
+/*ARGSUSED*/
+int
+umem_genasm(int *cp, umem_cache_t **caches, int nc)
+{
+ return (1);
+}
diff --git a/usr/src/man/man3malloc/umem_alloc.3malloc b/usr/src/man/man3malloc/umem_alloc.3malloc
index cc8e3df369..d8680ca083 100644
--- a/usr/src/man/man3malloc/umem_alloc.3malloc
+++ b/usr/src/man/man3malloc/umem_alloc.3malloc
@@ -174,6 +174,19 @@ Set the underlying function used to allocate memory. This option can be set to
\fBmmap\fR(2)-based source. If set to a value that is not supported, \fBsbrk\fR
will be used.
.RE
+.sp
+.ne 2
+.na
+\fB\fBperthread_cache\fR=\fBsize\fR\fR
+.ad
+.RS 16n
+libumem allows each thread to cache recently freed small allocations for use
+in future allocations. The size argument, which accepts k, m, g, and t
+suffixes, denotes the maximum amount of memory each thread can use for this
+purpose. The default is 1 MB. Any buffers in the per-thread cache are freed
+when the thread exits. The efficacy of the per-thread cache can be determined
+with the \fB::umastat\fR \fIdcmd\fR of \fBmdb\fR(1).
+.RE
.ne 2
.na