Diffstat (limited to 'usr/src/lib/libumem/common')
-rw-r--r--  usr/src/lib/libumem/common/envvar.c      |   9
-rw-r--r--  usr/src/lib/libumem/common/mapfile-vers  |   5
-rw-r--r--  usr/src/lib/libumem/common/stub_stand.c  |  29
-rw-r--r--  usr/src/lib/libumem/common/umem.c        | 302
-rw-r--r--  usr/src/lib/libumem/common/umem_base.h   |  14
-rw-r--r--  usr/src/lib/libumem/common/umem_impl.h   |  16
6 files changed, 366 insertions, 9 deletions
diff --git a/usr/src/lib/libumem/common/envvar.c b/usr/src/lib/libumem/common/envvar.c
index fc3d490a01..2fdf7824c3 100644
--- a/usr/src/lib/libumem/common/envvar.c
+++ b/usr/src/lib/libumem/common/envvar.c
@@ -25,6 +25,10 @@
* Copyright 2012 Joyent, Inc. All rights reserved.
*/
+/*
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
+ */
+
#include <ctype.h>
#include <errno.h>
#include <limits.h>
@@ -151,7 +155,10 @@ static umem_env_item_t umem_options_items[] = {
NULL, 0, NULL, &vmem_sbrk_pagesize
},
#endif
-
+ { "perthread_cache", "Evolving", ITEM_SIZE,
+ "Size (in bytes) of per-thread allocation cache",
+ NULL, 0, NULL, &umem_ptc_size
+ },
{ NULL, "-- end of UMEM_OPTIONS --", ITEM_INVALID }
};
diff --git a/usr/src/lib/libumem/common/mapfile-vers b/usr/src/lib/libumem/common/mapfile-vers
index 102bd989f7..6a05f0cfaa 100644
--- a/usr/src/lib/libumem/common/mapfile-vers
+++ b/usr/src/lib/libumem/common/mapfile-vers
@@ -20,6 +20,7 @@
#
#
# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2012, Joyent, Inc. All rights reserved.
#
#
@@ -72,6 +73,10 @@ SYMBOL_VERSION SUNWprivate_1.1 {
vmem_walk;
vmem_xalloc;
vmem_xfree;
+ umem_genasm_mptr;
+ umem_genasm_fptr;
+ umem_genasm_fsize;
+ umem_genasm_msize;
local:
*;
};
diff --git a/usr/src/lib/libumem/common/stub_stand.c b/usr/src/lib/libumem/common/stub_stand.c
index 54635558c3..1a90c6be75 100644
--- a/usr/src/lib/libumem/common/stub_stand.c
+++ b/usr/src/lib/libumem/common/stub_stand.c
@@ -23,6 +23,9 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
/*
* Stubs for the standalone to reduce the dependence on external libraries
@@ -125,3 +128,29 @@ issetugid(void)
{
return (1);
}
+
+int
+_tmem_get_nentries(void)
+{
+ return (0);
+}
+
+uintptr_t
+_tmem_get_base(void)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+_tmem_set_cleanup(void (*f)(void *, int))
+{
+}
+
+uint64_t
+atomic_swap_64(volatile uint64_t *t, uint64_t v)
+{
+ uint64_t old = *t;
+ *t = v;
+ return (old);
+}
diff --git a/usr/src/lib/libumem/common/umem.c b/usr/src/lib/libumem/common/umem.c
index a3eb0b8e6c..e22106e979 100644
--- a/usr/src/lib/libumem/common/umem.c
+++ b/usr/src/lib/libumem/common/umem.c
@@ -24,7 +24,9 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
+ */
/*
* based on usr/src/uts/common/os/kmem.c r1.64 from 2001/12/18
@@ -44,7 +46,7 @@
*
* 1. Overview
* -----------
- * umem is very close to kmem in implementation. There are four major
+ * umem is very close to kmem in implementation. There are seven major
* areas of divergence:
*
* * Initialization
@@ -57,6 +59,10 @@
*
* * lock ordering
*
+ * * changing UMEM_MAXBUF
+ *
+ * * Per-thread caching for malloc/free
+ *
* 2. Initialization
* -----------------
* kmem is initialized early on in boot, and knows that no one will call
@@ -355,6 +361,248 @@
* umem_log_header_t's:
* lh_cpu[*].clh_lock
* lh_lock
+ *
+ * 7. Changing UMEM_MAXBUF
+ * -----------------------
+ *
+ * When changing UMEM_MAXBUF, extra care has to be taken. It is not sufficient
+ * to simply increase this number. First, one must update the umem_alloc_table
+ * to have the appropriate number of entries based upon the new size. If this
+ * is not done, libumem will blow an assertion.
+ *
+ * The second place to update, which is not required, is the umem_alloc_sizes.
+ * These determine the default cache sizes that we're going to support.
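+ *
+ * As an illustrative check of the arithmetic with the values in this change:
+ * umem_alloc_table must have UMEM_MAXBUF >> UMEM_ALIGN_SHIFT entries, so with
+ * UMEM_MAXBUF at 131072 and UMEM_ALIGN_SHIFT at 3 it needs 131072 / 8 = 16384
+ * entries, which is the sixteen ALLOC_TABLE_1024 blocks in its initializer.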
+ *
+ * 8. Per-thread caching for malloc/free
+ * -------------------------------------
+ *
+ * "Time is an illusion. Lunchtime doubly so." -- Douglas Adams
+ *
+ * Time may be an illusion, but CPU cycles aren't. While libumem is designed
+ * to be a highly scalable allocator, that scalability comes with a fixed cycle
+ * penalty even in the absence of contention: libumem must acquire (and
+ * release) a per-CPU lock for each allocation. When contention is low and
+ * malloc(3C) frequency is high, this overhead can dominate execution time.
+ * To alleviate this, we allow for per-thread caching, a lock-free means of
+ * caching recent deallocations on a per-thread basis for use in satisfying
+ * subsequent calls to malloc(3C).
+ *
+ * In addition to improving performance, we also want to:
+ * * Minimize fragmentation
+ * * Not add additional memory overhead (no larger malloc tags)
+ *
+ * In the ulwp_t of each thread there is a private data structure called a
+ * tmem_t that looks like:
+ *
+ * typedef struct {
+ * size_t tm_size;
+ * void *tm_roots[NTMEMBASE]; (Currently 16)
+ * } tmem_t;
+ *
+ * Each of the roots is treated as the head of a linked list. Each entry in the
+ * list can be thought of as a void ** which points to the next entry, until one
+ * of them points to NULL. If the head points to NULL, the list is empty.
+ *
+ * Each head corresponds to a umem_cache. Currently there is a linear mapping
+ * where the first root corresponds to the first cache, the second root to the
+ * second cache, etc. This works because every allocation that malloc makes to
+ * umem_alloc that can be satisfied by a umem_cache will actually return a
+ * number of bytes equal to the size of that cache. Because of this property
+ * and the one-to-one mapping between caches and roots, we can guarantee that
+ * every entry in a given root's list will be able to satisfy the same requests
+ * as the corresponding cache.
+ *
+ * The maximum amount of memory that can be cached in each thread is determined
+ * by the perthread_cache UMEM_OPTION. It corresponds to the umem_ptc_size
+ * value. The default value for this is currently 1 MB. Once umem_init() has
+ * finished, this cannot be changed without directly modifying the instruction
+ * text. If, upon calling free(3C), the amount cached would exceed this
+ * maximum, we return the buffer to the umem_cache instead of holding onto it
+ * in the thread.
+ *
+ * When a thread calls malloc(3C), it first determines which umem_cache the
+ * allocation would be serviced by. If the allocation is not covered by
+ * ptcumem, it goes to the normal malloc instead. Next, it checks whether the
+ * corresponding root's list is empty or not. If it is empty, the memory is
+ * allocated from umem_alloc instead. If it is not empty, we remove the head
+ * of the list, set the appropriate malloc tags, and return that buffer.
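+ *
+ * As a rough C sketch of that fast path (the real code is generated
+ * assembly, and everything in angle brackets below is an illustrative
+ * placeholder rather than an actual libumem symbol):
+ *
+ *     void *
+ *     ptcmalloc(size_t size)
+ *     {
+ *         tmem_t *tm = <curthread plus umem_tmem_off>;
+ *         int i = <root index of the cache covering size, or -1>;
+ *         void *buf;
+ *
+ *         if (i == -1 || tm->tm_roots[i] == NULL)
+ *             <jump to the original malloc>;
+ *         buf = tm->tm_roots[i];
+ *         tm->tm_roots[i] = *(void **)buf;    (pop the old head)
+ *         tm->tm_size -= <size of cache i>;
+ *         <set the malloc tag and return the tagged buffer>;
+ *     }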
+ *
+ * When a thread calls free(3C), it first looks at the malloc tag; if it is
+ * invalid or the allocation exceeds the largest cache in ptcumem, it sends
+ * the buffer off to the original free() to handle and clean up
+ * appropriately. Next, it checks if the allocation size is covered by one of
+ * the per-thread roots; if it isn't, it passes the buffer off to the
+ * original free() to be released. Finally, before it inserts this buffer as
+ * the head, it checks if adding this buffer would put the thread over its
+ * maximum cache size. If it would, it frees the buffer back to the
+ * umem_cache. Otherwise it increments the thread's total cached amount and
+ * makes the buffer the new head of the appropriate tm_root.
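+ *
+ * A similarly rough sketch of the free path (again, angle brackets mark
+ * illustrative placeholders; the real implementation is generated assembly):
+ *
+ *     void
+ *     ptcfree(void *buf)
+ *     {
+ *         size_t size = <size recorded in the malloc tag>;
+ *         tmem_t *tm = <curthread plus umem_tmem_off>;
+ *         int i = <root index covering size, or -1>;
+ *
+ *         if (<tag is invalid> || i == -1 ||
+ *             tm->tm_size + size > umem_ptc_size)
+ *             <jump to the original free>;    (returns it to its cache)
+ *         *(void **)<start of the buffer> = tm->tm_roots[i];
+ *         tm->tm_roots[i] = <start of the buffer>;    (push as new head)
+ *         tm->tm_size += size;
+ *     }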
+ *
+ * When a thread exits, all of the buffers that it has in its per-thread cache
+ * will be passed to umem_free() and returned to the appropriate umem_cache.
+ *
+ * 8.1 Handling addition and removal of umem_caches
+ * ------------------------------------------------
+ *
+ * The set of umem_caches that is used to back calls to umem_alloc() and
+ * ultimately malloc() is determined at program execution time. The default
+ * set of caches is defined below in umem_alloc_sizes[]. Various UMEM_OPTIONS
+ * exist that modify the set of caches: size_add, size_clear, and size_remove.
+ * Because the set of caches can only be determined once umem_init() has been
+ * called, and because we have the additional goals of minimizing
+ * fragmentation and metadata space overhead in the malloc tags, this forces
+ * our hand to go down a slightly different path: the one trodden by fasttrap
+ * and trapstat.
+ *
+ * During umem_init we're going to dynamically construct new versions of
+ * malloc(3C) and free(3C) that utilize the known cache sizes and then ensure
+ * that ptcmalloc and ptcfree replace malloc and free as entries in the PLT.
+ * If ptcmalloc and ptcfree cannot handle a request, they simply jump to the
+ * original libumem implementations.
+ *
+ * After creating all of the umem_caches, but before making them visible,
+ * umem_cache_init checks that umem_genasm_supported is non-zero. This value is
+ * set by each architecture in $ARCH/umem_genasm.c to indicate whether or not
+ * it supports this. If the value is zero, then this process is skipped.
+ * Similarly, if the per-thread cache size has been tuned to zero by
+ * UMEM_OPTIONS, then this is also skipped.
+ *
+ * In umem_genasm.c, each architecture implements a single function called
+ * umem_genasm() that is responsible for generating the
+ * appropriate versions of ptcmalloc() and ptcfree(), placing them in the
+ * appropriate memory location, and finally doing the switch from malloc() and
+ * free() to ptcmalloc() and ptcfree(). Once the change has been made, there is
+ * no way to switch back, short of restarting the program or modifying program
+ * text with mdb.
+ *
+ * 8.2 Modifying the Procedure Linkage Table (PLT)
+ * -----------------------------------------------
+ *
+ * The last piece of this puzzle is how we actually jam ptcmalloc() into the
+ * PLT. The dynamic linker has support for global and local audit libraries.
+ * For the full explanation of audit libraries, consult the Linker and
+ * Libraries Guide or the linker source. A local auditor can attach to a
+ * single library and interpose on all of the relocations that enter or leave
+ * that library. To facilitate our work, we have created a local audit library
+ * for libumem that is called libumem_trampoline and is located in
+ * lib/libumem_trampoline/.
+ *
+ * When any resolution is done to malloc(), the audit library allows us to
+ * replace the address with an address that it specifies. There are two 4k
+ * sections in libumem_trampoline's bss which we use as the stomping grounds
+ * for ptcmalloc and ptcfree. When the audit library audits the malloc and free
+ * functions from libumem, it encodes their addresses and sets its buffers to
+ * contain a simple trampoline which consists of a jmp instruction and a four
+ * byte offset to the original malloc and free. libumem_trampoline's mapfile
+ * explicitly makes its bss rwx instead of rw to support this.
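+ *
+ * For instance, after the audit library has run, the start of each buffer
+ * holds the standard x86 jmp rel32 form described above (shown here only as
+ * an illustration):
+ *
+ *     byte 0:    0xe9                (jmp rel32 opcode)
+ *     bytes 1-4: <32-bit displacement back to libumem's malloc or free>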
+ *
+ * When umem_genasm() is called, it uses a similar mechanism to get the address
+ * and size of the trampoline library's malloc (mbuf) and free (fbuf) buffers.
+ * After validating that each buffer is large enough to contain all of the
+ * instructions, it starts laying out ptcmalloc and ptcfree at mbuf[4] and
+ * fbuf[4]. Once both have been successfully generated, umem_genasm() stores a
+ * single five byte nop over the original jump.
+ *
+ * 8.3 umem_genasm()
+ * -----------------
+ *
+ * umem_genasm() is currently implemented for i386 and amd64. This section
+ * describes the theory behind the construction. For specific byte code to
+ * assembly instructions and niceish C and asm versions of ptcmalloc and
+ * ptcfree, see the individual umem_genasm.c files. The layout consists of the
+ * following sections:
+ *
+ * o. function-specific prologue
+ * o. function-generic cache-selecting elements
+ * o. function-specific epilogue
+ *
+ * There are three different generic cache elements that exist:
+ *
+ * o. the last or only cache
+ * o. the intermediary caches if more than two
+ * o. the first one if more than one cache
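+ *
+ * For example, with four caches the cache-selecting code consists of the
+ * "first" element, two intermediary elements, and the "last" element; with a
+ * single cache, only the "last or only" element is emitted.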
+ *
+ * The malloc and free prologues and epilogues mimic the necessary portions of
+ * libumem's malloc and free. This includes things like checking for size
+ * overflow and setting and verifying the malloc tags.
+ *
+ * It is an important constraint that these functions do not make use of the
+ * call instruction. The only jmp outside of the individual functions is to the
+ * original libumem malloc and free respectively. Because doing things like
+ * setting errno or raising an internal umem error on improper malloc tags would
+ * require using calls into the PLT, whenever we encounter one of those cases
+ * we just jump to the original malloc and free functions, reusing the same
+ * stack frame.
+ *
+ * Each of the above sections (the three cache elements and the malloc and
+ * free prologues and epilogues) is implemented as a block of machine code
+ * with the corresponding assembly in comments. There are known offsets into
+ * each block that correspond to locations of data and addresses that we only
+ * know at run time. These blocks are copied as necessary and the blanks
+ * filled in appropriately.
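+ *
+ * A hedged sketch of what that copy-and-fill step looks like (the names here
+ * are purely illustrative and not the actual identifiers in umem_genasm.c):
+ *
+ *     bcopy(malloc_prologue, dst, sizeof (malloc_prologue));
+ *     *(uint32_t *)(dst + PROLOGUE_MAXSIZE_OFF) = <largest cache size>;
+ *     dst += sizeof (malloc_prologue);
+ *     (... and so on for each cache element and the epilogue)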
+ *
+ * As mentioned in section 8.2, the trampoline library uses specifically named
+ * variables to communicate the buffers and sizes to use. These variables are:
+ *
+ * o. umem_genasm_mptr: The buffer for ptcmalloc
+ * o. umem_genasm_msize: The size in bytes of the above buffer
+ * o. umem_genasm_fptr: The buffer for ptcfree
+ * o. umem_genasm_fsize: The size in bytes of the above buffer
+ *
+ * Finally, to enable the generated assembly we need to remove the previous jump
+ * to the actual malloc that exists at the start of these buffers. This is a
+ * five byte region. We could zero out the jump offset to make it a jmp +0,
+ * but a single five byte nop is faster, so that is what we use. The five byte
+ * nop is the byte sequence 0f 1f 44 00 00. Remember that x86 is little
+ * endian, so when this sequence is read back as an integer constant it
+ * appears the other way around.
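+ *
+ * As an illustration, written out as a byte array and read back as an
+ * integer:
+ *
+ *     const uint8_t nop[] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };
+ *
+ * interpreted as a little endian integer, this same five byte sequence is
+ * 0x0000441f0f, with the 0x0f that comes first in memory appearing last.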
+ *
+ * 8.4 Interface with libc.so
+ * --------------------------
+ *
+ * The tmem_t structure, as described at the beginning of section 8, is part
+ * of a private interface with libc. There are three functions that exist to
+ * cover this. They are not documented in man pages or header files. They are
+ * in the SUNWprivate part of libc's mapfile.
+ *
+ * o. _tmem_get_base(void)
+ *
+ * Returns the offset from the ulwp_t (curthread) to the tmem_t structure.
+ * This is a constant for all threads and is effectively a way to do
+ * ::offsetof ulwp_t ul_tmem without having to know the specifics of the
+ * structure outside of libc.
+ *
+ * o. _tmem_get_nentries(void)
+ *
+ * Returns the number of roots that exist in the tmem_t. This is one part
+ * of the cap on the number of umem_caches that we can back with tmem.
+ *
+ * o. _tmem_set_cleanup(void (*)(void *, int))
+ *
+ * This sets a cleanup handler that gets called back when a thread exits.
+ * There is one call per buffer; the void * is a pointer to the buffer on
+ * the list, and the int is the index into the roots array for this buffer.
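+ *
+ * For example (a brief sketch of how these fit together), during
+ * initialization libumem records the offset and registers its handler:
+ *
+ *     umem_tmem_off = _tmem_get_base();
+ *     _tmem_set_cleanup(umem_cache_tmem_cleanup);
+ *
+ * A given thread's tmem_t is then located at curthread plus umem_tmem_off,
+ * and _tmem_get_nentries() bounds how many umem_caches can be given roots.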
+ *
+ * 8.5 Tuning and disabling per-thread caching
+ * -------------------------------------------
+ *
+ * There is only one tunable for per-thread caching: the amount of memory each
+ * thread should be able to cache. This is specified via the perthread_cache
+ * UMEM_OPTION. No attempt is made to sanity check the specified value; the
+ * limit is simply the maximum value of a size_t.
+ *
+ * If the perthread_cache UMEM_OPTION is set to zero, nomagazines was requested,
+ * or UMEM_DEBUG has been turned on, then we will never call into umem_genasm;
+ * however, the trampoline audit library and jump will still be in place.
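+ *
+ * For example, running a program with UMEM_OPTIONS=perthread_cache=0 in its
+ * environment is the simplest way to disable per-thread caching for that
+ * program, while a larger value raises the per-thread limit accordingly.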
+ *
+ * 8.6 Observing efficacy of per-thread caching
+ * --------------------------------------------
+ *
+ * To understand the efficacy of per-thread caching, use the ::umastat dcmd
+ * to see the percentage of capacity consumed on a per-thread basis, the
+ * degree to which each umem cache contributes to per-thread cache consumption,
+ * and the number of buffers in per-thread caches on a per-umem cache basis.
+ * If more detail is required, the specific buffers in a per-thread cache can
+ * be iterated over with the umem_ptc_* walkers. (These walkers allow an
+ * optional ulwp_t to be specified to iterate only over a particular thread's
+ * cache.)
*/
#include <umem_impl.h>
@@ -420,7 +668,9 @@ static int umem_alloc_sizes[] = {
P2ALIGN(8192 / 2, 64), 4544,
P2ALIGN(8192 / 1, 64), 9216,
4096 * 3,
- UMEM_MAXBUF, /* = 8192 * 2 */
+ 8192 * 2, /* = 8192 * 2 */
+ 24576, 32768, 40960, 49152, 57344, 65536, 73728, 81920,
+ 90112, 98304, 106496, 114688, 122880, UMEM_MAXBUF, /* 128k */
/* 24 slots for user expansion */
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
@@ -461,8 +711,10 @@ size_t umem_lite_minsize = 0; /* minimum buffer size for UMF_LITE */
size_t umem_lite_maxalign = 1024; /* maximum buffer alignment for UMF_LITE */
size_t umem_maxverify; /* maximum bytes to inspect in debug routines */
size_t umem_minfirewall; /* hardware-enforced redzone threshold */
+size_t umem_ptc_size = 1048576; /* size of per-thread cache (in bytes) */
uint_t umem_flags = 0;
+uintptr_t umem_tmem_off;
mutex_t umem_init_lock; /* locks initialization */
cond_t umem_init_cv; /* initialization CV */
@@ -470,6 +722,8 @@ thread_t umem_init_thr; /* thread initializing */
int umem_init_env_ready; /* environ pre-initted */
int umem_ready = UMEM_READY_STARTUP;
+int umem_ptc_enabled; /* per-thread caching enabled */
+
static umem_nofail_callback_t *nofail_callback;
static mutex_t umem_nofail_exit_lock;
static thread_t umem_nofail_exit_thr;
@@ -592,6 +846,20 @@ umem_cache_t umem_null_cache = {
static umem_cache_t *umem_alloc_table[UMEM_MAXBUF >> UMEM_ALIGN_SHIFT] = {
ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
+ ALLOC_TABLE_1024,
ALLOC_TABLE_1024
};
@@ -2812,6 +3080,24 @@ umem_alloc_sizes_remove(size_t size)
umem_alloc_sizes[i] = 0;
}
+/*
+ * We've been called back from libc to indicate that a thread is terminating
+ * and that it needs to release the per-thread memory that it has. We get to
+ * know which entry in the thread's tmem array the allocation came from.
+ * Currently this refers to the first n umem_caches, which makes this a
+ * pretty simple indexing job.
+ */
+static void
+umem_cache_tmem_cleanup(void *buf, int entry)
+{
+ size_t size;
+ umem_cache_t *cp;
+
+ size = umem_alloc_sizes[entry];
+ cp = umem_alloc_table[(size - 1) >> UMEM_ALIGN_SHIFT];
+ _umem_cache_free(cp, buf);
+}
+
static int
umem_cache_init(void)
{
@@ -2927,6 +3213,16 @@ umem_cache_init(void)
umem_alloc_caches[i] = cp;
}
+ umem_tmem_off = _tmem_get_base();
+ _tmem_set_cleanup(umem_cache_tmem_cleanup);
+
+ if (umem_genasm_supported && !(umem_flags & UMF_DEBUG) &&
+ !(umem_flags & UMF_NOMAGAZINE) &&
+ umem_ptc_size > 0) {
+ umem_ptc_enabled = umem_genasm(umem_alloc_sizes,
+ umem_alloc_caches, i) == 0 ? 1 : 0;
+ }
+
/*
* Initialization cannot fail at this point. Make the caches
* visible to umem_alloc() and friends.
diff --git a/usr/src/lib/libumem/common/umem_base.h b/usr/src/lib/libumem/common/umem_base.h
index e78bebfb58..26e00bc282 100644
--- a/usr/src/lib/libumem/common/umem_base.h
+++ b/usr/src/lib/libumem/common/umem_base.h
@@ -22,12 +22,13 @@
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
#ifndef _UMEM_BASE_H
#define _UMEM_BASE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <umem_impl.h>
#ifdef __cplusplus
@@ -75,6 +76,8 @@ extern volatile uint32_t umem_reaping;
#define UMEM_REAP_ADDING 0x00000001 /* umem_reap() is active */
#define UMEM_REAP_ACTIVE 0x00000002 /* update thread is reaping */
+extern uintptr_t umem_tmem_off;
+
/*
* umem.c: tunables
*/
@@ -97,6 +100,7 @@ extern size_t umem_lite_minsize;
extern size_t umem_lite_maxalign;
extern size_t umem_maxverify;
extern size_t umem_minfirewall;
+extern size_t umem_ptc_size;
extern uint32_t umem_flags;
@@ -139,6 +143,12 @@ extern int umem_create_update_thread(void);
void umem_setup_envvars(int);
void umem_process_envvars(void);
+/*
+ * umem_genasm.c: private interfaces
+ */
+extern int umem_genasm_supported;
+extern int umem_genasm(int *, umem_cache_t **, int);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/lib/libumem/common/umem_impl.h b/usr/src/lib/libumem/common/umem_impl.h
index c6481d9751..f63246e166 100644
--- a/usr/src/lib/libumem/common/umem_impl.h
+++ b/usr/src/lib/libumem/common/umem_impl.h
@@ -24,11 +24,13 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
+ */
+
#ifndef _UMEM_IMPL_H
#define _UMEM_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <umem.h>
#include <sys/sysmacros.h>
@@ -64,6 +66,7 @@ extern "C" {
#define UMF_HASH 0x00000200 /* cache has hash table */
#define UMF_RANDOMIZE 0x00000400 /* randomize other umem_flags */
+#define UMF_PTC 0x00000800 /* cache has per-thread caching */
#define UMF_BUFTAG (UMF_DEADBEEF | UMF_REDZONE)
#define UMF_TOUCH (UMF_BUFTAG | UMF_LITE | UMF_CONTENTS)
@@ -353,7 +356,7 @@ typedef struct umem_cpu {
uint32_t cpu_number;
} umem_cpu_t;
-#define UMEM_MAXBUF 16384
+#define UMEM_MAXBUF 131072
#define UMEM_ALIGN 8 /* min guaranteed alignment */
#define UMEM_ALIGN_SHIFT 3 /* log2(UMEM_ALIGN) */
@@ -396,6 +399,13 @@ extern void umem_startup(caddr_t, size_t, size_t, caddr_t, caddr_t);
extern int umem_add(caddr_t, size_t);
#endif
+/*
+ * Private interface with libc for tcumem.
+ */
+extern uintptr_t _tmem_get_base(void);
+extern int _tmem_get_nentries(void);
+extern void _tmem_set_cleanup(void(*)(void *, int));
+
#ifdef __cplusplus
}
#endif