Diffstat (limited to 'usr/src'): 36 files changed, 3719 insertions, 546 deletions
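
Before the hunks themselves, a brief orientation on the centerpiece of this change: the new combined.c (added below) lets a single mdb walk concatenate several sub-walks, and the kmem_slab walker is converted to use it over the cache's complete-slab list plus its partial-slab AVL tree. As a rough, hypothetical sketch — the example_* and first_*/second_* names are illustrative only, not part of the change — a module would do the composition in its init function and register combined_walk_step and combined_walk_fini as the generic step and fini entry points:

/*
 * Hypothetical example, not part of this change: compose two existing
 * sub-walks into one walk using the combined walker added below.
 */
#include <mdb/mdb_modapi.h>

#include "combined.h"

/* Placeholder sub-walks; any existing init/step/fini triple would do. */
extern int first_walk_init(mdb_walk_state_t *);
extern int first_walk_step(mdb_walk_state_t *);
extern void first_walk_fini(mdb_walk_state_t *);
extern int second_walk_init(mdb_walk_state_t *);
extern int second_walk_step(mdb_walk_state_t *);
extern void second_walk_fini(mdb_walk_state_t *);

/*
 * Present the elements of two underlying structures as a single walk.
 * Each sub-walk is initialized lazily with the original walk_addr, in the
 * order added here.
 */
static int
example_walk_init(mdb_walk_state_t *wsp)
{
	combined_walk_init(wsp);
	combined_walk_add(wsp,
	    first_walk_init, first_walk_step, first_walk_fini);
	combined_walk_add(wsp,
	    second_walk_init, second_walk_step, second_walk_fini);
	return (WALK_NEXT);
}

/* Entry for the module's walkers[] table. */
static const mdb_walker_t example_walker = {
	"example", "walk two element sets as one",
	example_walk_init, combined_walk_step, combined_walk_fini
};

This mirrors how kmem_slab_walk_init() and kmem_slab_walk_partial_init() in kmem.c below stitch list_walk_step/list_walk_fini and avl_walk_step/avl_walk_fini together, which is also why their walker table entries in genunix.c now name combined_walk_step and combined_walk_fini.
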
diff --git a/usr/src/cmd/mdb/common/modules/genunix/Makefile.files b/usr/src/cmd/mdb/common/modules/genunix/Makefile.files index 8bf1c1b520..661a69e561 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/Makefile.files +++ b/usr/src/cmd/mdb/common/modules/genunix/Makefile.files @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # #ident "%Z%%M% %I% %E% SMI" @@ -32,6 +32,7 @@ GENUNIX_SRCS = \ avl.c \ bio.c \ + combined.c \ contract.c \ cpupart.c \ ctxop.c \ diff --git a/usr/src/cmd/mdb/common/modules/genunix/avl.c b/usr/src/cmd/mdb/common/modules/genunix/avl.c index b10856cfc3..444af78fa1 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/avl.c +++ b/usr/src/cmd/mdb/common/modules/genunix/avl.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,8 +30,12 @@ #include <mdb/mdb_modapi.h> struct aw_info { - void *aw_buff; /* buffer to hold the tree's data structure */ + void *aw_buff; /* buffer to hold tree element */ avl_tree_t aw_tree; /* copy of avl_tree_t being walked */ + uintptr_t aw_end; /* last node in specified range */ + const char *aw_elem_name; + int (*aw_elem_check)(void *, uintptr_t, void *); + void *aw_elem_check_arg; }; /* @@ -40,14 +43,15 @@ struct aw_info { * an AVL node */ static uintptr_t -avl_leftmostchild(uintptr_t addr, void * buff, size_t offset, size_t size) +avl_leftmostchild(uintptr_t addr, void *buff, size_t offset, size_t size, + const char *elem_name) { avl_node_t *node = (avl_node_t *)((uintptr_t)buff + offset); for (;;) { addr -= offset; if (mdb_vread(buff, size, addr) == -1) { - mdb_warn("read of avl_node_t failed: %p", addr); + mdb_warn("failed to read %s at %#lx", elem_name, addr); return ((uintptr_t)-1L); } if (node->avl_child[0] == NULL) @@ -59,14 +63,32 @@ avl_leftmostchild(uintptr_t addr, void * buff, size_t offset, size_t size) /* * initialize a forward walk thru an avl tree. + * + * begin and end optionally specify objects other than the first and last + * objects in the tree; either or both may be NULL (defaulting to first and + * last). + * + * avl_name and element_name specify command-specific labels other than + * "avl_tree_t" and "tree element" for use in error messages. + * + * element_check() returns -1, 1, or 0: abort the walk with an error, stop + * without an error, or allow the normal callback; arg is an optional user + * argument to element_check(). 
*/ int -avl_walk_init(mdb_walk_state_t *wsp) +avl_walk_init_range(mdb_walk_state_t *wsp, uintptr_t begin, uintptr_t end, + const char *avl_name, const char *element_name, + int (*element_check)(void *, uintptr_t, void *), void *arg) { struct aw_info *aw; avl_tree_t *tree; uintptr_t addr; + if (avl_name == NULL) + avl_name = "avl_tree_t"; + if (element_name == NULL) + element_name = "tree element"; + /* * allocate the AVL walk data */ @@ -77,7 +99,7 @@ avl_walk_init(mdb_walk_state_t *wsp) */ tree = &aw->aw_tree; if (mdb_vread(tree, sizeof (avl_tree_t), wsp->walk_addr) == -1) { - mdb_warn("read of avl_tree_t failed: %p", wsp->walk_addr); + mdb_warn("failed to read %s at %#lx", avl_name, wsp->walk_addr); goto error; } if (tree->avl_size < tree->avl_offset + sizeof (avl_node_t)) { @@ -91,22 +113,30 @@ avl_walk_init(mdb_walk_state_t *wsp) * "node" always points at the avl_node_t field inside the struct */ aw->aw_buff = mdb_zalloc(tree->avl_size, UM_SLEEP); + aw->aw_end = (end == NULL ? NULL : end + tree->avl_offset); + aw->aw_elem_name = element_name; + aw->aw_elem_check = element_check; + aw->aw_elem_check_arg = arg; /* * get the first avl_node_t address, use same algorithm * as avl_start() -- leftmost child in tree from root */ - addr = (uintptr_t)tree->avl_root; - if (addr == NULL) { - wsp->walk_addr = NULL; - return (WALK_NEXT); + if (begin == NULL) { + addr = (uintptr_t)tree->avl_root; + if (addr == NULL) { + wsp->walk_addr = NULL; + return (WALK_NEXT); + } + addr = avl_leftmostchild(addr, aw->aw_buff, tree->avl_offset, + tree->avl_size, aw->aw_elem_name); + if (addr == (uintptr_t)-1L) + goto error; + wsp->walk_addr = addr; + } else { + wsp->walk_addr = begin + tree->avl_offset; } - addr = avl_leftmostchild(addr, aw->aw_buff, tree->avl_offset, - tree->avl_size); - if (addr == (uintptr_t)-1L) - goto error; - wsp->walk_addr = addr; return (WALK_NEXT); error: @@ -116,6 +146,29 @@ error: return (WALK_ERR); } +int +avl_walk_init(mdb_walk_state_t *wsp) +{ + return (avl_walk_init_range(wsp, NULL, NULL, NULL, NULL, NULL, NULL)); +} + +int +avl_walk_init_named(mdb_walk_state_t *wsp, + const char *avl_name, const char *element_name) +{ + return (avl_walk_init_range(wsp, NULL, NULL, avl_name, element_name, + NULL, NULL)); +} + +int +avl_walk_init_checked(mdb_walk_state_t *wsp, + const char *avl_name, const char *element_name, + int (*element_check)(void *, uintptr_t, void *), void *arg) +{ + return (avl_walk_init_range(wsp, NULL, NULL, avl_name, element_name, + element_check, arg)); +} + /* * At each step, visit (callback) the current node, then move to the next * in the AVL tree. Uses the same algorithm as avl_walk(). 
@@ -139,6 +192,10 @@ avl_walk_step(mdb_walk_state_t *wsp) return (WALK_DONE); aw = (struct aw_info *)wsp->walk_data; + + if (aw->aw_end != NULL && wsp->walk_addr == aw->aw_end) + return (WALK_DONE); + size = aw->aw_tree.avl_size; offset = aw->aw_tree.avl_offset; node = (avl_node_t *)((uintptr_t)aw->aw_buff + offset); @@ -147,10 +204,19 @@ avl_walk_step(mdb_walk_state_t *wsp) * must read the current node for the call back to use */ if (mdb_vread(aw->aw_buff, size, addr) == -1) { - mdb_warn("read of avl_node_t failed: %p", addr); + mdb_warn("failed to read %s at %#lx", aw->aw_elem_name, addr); return (WALK_ERR); } + if (aw->aw_elem_check != NULL) { + int rc = aw->aw_elem_check(aw->aw_buff, addr, + aw->aw_elem_check_arg); + if (rc == -1) + return (WALK_ERR); + else if (rc == 1) + return (WALK_DONE); + } + /* * do the call back */ @@ -169,7 +235,8 @@ avl_walk_step(mdb_walk_state_t *wsp) */ addr = (uintptr_t)node->avl_child[1]; if (addr != NULL) { - addr = avl_leftmostchild(addr, aw->aw_buff, offset, size); + addr = avl_leftmostchild(addr, aw->aw_buff, offset, size, + aw->aw_elem_name); if (addr == (uintptr_t)-1L) return (WALK_ERR); @@ -187,7 +254,8 @@ avl_walk_step(mdb_walk_state_t *wsp) if (was_child == 0) /* stop on return from left child */ break; if (mdb_vread(aw->aw_buff, size, addr) == -1) { - mdb_warn("read of avl_node_t failed: %p", addr); + mdb_warn("failed to read %s at %#lx", + aw->aw_elem_name, addr); return (WALK_ERR); } } diff --git a/usr/src/cmd/mdb/common/modules/genunix/avl.h b/usr/src/cmd/mdb/common/modules/genunix/avl.h index 1d2e9dcb88..6f3bc202b7 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/avl.h +++ b/usr/src/cmd/mdb/common/modules/genunix/avl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,6 +37,14 @@ extern "C" { "entries in tree" extern int avl_walk_init(mdb_walk_state_t *); +extern int avl_walk_init_named(mdb_walk_state_t *wsp, + const char *, const char *); +extern int avl_walk_init_checked(mdb_walk_state_t *wsp, + const char *, const char *, + int (*)(void *, uintptr_t, void *), void *); +extern int avl_walk_init_range(mdb_walk_state_t *wsp, uintptr_t, uintptr_t, + const char *, const char *, + int (*)(void *, uintptr_t, void *), void *); extern int avl_walk_step(mdb_walk_state_t *); extern void avl_walk_fini(mdb_walk_state_t *wsp); diff --git a/usr/src/cmd/mdb/common/modules/genunix/combined.c b/usr/src/cmd/mdb/common/modules/genunix/combined.c new file mode 100644 index 0000000000..412761a7bd --- /dev/null +++ b/usr/src/cmd/mdb/common/modules/genunix/combined.c @@ -0,0 +1,165 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <mdb/mdb_modapi.h> + +typedef struct combined_walk { + int (*cw_init)(mdb_walk_state_t *); + int (*cw_step)(mdb_walk_state_t *); + void (*cw_fini)(mdb_walk_state_t *); + struct combined_walk *cw_next; + void *cw_data; + boolean_t cw_initialized; +} combined_walk_t; + +typedef struct combined_walk_data { + uintptr_t cwd_initial_walk_addr; /* to init each walk */ + combined_walk_t *cwd_current_walk; + combined_walk_t *cwd_final_walk; /* tail pointer */ +} combined_walk_data_t; + +/* + * Initialize a combined walk to + * A) present a single concatenated series of elements from different + * structures, or + * B) select from several possible walks at runtime. + * Multiple walks are done in the same order passed to combined_walk_add(). Each + * walk is initialized with the same wsp->walk_addr. + */ +void +combined_walk_init(mdb_walk_state_t *wsp) +{ + combined_walk_data_t *cwd; + + cwd = mdb_alloc(sizeof (combined_walk_data_t), UM_SLEEP); + + cwd->cwd_initial_walk_addr = wsp->walk_addr; + cwd->cwd_current_walk = cwd->cwd_final_walk = NULL; + wsp->walk_data = cwd; +} + +static void +combined_walk_append(combined_walk_data_t *cwd, combined_walk_t *cw) +{ + if (cwd->cwd_final_walk == NULL) { + cwd->cwd_current_walk = cwd->cwd_final_walk = cw; + } else { + cwd->cwd_final_walk->cw_next = cw; + cwd->cwd_final_walk = cw; + } +} + +static combined_walk_t * +combined_walk_remove_current(combined_walk_data_t *cwd) +{ + combined_walk_t *cw = cwd->cwd_current_walk; + if (cw == NULL) { + return (NULL); + } + if (cw == cwd->cwd_final_walk) { + cwd->cwd_final_walk = cw->cw_next; + } + cwd->cwd_current_walk = cw->cw_next; + cw->cw_next = NULL; + return (cw); +} + +void +combined_walk_add(mdb_walk_state_t *wsp, + int (*walk_init)(mdb_walk_state_t *), + int (*walk_step)(mdb_walk_state_t *), + void (*walk_fini)(mdb_walk_state_t *)) +{ + combined_walk_data_t *cwd = wsp->walk_data; + combined_walk_t *cw; + + cw = mdb_alloc(sizeof (combined_walk_t), UM_SLEEP); + + cw->cw_init = walk_init; + cw->cw_step = walk_step; + cw->cw_fini = walk_fini; + cw->cw_next = NULL; + cw->cw_data = NULL; + cw->cw_initialized = B_FALSE; + + combined_walk_append(cwd, cw); +} + +int +combined_walk_step(mdb_walk_state_t *wsp) +{ + combined_walk_data_t *cwd = wsp->walk_data; + combined_walk_t *cw = cwd->cwd_current_walk; + int status; + + if (cw == NULL) { + return (WALK_DONE); + } + + if (cw->cw_initialized) { + wsp->walk_data = cw->cw_data; + } else { + wsp->walk_addr = cwd->cwd_initial_walk_addr; + status = cw->cw_init(wsp); + cw->cw_data = wsp->walk_data; + cw->cw_initialized = B_TRUE; + if (status != WALK_NEXT) { + wsp->walk_data = cwd; + return (status); + } + } + + status = cw->cw_step(wsp); + + if (status == WALK_DONE) { + (void) combined_walk_remove_current(cwd); + 
cw->cw_fini(wsp); + mdb_free(cw, sizeof (combined_walk_t)); + wsp->walk_data = cwd; + return (combined_walk_step(wsp)); + } + + wsp->walk_data = cwd; + return (status); +} + +void +combined_walk_fini(mdb_walk_state_t *wsp) +{ + combined_walk_data_t *cwd = wsp->walk_data; + combined_walk_t *cw; + + while ((cw = combined_walk_remove_current(cwd)) != NULL) { + if (cw->cw_initialized) { + wsp->walk_data = cw->cw_data; + cw->cw_fini(wsp); + } + mdb_free(cw, sizeof (combined_walk_t)); + } + + mdb_free(cwd, sizeof (combined_walk_data_t)); +} diff --git a/usr/src/cmd/mdb/common/modules/genunix/combined.h b/usr/src/cmd/mdb/common/modules/genunix/combined.h new file mode 100644 index 0000000000..f4f48c7b54 --- /dev/null +++ b/usr/src/cmd/mdb/common/modules/genunix/combined.h @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _COMBINED_H +#define _COMBINED_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <mdb/mdb_modapi.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern void combined_walk_init(mdb_walk_state_t *wsp); +extern void combined_walk_add(mdb_walk_state_t *wsp, + int (*walk_init)(mdb_walk_state_t *), + int (*walk_step)(mdb_walk_state_t *), + void (*walk_fini)(mdb_walk_state_t *)); +extern int combined_walk_step(mdb_walk_state_t *wsp); +extern void combined_walk_fini(mdb_walk_state_t *wsp); + +#ifdef __cplusplus +} +#endif + +#endif /* _COMBINED_H */ diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c index 36d61da041..8fb79f02b6 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c +++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c @@ -69,6 +69,7 @@ #include <sys/port_impl.h> #include "avl.h" +#include "combined.h" #include "contract.h" #include "cpupart_mdb.h" #include "devinfo.h" @@ -3370,9 +3371,10 @@ static const mdb_dcmd_t dcmds[] = { kmastat }, { "kmausers", "?[-ef] [cache ...]", "current medium and large users " "of the kmem allocator", kmausers, kmausers_help }, - { "kmem_cache", "?", "print kernel memory caches", kmem_cache }, - { "kmem_slabs", "?[-v] [-n cache] [-b maxbins] [-B minbinsize]", - "display slab usage per kmem cache", + { "kmem_cache", "?[-n name]", + "print kernel memory caches", kmem_cache, kmem_cache_help}, + { "kmem_slabs", "?[-v] [-n cache] [-N cache] [-b maxbins] " + "[-B minbinsize]", "display slab usage per kmem cache", kmem_slabs, kmem_slabs_help }, { "kmem_debug", NULL, "toggle kmem dcmd/walk debugging", kmem_debug }, { "kmem_log", "?[-b]", "dump kmem transaction log", kmem_log }, @@ -3705,10 +3707,11 @@ static const mdb_walker_t walkers[] = { { "kmem_log", "walk the kmem 
transaction log", kmem_log_walk_init, kmem_log_walk_step, kmem_log_walk_fini }, { "kmem_slab", "given a kmem cache, walk its slabs", - kmem_slab_walk_init, kmem_slab_walk_step, NULL }, + kmem_slab_walk_init, combined_walk_step, combined_walk_fini }, { "kmem_slab_partial", "given a kmem cache, walk its partially allocated slabs (min 1)", - kmem_slab_walk_partial_init, kmem_slab_walk_step, NULL }, + kmem_slab_walk_partial_init, combined_walk_step, + combined_walk_fini }, { "vmem", "walk vmem structures in pre-fix, depth-first order", vmem_walk_init, vmem_walk_step, vmem_walk_fini }, { "vmem_alloc", "given a vmem_t, walk its allocated vmem_segs", diff --git a/usr/src/cmd/mdb/common/modules/genunix/kmem.c b/usr/src/cmd/mdb/common/modules/genunix/kmem.c index 47ad09126f..8d4c3b8638 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/kmem.c +++ b/usr/src/cmd/mdb/common/modules/genunix/kmem.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,9 +39,12 @@ #include <sys/sysmacros.h> #include <vm/page.h> +#include "avl.h" +#include "combined.h" #include "dist.h" #include "kmem.h" #include "leaky.h" +#include "list.h" #define dprintf(x) if (mdb_debug_level) { \ mdb_printf("kmem debug: "); \ @@ -92,65 +95,19 @@ kmem_debug(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (DCMD_OK); } -typedef struct { - uintptr_t kcw_first; - uintptr_t kcw_current; -} kmem_cache_walk_t; - int kmem_cache_walk_init(mdb_walk_state_t *wsp) { - kmem_cache_walk_t *kcw; - kmem_cache_t c; - uintptr_t cp; GElf_Sym sym; - if (mdb_lookup_by_name("kmem_null_cache", &sym) == -1) { - mdb_warn("couldn't find kmem_null_cache"); + if (mdb_lookup_by_name("kmem_caches", &sym) == -1) { + mdb_warn("couldn't find kmem_caches"); return (WALK_ERR); } - cp = (uintptr_t)sym.st_value; - - if (mdb_vread(&c, sizeof (kmem_cache_t), cp) == -1) { - mdb_warn("couldn't read cache at %p", cp); - return (WALK_ERR); - } + wsp->walk_addr = (uintptr_t)sym.st_value; - kcw = mdb_alloc(sizeof (kmem_cache_walk_t), UM_SLEEP); - - kcw->kcw_first = cp; - kcw->kcw_current = (uintptr_t)c.cache_next; - wsp->walk_data = kcw; - - return (WALK_NEXT); -} - -int -kmem_cache_walk_step(mdb_walk_state_t *wsp) -{ - kmem_cache_walk_t *kcw = wsp->walk_data; - kmem_cache_t c; - int status; - - if (mdb_vread(&c, sizeof (kmem_cache_t), kcw->kcw_current) == -1) { - mdb_warn("couldn't read cache at %p", kcw->kcw_current); - return (WALK_DONE); - } - - status = wsp->walk_callback(kcw->kcw_current, &c, wsp->walk_cbdata); - - if ((kcw->kcw_current = (uintptr_t)c.cache_next) == kcw->kcw_first) - return (WALK_DONE); - - return (status); -} - -void -kmem_cache_walk_fini(mdb_walk_state_t *wsp) -{ - kmem_cache_walk_t *kcw = wsp->walk_data; - mdb_free(kcw, sizeof (kmem_cache_walk_t)); + return (list_walk_init_named(wsp, "cache list", "cache")); } int @@ -188,29 +145,134 @@ kmem_cpu_cache_walk_step(mdb_walk_state_t *wsp) return (wsp->walk_callback(caddr, &cc, wsp->walk_cbdata)); } +static int +kmem_slab_check(void *p, uintptr_t saddr, void *arg) +{ + kmem_slab_t *sp = p; + uintptr_t caddr = (uintptr_t)arg; + if ((uintptr_t)sp->slab_cache != caddr) { + mdb_warn("slab %p isn't in cache %p (in cache %p)\n", + saddr, caddr, sp->slab_cache); + return (-1); + } + + return (0); +} + +static int +kmem_partial_slab_check(void *p, uintptr_t saddr, void *arg) +{ + kmem_slab_t *sp = p; + + int rc = kmem_slab_check(p, saddr, 
arg); + if (rc != 0) { + return (rc); + } + + if (!KMEM_SLAB_IS_PARTIAL(sp)) { + mdb_warn("slab %p is not a partial slab\n", saddr); + return (-1); + } + + return (0); +} + +static int +kmem_complete_slab_check(void *p, uintptr_t saddr, void *arg) +{ + kmem_slab_t *sp = p; + + int rc = kmem_slab_check(p, saddr, arg); + if (rc != 0) { + return (rc); + } + + if (!KMEM_SLAB_IS_ALL_USED(sp)) { + mdb_warn("slab %p is not completely allocated\n", saddr); + return (-1); + } + + return (0); +} + +typedef struct { + uintptr_t kns_cache_addr; + int kns_nslabs; +} kmem_nth_slab_t; + +static int +kmem_nth_slab_check(void *p, uintptr_t saddr, void *arg) +{ + kmem_nth_slab_t *chkp = arg; + + int rc = kmem_slab_check(p, saddr, (void *)chkp->kns_cache_addr); + if (rc != 0) { + return (rc); + } + + return (chkp->kns_nslabs-- == 0 ? 1 : 0); +} + +static int +kmem_complete_slab_walk_init(mdb_walk_state_t *wsp) +{ + uintptr_t caddr = wsp->walk_addr; + + wsp->walk_addr = (uintptr_t)(caddr + + offsetof(kmem_cache_t, cache_complete_slabs)); + + return (list_walk_init_checked(wsp, "slab list", "slab", + kmem_complete_slab_check, (void *)caddr)); +} + +static int +kmem_partial_slab_walk_init(mdb_walk_state_t *wsp) +{ + uintptr_t caddr = wsp->walk_addr; + + wsp->walk_addr = (uintptr_t)(caddr + + offsetof(kmem_cache_t, cache_partial_slabs)); + + return (avl_walk_init_checked(wsp, "slab list", "slab", + kmem_partial_slab_check, (void *)caddr)); +} + int kmem_slab_walk_init(mdb_walk_state_t *wsp) { uintptr_t caddr = wsp->walk_addr; - kmem_cache_t c; if (caddr == NULL) { mdb_warn("kmem_slab doesn't support global walks\n"); return (WALK_ERR); } - if (mdb_vread(&c, sizeof (c), caddr) == -1) { - mdb_warn("couldn't read kmem_cache at %p", caddr); - return (WALK_ERR); - } - - wsp->walk_data = - (void *)(caddr + offsetof(kmem_cache_t, cache_nullslab)); - wsp->walk_addr = (uintptr_t)c.cache_nullslab.slab_next; + combined_walk_init(wsp); + combined_walk_add(wsp, + kmem_complete_slab_walk_init, list_walk_step, list_walk_fini); + combined_walk_add(wsp, + kmem_partial_slab_walk_init, avl_walk_step, avl_walk_fini); return (WALK_NEXT); } +static int +kmem_first_complete_slab_walk_init(mdb_walk_state_t *wsp) +{ + uintptr_t caddr = wsp->walk_addr; + kmem_nth_slab_t *chk; + + chk = mdb_alloc(sizeof (kmem_nth_slab_t), + UM_SLEEP | UM_GC); + chk->kns_cache_addr = caddr; + chk->kns_nslabs = 1; + wsp->walk_addr = (uintptr_t)(caddr + + offsetof(kmem_cache_t, cache_complete_slabs)); + + return (list_walk_init_checked(wsp, "slab list", "slab", + kmem_nth_slab_check, chk)); +} + int kmem_slab_walk_partial_init(mdb_walk_state_t *wsp) { @@ -227,55 +289,38 @@ kmem_slab_walk_partial_init(mdb_walk_state_t *wsp) return (WALK_ERR); } - wsp->walk_data = - (void *)(caddr + offsetof(kmem_cache_t, cache_nullslab)); - wsp->walk_addr = (uintptr_t)c.cache_freelist; + combined_walk_init(wsp); /* * Some consumers (umem_walk_step(), in particular) require at * least one callback if there are any buffers in the cache. So - * if there are *no* partial slabs, report the last full slab, if + * if there are *no* partial slabs, report the first full slab, if * any. * * Yes, this is ugly, but it's cleaner than the other possibilities. 
*/ - if ((uintptr_t)wsp->walk_data == wsp->walk_addr) - wsp->walk_addr = (uintptr_t)c.cache_nullslab.slab_prev; - - return (WALK_NEXT); -} - -int -kmem_slab_walk_step(mdb_walk_state_t *wsp) -{ - kmem_slab_t s; - uintptr_t addr = wsp->walk_addr; - uintptr_t saddr = (uintptr_t)wsp->walk_data; - uintptr_t caddr = saddr - offsetof(kmem_cache_t, cache_nullslab); - - if (addr == saddr) - return (WALK_DONE); - - if (mdb_vread(&s, sizeof (s), addr) == -1) { - mdb_warn("failed to read slab at %p", wsp->walk_addr); - return (WALK_ERR); - } - - if ((uintptr_t)s.slab_cache != caddr) { - mdb_warn("slab %p isn't in cache %p (in cache %p)\n", - addr, caddr, s.slab_cache); - return (WALK_ERR); + if (c.cache_partial_slabs.avl_numnodes == 0) { + combined_walk_add(wsp, kmem_first_complete_slab_walk_init, + list_walk_step, list_walk_fini); + } else { + combined_walk_add(wsp, kmem_partial_slab_walk_init, + avl_walk_step, avl_walk_fini); } - wsp->walk_addr = (uintptr_t)s.slab_next; - - return (wsp->walk_callback(addr, &s, wsp->walk_cbdata)); + return (WALK_NEXT); } int kmem_cache(uintptr_t addr, uint_t flags, int ac, const mdb_arg_t *argv) { kmem_cache_t c; + const char *filter = NULL; + + if (mdb_getopts(ac, argv, + 'n', MDB_OPT_STR, &filter, + NULL) != ac) { + return (DCMD_USAGE); + } if (!(flags & DCMD_ADDRSPEC)) { if (mdb_walk_dcmd("kmem_cache", "kmem_cache", ac, argv) == -1) { @@ -294,25 +339,35 @@ kmem_cache(uintptr_t addr, uint_t flags, int ac, const mdb_arg_t *argv) return (DCMD_ERR); } + if ((filter != NULL) && (strstr(c.cache_name, filter) == NULL)) + return (DCMD_OK); + mdb_printf("%0?p %-25s %04x %06x %8ld %8lld\n", addr, c.cache_name, c.cache_flags, c.cache_cflags, c.cache_bufsize, c.cache_buftotal); return (DCMD_OK); } -typedef struct kmem_slab_usage { - int ksu_refcnt; /* count of allocated buffers on slab */ -} kmem_slab_usage_t; - -typedef struct kmem_slab_stats { - int ks_slabs; /* slabs in cache */ - int ks_partial_slabs; /* partially allocated slabs in cache */ - uint64_t ks_unused_buffers; /* total unused buffers in cache */ - int ks_buffers_per_slab; /* buffers per slab */ - int ks_usage_len; /* ks_usage array length */ - kmem_slab_usage_t *ks_usage; /* partial slab usage */ - uint_t *ks_bucket; /* slab usage distribution */ -} kmem_slab_stats_t; +void +kmem_cache_help(void) +{ + mdb_printf("%s", "Print kernel memory caches.\n\n"); + mdb_dec_indent(2); + mdb_printf("%<b>OPTIONS%</b>\n"); + mdb_inc_indent(2); + mdb_printf("%s", +" -n name\n" +" name of kmem cache (or matching partial name)\n" +"\n" +"Column\tDescription\n" +"\n" +"ADDR\t\taddress of kmem cache\n" +"NAME\t\tname of kmem cache\n" +"FLAG\t\tvarious cache state flags\n" +"CFLAG\t\tcache creation flags\n" +"BUFSIZE\tobject size in bytes\n" +"BUFTOTL\tcurrent total buffers in cache (allocated and free)\n"); +} #define LABEL_WIDTH 11 static void @@ -388,15 +443,30 @@ kmem_first_partial_slab(uintptr_t addr, const kmem_slab_t *sp, boolean_t *is_slab) { /* - * The "kmem_partial_slab" walker reports the last full slab if there + * The "kmem_partial_slab" walker reports the first full slab if there * are no partial slabs (for the sake of consumers that require at least * one callback if there are any buffers in the cache). 
*/ - *is_slab = ((sp->slab_refcnt > 0) && - (sp->slab_refcnt < sp->slab_chunks)); + *is_slab = KMEM_SLAB_IS_PARTIAL(sp); return (WALK_DONE); } +typedef struct kmem_slab_usage { + int ksu_refcnt; /* count of allocated buffers on slab */ + boolean_t ksu_nomove; /* slab marked non-reclaimable */ +} kmem_slab_usage_t; + +typedef struct kmem_slab_stats { + const kmem_cache_t *ks_cp; + int ks_slabs; /* slabs in cache */ + int ks_partial_slabs; /* partially allocated slabs in cache */ + uint64_t ks_unused_buffers; /* total unused buffers in cache */ + int ks_max_buffers_per_slab; /* max buffers per slab */ + int ks_usage_len; /* ks_usage array length */ + kmem_slab_usage_t *ks_usage; /* partial slab usage */ + uint_t *ks_bucket; /* slab usage distribution */ +} kmem_slab_stats_t; + /*ARGSUSED*/ static int kmem_slablist_stat(uintptr_t addr, const kmem_slab_t *sp, @@ -406,12 +476,6 @@ kmem_slablist_stat(uintptr_t addr, const kmem_slab_t *sp, long unused; ks->ks_slabs++; - if (ks->ks_buffers_per_slab == 0) { - ks->ks_buffers_per_slab = sp->slab_chunks; - /* +1 to include a zero bucket */ - ks->ks_bucket = mdb_zalloc((ks->ks_buffers_per_slab + 1) * - sizeof (*ks->ks_bucket), UM_SLEEP | UM_GC); - } ks->ks_bucket[sp->slab_refcnt]++; unused = (sp->slab_chunks - sp->slab_refcnt); @@ -440,6 +504,7 @@ kmem_slablist_stat(uintptr_t addr, const kmem_slab_t *sp, ksu = &ks->ks_usage[ks->ks_partial_slabs - 1]; ksu->ksu_refcnt = sp->slab_refcnt; + ksu->ksu_nomove = (sp->slab_flags & KMEM_SLAB_NOMOVE); return (WALK_NEXT); } @@ -466,21 +531,23 @@ kmem_slabs(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) size_t maxbuckets = 1; size_t minbucketsize = 0; const char *filter = NULL; + const char *name = NULL; uint_t opt_v = FALSE; - boolean_t verbose = B_FALSE; + boolean_t buckets = B_FALSE; boolean_t skip = B_FALSE; if (mdb_getopts(argc, argv, 'B', MDB_OPT_UINTPTR, &minbucketsize, 'b', MDB_OPT_UINTPTR, &maxbuckets, 'n', MDB_OPT_STR, &filter, + 'N', MDB_OPT_STR, &name, 'v', MDB_OPT_SETBITS, TRUE, &opt_v, NULL) != argc) { return (DCMD_USAGE); } - if (opt_v || (maxbuckets != 1) || (minbucketsize != 0)) { - verbose = 1; + if ((maxbuckets != 1) || (minbucketsize != 0)) { + buckets = B_TRUE; } if (!(flags & DCMD_ADDRSPEC)) { @@ -497,13 +564,20 @@ kmem_slabs(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (DCMD_ERR); } - if ((filter != NULL) && (strstr(c.cache_name, filter) == NULL)) { - skip = B_TRUE; + if (name == NULL) { + skip = ((filter != NULL) && + (strstr(c.cache_name, filter) == NULL)); + } else if (filter == NULL) { + skip = (strcmp(c.cache_name, name) != 0); + } else { + /* match either -n or -N */ + skip = ((strcmp(c.cache_name, name) != 0) && + (strstr(c.cache_name, filter) == NULL)); } - if (!verbose && DCMD_HDRSPEC(flags)) { + if (!(opt_v || buckets) && DCMD_HDRSPEC(flags)) { kmem_slabs_header(); - } else if (verbose && !skip) { + } else if ((opt_v || buckets) && !skip) { if (DCMD_HDRSPEC(flags)) { kmem_slabs_header(); } else { @@ -528,6 +602,11 @@ kmem_slabs(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) } bzero(&stats, sizeof (kmem_slab_stats_t)); + stats.ks_cp = &c; + stats.ks_max_buffers_per_slab = c.cache_maxchunks; + /* +1 to include a zero bucket */ + stats.ks_bucket = mdb_zalloc((stats.ks_max_buffers_per_slab + 1) * + sizeof (*stats.ks_bucket), UM_SLEEP); cb = (mdb_walk_cb_t)kmem_slablist_stat; (void) mdb_pwalk("kmem_slab", cb, &stats, addr); @@ -550,19 +629,22 @@ kmem_slabs(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) 
stats.ks_slabs, stats.ks_partial_slabs, c.cache_buftotal, stats.ks_unused_buffers, pct, tenths_pct); - if (!verbose) { - return (DCMD_OK); - } - if (maxbuckets == 0) { - maxbuckets = stats.ks_buffers_per_slab; + maxbuckets = stats.ks_max_buffers_per_slab; } if (((maxbuckets > 1) || (minbucketsize > 0)) && (stats.ks_slabs > 0)) { mdb_printf("\n"); kmem_slabs_print_dist(stats.ks_bucket, - stats.ks_buffers_per_slab, maxbuckets, minbucketsize); + stats.ks_max_buffers_per_slab, maxbuckets, minbucketsize); + } + + mdb_free(stats.ks_bucket, (stats.ks_max_buffers_per_slab + 1) * + sizeof (*stats.ks_bucket)); + + if (!opt_v) { + return (DCMD_OK); } if (opt_v && (stats.ks_partial_slabs > 0)) { @@ -573,11 +655,16 @@ kmem_slabs(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) (stats.ks_slabs - stats.ks_partial_slabs), stats.ks_partial_slabs); if (stats.ks_partial_slabs > 0) { - mdb_printf(" (%d):", stats.ks_buffers_per_slab); + mdb_printf(" (%d):", stats.ks_max_buffers_per_slab); } for (i = 0; i < stats.ks_partial_slabs; i++) { ksu = &stats.ks_usage[i]; - mdb_printf(" %d", ksu->ksu_refcnt); + if (ksu->ksu_nomove) { + const char *symbol = "*"; + mdb_printf(" %d%s", ksu->ksu_refcnt, symbol); + } else { + mdb_printf(" %d", ksu->ksu_refcnt); + } } mdb_printf("\n\n"); } @@ -593,14 +680,16 @@ kmem_slabs(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) void kmem_slabs_help(void) { - mdb_printf("%s\n", -"Display slab usage per kmem cache.\n"); + mdb_printf("%s", +"Display slab usage per kmem cache.\n\n"); mdb_dec_indent(2); mdb_printf("%<b>OPTIONS%</b>\n"); mdb_inc_indent(2); mdb_printf("%s", " -n name\n" " name of kmem cache (or matching partial name)\n" +" -N name\n" +" exact name of kmem cache\n" " -b maxbins\n" " Print a distribution of allocated buffers per slab using at\n" " most maxbins bins. The first bin is reserved for completely\n" @@ -629,9 +718,19 @@ kmem_slabs_help(void) " list and least-used at the back (as in the example above).\n" " However, if a slab contains an allocated buffer that will not\n" " soon be freed, it would be better for that slab to be at the\n" -" front where it can get used up. Taking a slab off the partial\n" -" slab list (either with all buffers freed or all buffers\n" -" allocated) reduces cache fragmentation.\n" +" front where all of its buffers can be allocated. Taking a slab\n" +" off the partial slab list (either with all buffers freed or all\n" +" buffers allocated) reduces cache fragmentation.\n" +"\n" +" A slab's allocated buffer count representing a partial slab (9 in\n" +" the example below) may be marked as follows:\n" +"\n" +" 9* An asterisk indicates that kmem has marked the slab non-\n" +" reclaimable because the kmem client refused to move one of the\n" +" slab's buffers. Since kmem does not expect to completely free the\n" +" slab, it moves it to the front of the list in the hope of\n" +" completely allocating it instead. 
A slab marked with an asterisk\n" +" stays marked for as long as it remains on the partial slab list.\n" "\n" "Column\t\tDescription\n" "\n" @@ -2729,8 +2828,8 @@ bufctl_history_callback(uintptr_t addr, const void *ign, void *arg) void bufctl_help(void) { - mdb_printf("%s\n", -"Display the contents of kmem_bufctl_audit_ts, with optional filtering.\n"); + mdb_printf("%s", +"Display the contents of kmem_bufctl_audit_ts, with optional filtering.\n\n"); mdb_dec_indent(2); mdb_printf("%<b>OPTIONS%</b>\n"); mdb_inc_indent(2); @@ -3509,8 +3608,8 @@ vmem(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) void vmem_seg_help(void) { - mdb_printf("%s\n", -"Display the contents of vmem_seg_ts, with optional filtering.\n" + mdb_printf("%s", +"Display the contents of vmem_seg_ts, with optional filtering.\n\n" "\n" "A vmem_seg_t represents a range of addresses (or arbitrary numbers),\n" "representing a single chunk of data. Only ALLOC segments have debugging\n" @@ -4180,7 +4279,7 @@ kmem_init(void) { mdb_walker_t w = { "kmem_cache", "walk list of kmem caches", kmem_cache_walk_init, - kmem_cache_walk_step, kmem_cache_walk_fini + list_walk_step, list_walk_fini }; /* diff --git a/usr/src/cmd/mdb/common/modules/genunix/kmem.h b/usr/src/cmd/mdb/common/modules/genunix/kmem.h index 3d56413655..2d74dad9e3 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/kmem.h +++ b/usr/src/cmd/mdb/common/modules/genunix/kmem.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,15 +35,12 @@ extern "C" { #endif extern int kmem_cache_walk_init(mdb_walk_state_t *); -extern int kmem_cache_walk_step(mdb_walk_state_t *); -extern void kmem_cache_walk_fini(mdb_walk_state_t *); extern int kmem_cpu_cache_walk_init(mdb_walk_state_t *); extern int kmem_cpu_cache_walk_step(mdb_walk_state_t *); extern int kmem_slab_walk_init(mdb_walk_state_t *); extern int kmem_slab_walk_partial_init(mdb_walk_state_t *); -extern int kmem_slab_walk_step(mdb_walk_state_t *); extern int kmem_hash_walk_init(mdb_walk_state_t *wsp); extern int kmem_hash_walk_step(mdb_walk_state_t *wsp); @@ -104,11 +101,12 @@ extern int vmem(uintptr_t, uint_t, int, const mdb_arg_t *); extern int vmem_seg(uintptr_t, uint_t, int, const mdb_arg_t *); extern int kmalog(uintptr_t, uint_t, int, const mdb_arg_t *); extern int kmausers(uintptr_t, uint_t, int, const mdb_arg_t *); +extern void kmem_cache_help(void); +extern void kmem_slabs_help(void); extern void whatis_help(void); extern void bufctl_help(void); extern void vmem_seg_help(void); extern void kmausers_help(void); -extern void kmem_slabs_help(void); extern int whatthread(uintptr_t, uint_t, int, const mdb_arg_t *); diff --git a/usr/src/cmd/mdb/common/modules/genunix/list.c b/usr/src/cmd/mdb/common/modules/genunix/list.c index e73174026d..58e21ebc6f 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/list.c +++ b/usr/src/cmd/mdb/common/modules/genunix/list.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
@@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,21 +29,47 @@ #include <sys/list.h> typedef struct list_walk_data { - uintptr_t lw_start; - size_t lw_size; - size_t lw_offset; - void *lw_obj; + uintptr_t lw_head; /* address of list head */ + size_t lw_size; /* size of list element */ + size_t lw_offset; /* list element linkage offset */ + void *lw_obj; /* buffer of lw_size to hold list element */ + uintptr_t lw_end; /* last node in specified range */ + const char *lw_elem_name; + int (*lw_elem_check)(void *, uintptr_t, void *); + void *lw_elem_check_arg; } list_walk_data_t; +/* + * Initialize a forward walk through a list. + * + * begin and end optionally specify objects other than the first and last + * objects in the list; either or both may be NULL (defaulting to first and + * last). + * + * list_name and element_name specify command-specific labels other than + * "list_t" and "list element" for use in error messages. + * + * element_check() returns -1, 1, or 0: abort the walk with an error, stop + * without an error, or allow the normal callback; arg is an optional user + * argument to element_check(). + */ int -list_walk_init(mdb_walk_state_t *wsp) +list_walk_init_range(mdb_walk_state_t *wsp, uintptr_t begin, uintptr_t end, + const char *list_name, const char *element_name, + int (*element_check)(void *, uintptr_t, void *), void *arg) { list_walk_data_t *lwd; list_t list; + if (list_name == NULL) + list_name = "list_t"; + if (element_name == NULL) + element_name = "list element"; + lwd = mdb_alloc(sizeof (list_walk_data_t), UM_SLEEP); if (mdb_vread(&list, sizeof (list_t), wsp->walk_addr) == -1) { - mdb_warn("failed to read list_t at %#lx", wsp->walk_addr); + mdb_warn("failed to read %s at %#lx", list_name, + wsp->walk_addr); mdb_free(lwd, sizeof (list_walk_data_t)); return (WALK_ERR); } @@ -52,15 +77,44 @@ list_walk_init(mdb_walk_state_t *wsp) lwd->lw_size = list.list_size; lwd->lw_offset = list.list_offset; lwd->lw_obj = mdb_alloc(list.list_size, UM_SLEEP); - lwd->lw_start = (uintptr_t)&((list_t *)wsp->walk_addr)->list_head; + lwd->lw_head = (uintptr_t)&((list_t *)wsp->walk_addr)->list_head; + lwd->lw_end = (end == NULL ? NULL : end + lwd->lw_offset); + lwd->lw_elem_name = element_name; + lwd->lw_elem_check = element_check; + lwd->lw_elem_check_arg = arg; - wsp->walk_addr = (uintptr_t)list.list_head.list_next; + wsp->walk_addr = (begin == NULL + ? 
(uintptr_t)list.list_head.list_next + : begin + lwd->lw_offset); wsp->walk_data = lwd; return (WALK_NEXT); } int +list_walk_init(mdb_walk_state_t *wsp) +{ + return (list_walk_init_range(wsp, NULL, NULL, NULL, NULL, NULL, NULL)); +} + +int +list_walk_init_named(mdb_walk_state_t *wsp, + const char *list_name, const char *element_name) +{ + return (list_walk_init_range(wsp, NULL, NULL, list_name, element_name, + NULL, NULL)); +} + +int +list_walk_init_checked(mdb_walk_state_t *wsp, + const char *list_name, const char *element_name, + int (*element_check)(void *, uintptr_t, void *), void *arg) +{ + return (list_walk_init_range(wsp, NULL, NULL, list_name, element_name, + element_check, arg)); +} + +int list_walk_step(mdb_walk_state_t *wsp) { list_walk_data_t *lwd = wsp->walk_data; @@ -68,14 +122,26 @@ list_walk_step(mdb_walk_state_t *wsp) list_node_t *node; int status; - if (wsp->walk_addr == lwd->lw_start) + if (wsp->walk_addr == lwd->lw_head) + return (WALK_DONE); + + if (lwd->lw_end != NULL && wsp->walk_addr == lwd->lw_end) return (WALK_DONE); if (mdb_vread(lwd->lw_obj, lwd->lw_size, addr) == -1) { - mdb_warn("failed to read list element at %#lx", addr); + mdb_warn("failed to read %s at %#lx", lwd->lw_elem_name, addr); return (WALK_ERR); } + if (lwd->lw_elem_check != NULL) { + int rc = lwd->lw_elem_check(lwd->lw_obj, addr, + lwd->lw_elem_check_arg); + if (rc == -1) + return (WALK_ERR); + else if (rc == 1) + return (WALK_DONE); + } + status = wsp->walk_callback(addr, lwd->lw_obj, wsp->walk_cbdata); node = (list_node_t *)((uintptr_t)lwd->lw_obj + lwd->lw_offset); wsp->walk_addr = (uintptr_t)node->list_next; diff --git a/usr/src/cmd/mdb/common/modules/genunix/list.h b/usr/src/cmd/mdb/common/modules/genunix/list.h index 10581cc900..e50ea50806 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/list.h +++ b/usr/src/cmd/mdb/common/modules/genunix/list.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -38,9 +37,17 @@ extern "C" { #define LIST_WALK_NAME "list" #define LIST_WALK_DESC "walk a linked list" -int list_walk_init(mdb_walk_state_t *wsp); -int list_walk_step(mdb_walk_state_t *wsp); -void list_walk_fini(mdb_walk_state_t *wsp); +extern int list_walk_init(mdb_walk_state_t *wsp); +extern int list_walk_init_named(mdb_walk_state_t *wsp, + const char *, const char *); +extern int list_walk_init_checked(mdb_walk_state_t *wsp, + const char *, const char *, + int (*)(void *, uintptr_t, void *), void *); +extern int list_walk_init_range(mdb_walk_state_t *wsp, uintptr_t, uintptr_t, + const char *, const char *, + int (*)(void *, uintptr_t, void *), void *); +extern int list_walk_step(mdb_walk_state_t *wsp); +extern void list_walk_fini(mdb_walk_state_t *wsp); #ifdef __cplusplus } diff --git a/usr/src/common/avl/avl.c b/usr/src/common/avl/avl.c index 7403e81301..c9727c643b 100644 --- a/usr/src/common/avl/avl.c +++ b/usr/src/common/avl/avl.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -808,6 +808,64 @@ avl_remove(avl_tree_t *tree, void *data) } while (parent != NULL); } +#define AVL_REINSERT(tree, obj) \ + avl_remove((tree), (obj)); \ + avl_add((tree), (obj)) + +boolean_t +avl_update_lt(avl_tree_t *t, void *obj) +{ + void *neighbor; + + ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) || + (t->avl_compar(obj, neighbor) <= 0)); + + neighbor = AVL_PREV(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +avl_update_gt(avl_tree_t *t, void *obj) +{ + void *neighbor; + + ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) || + (t->avl_compar(obj, neighbor) >= 0)); + + neighbor = AVL_NEXT(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +avl_update(avl_tree_t *t, void *obj) +{ + void *neighbor; + + neighbor = AVL_PREV(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + neighbor = AVL_NEXT(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + /* * initialize a new AVL tree */ @@ -853,6 +911,12 @@ avl_numnodes(avl_tree_t *tree) return (tree->avl_numnodes); } +boolean_t +avl_is_empty(avl_tree_t *tree) +{ + ASSERT(tree); + return (tree->avl_numnodes == 0); +} #define CHILDBIT (1L) diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c index 9784131fc6..c3ba5024a4 100644 --- a/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c +++ b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -111,7 +111,7 @@ lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf) if (beg+size > off && off >= 0) uiobuf->error = uiomove(uaddr+(off-beg), size-(off-beg), - UIO_READ, uiobuf->uiop); + UIO_READ, uiobuf->uiop); uiobuf->beg += size; } @@ -249,11 +249,9 @@ lxpr_unlock(proc_t *p) void lxpr_initnodecache() { - lxpr_node_cache = - kmem_cache_create(LXPRCACHE_NAME, - sizeof (lxpr_node_t), 0, - lxpr_node_constructor, lxpr_node_destructor, NULL, - NULL, NULL, 0); + lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0); } void @@ -269,10 +267,12 @@ lxpr_node_constructor(void *buf, void *un, int kmflags) lxpr_node_t *lxpnp = buf; vnode_t *vp; - vp = lxpnp->lxpr_vnode = vn_alloc(KM_SLEEP); + vp = lxpnp->lxpr_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); (void) vn_setops(vp, lxpr_vnodeops); - vp->v_data = (caddr_t)lxpnp; + vp->v_data = lxpnp; return (0); } diff --git a/usr/src/uts/common/crypto/core/kcf_sched.c b/usr/src/uts/common/crypto/core/kcf_sched.c index ada6f01470..4f21f5ffd4 100644 --- a/usr/src/uts/common/crypto/core/kcf_sched.c +++ b/usr/src/uts/common/crypto/core/kcf_sched.c @@ -1148,6 +1148,7 @@ kcf_areq_cache_constructor(void *buf, void *cdrarg, int kmflags) kcf_areq_node_t *areq = (kcf_areq_node_t *)buf; areq->an_type = CRYPTO_ASYNCH; + areq->an_refcnt = 0; mutex_init(&areq->an_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&areq->an_done, NULL, CV_DEFAULT, NULL); cv_init(&areq->an_turn_cv, NULL, CV_DEFAULT, NULL); @@ -1176,6 +1177,7 @@ kcf_context_cache_constructor(void *buf, void *cdrarg, int kmflags) { kcf_context_t *kctx = (kcf_context_t *)buf; + kctx->kc_refcnt = 0; mutex_init(&kctx->kc_in_use_lock, NULL, MUTEX_DEFAULT, NULL); return (0); diff --git a/usr/src/uts/common/fs/dev/sdev_subr.c b/usr/src/uts/common/fs/dev/sdev_subr.c index e1b1abea0d..40d2b7962e 100644 --- a/usr/src/uts/common/fs/dev/sdev_subr.c +++ b/usr/src/uts/common/fs/dev/sdev_subr.c @@ -184,9 +184,7 @@ sdev_prof_free(struct sdev_node *dv) bzero(&dv->sdev_prof, sizeof (dv->sdev_prof)); } -/* - * sdev_node cache constructor - */ +/* sdev_node cache constructor */ /*ARGSUSED1*/ static int i_sdev_node_ctor(void *buf, void *cfarg, int flag) @@ -194,17 +192,17 @@ i_sdev_node_ctor(void *buf, void *cfarg, int flag) struct sdev_node *dv = (struct sdev_node *)buf; struct vnode *vp; - ASSERT(flag == KM_SLEEP); - bzero(buf, sizeof (struct sdev_node)); + vp = dv->sdev_vnode = vn_alloc(flag); + if (vp == NULL) { + return (-1); + } + vp->v_data = dv; rw_init(&dv->sdev_contents, NULL, RW_DEFAULT, NULL); - dv->sdev_vnode = vn_alloc(KM_SLEEP); - vp = SDEVTOV(dv); - vp->v_data = (caddr_t)dv; return (0); } -/* sdev_node destructor for kmem cache */ +/* sdev_node cache destructor */ /*ARGSUSED1*/ static void i_sdev_node_dtor(void *buf, void *arg) diff --git a/usr/src/uts/common/fs/devfs/devfs_subr.c b/usr/src/uts/common/fs/devfs/devfs_subr.c index d4b789a1d6..2acd964d04 100644 --- a/usr/src/uts/common/fs/devfs/devfs_subr.c +++ b/usr/src/uts/common/fs/devfs/devfs_subr.c @@ -135,9 +135,7 @@ extern dev_info_t *clone_dip; extern major_t clone_major; extern struct dev_ops *ddi_hold_driver(major_t); -/* - * dv_node cache constructor, destructor, can cache creation - */ +/* dev_info node cache constructor */ /*ARGSUSED1*/ static int i_dv_node_ctor(void *buf, void *cfarg, int flag) @@ -146,18 +144,16 @@ i_dv_node_ctor(void *buf, void *cfarg, int flag) struct vnode *vp; bzero(buf, sizeof (struct dv_node)); - - /* initialize persistent parts of 
dv_node */ + vp = dv->dv_vnode = vn_alloc(flag); + if (vp == NULL) { + return (-1); + } + vp->v_data = dv; rw_init(&dv->dv_contents, NULL, RW_DEFAULT, NULL); - - /* allocate vnode and initialize link back to dv_node */ - dv->dv_vnode = vn_alloc(KM_SLEEP); - vp = DVTOV(dv); - vp->v_data = (caddr_t)dv; return (0); } -/* dev_info node destructor for kmem cache */ +/* dev_info node cache destructor */ /*ARGSUSED1*/ static void i_dv_node_dtor(void *buf, void *arg) @@ -183,7 +179,7 @@ dv_node_cache_init() tsd_create(&devfs_clean_key, NULL); } -/* initialize dev_info node cache */ +/* destroy dev_info node cache */ void dv_node_cache_fini() { @@ -1233,8 +1229,7 @@ dv_filldir(struct dv_node *ddv) pdevi = ddv->dv_devi; if (ndi_devi_config(pdevi, NDI_NO_EVENT) != NDI_SUCCESS) { - dcmn_err3(("dv_filldir: config error %s\n", - ddv->dv_name)); + dcmn_err3(("dv_filldir: config error %s\n", ddv->dv_name)); } ndi_devi_enter(pdevi, &circ); diff --git a/usr/src/uts/common/fs/dnlc.c b/usr/src/uts/common/fs/dnlc.c index c8ba22e012..ef44a25622 100644 --- a/usr/src/uts/common/fs/dnlc.c +++ b/usr/src/uts/common/fs/dnlc.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,6 +67,28 @@ */ /* + * We want to be able to identify files that are referenced only by the DNLC. + * When adding a reference from the DNLC, call VN_HOLD_DNLC instead of VN_HOLD, + * since multiple DNLC references should only be counted once in v_count. This + * file contains only two(2) calls to VN_HOLD, renamed VN_HOLD_CALLER in the + * hope that no one will mistakenly add a VN_HOLD to this file. (Unfortunately + * it is not possible to #undef VN_HOLD and retain VN_HOLD_CALLER. Ideally a + * Makefile rule would grep uncommented C tokens to check that VN_HOLD is + * referenced only once in this file, to define VN_HOLD_CALLER.) + */ +#define VN_HOLD_CALLER VN_HOLD +#define VN_HOLD_DNLC(vp) { \ + mutex_enter(&(vp)->v_lock); \ + if ((vp)->v_count_dnlc == 0) \ + (vp)->v_count++; \ + (vp)->v_count_dnlc++; \ + mutex_exit(&(vp)->v_lock); \ +} +#define VN_RELE_DNLC(vp) { \ + vn_rele_dnlc(vp); \ +} + +/* * Tunable nc_hashavelen is the average length desired for this chain, from * which the size of the nc_hash table is derived at create time. */ @@ -387,13 +409,14 @@ dnlc_init() * so that it never goes away (VOP_INACTIVE isn't called on it). */ negative_cache_vnode.v_count = 1; + negative_cache_vnode.v_count_dnlc = 0; /* * Initialise kstats - both the old compatability raw kind and * the more extensive named stats. 
*/ ksp = kstat_create("unix", 0, "ncstats", "misc", KSTAT_TYPE_RAW, - sizeof (struct ncstats), KSTAT_FLAG_VIRTUAL); + sizeof (struct ncstats), KSTAT_FLAG_VIRTUAL); if (ksp) { ksp->ks_data = (void *) &ncstats; kstat_install(ksp); @@ -433,9 +456,9 @@ dnlc_enter(vnode_t *dp, char *name, vnode_t *vp) if ((ncp = dnlc_get(namlen)) == NULL) return; ncp->dp = dp; - VN_HOLD(dp); + VN_HOLD_DNLC(dp); ncp->vp = vp; - VN_HOLD(vp); + VN_HOLD_DNLC(vp); bcopy(name, ncp->name, namlen + 1); /* name and null */ ncp->hash = hash; hp = &nc_hash[hash & nc_hashmask]; @@ -445,12 +468,11 @@ dnlc_enter(vnode_t *dp, char *name, vnode_t *vp) mutex_exit(&hp->hash_lock); ncstats.dbl_enters++; ncs.ncs_dbl_enters.value.ui64++; - VN_RELE(dp); - VN_RELE(vp); + VN_RELE_DNLC(dp); + VN_RELE_DNLC(vp); dnlc_free(ncp); /* crfree done here */ TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END, - "dnlc_enter_end:(%S) %d", - "dbl enter", ncstats.dbl_enters); + "dnlc_enter_end:(%S) %d", "dbl enter", ncstats.dbl_enters); return; } /* @@ -508,9 +530,9 @@ dnlc_update(vnode_t *dp, char *name, vnode_t *vp) return; } ncp->dp = dp; - VN_HOLD(dp); + VN_HOLD_DNLC(dp); ncp->vp = vp; - VN_HOLD(vp); + VN_HOLD_DNLC(vp); bcopy(name, ncp->name, namlen + 1); /* name and null */ ncp->hash = hash; hp = &nc_hash[hash & nc_hashmask]; @@ -521,21 +543,21 @@ dnlc_update(vnode_t *dp, char *name, vnode_t *vp) tvp = tcp->vp; tcp->vp = vp; mutex_exit(&hp->hash_lock); - VN_RELE(tvp); + VN_RELE_DNLC(tvp); ncstats.enters++; ncs.ncs_enters.value.ui64++; TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END, "dnlc_update_end:(%S) %d", "done", ncstats.enters); } else { mutex_exit(&hp->hash_lock); - VN_RELE(vp); + VN_RELE_DNLC(vp); ncstats.dbl_enters++; ncs.ncs_dbl_enters.value.ui64++; TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END, "dnlc_update_end:(%S) %d", "dbl enter", ncstats.dbl_enters); } - VN_RELE(dp); + VN_RELE_DNLC(dp); dnlc_free(ncp); /* crfree done here */ return; } @@ -612,7 +634,7 @@ dnlc_lookup(vnode_t *dp, char *name) * put a hold on it. */ vp = ncp->vp; - VN_HOLD(vp); + VN_HOLD_CALLER(vp); /* VN_HOLD 1 of 2 in this file */ mutex_exit(&hp->hash_lock); ncstats.hits++; ncs.ncs_hits.value.ui64++; @@ -620,8 +642,8 @@ dnlc_lookup(vnode_t *dp, char *name) ncs.ncs_neg_hits.value.ui64++; } TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END, - "dnlc_lookup_end:%S %d vp %x name %s", - "hit", ncstats.hits, vp, name); + "dnlc_lookup_end:%S %d vp %x name %s", "hit", + ncstats.hits, vp, name); return (vp); } depth++; @@ -631,7 +653,7 @@ dnlc_lookup(vnode_t *dp, char *name) ncstats.misses++; ncs.ncs_misses.value.ui64++; TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END, - "dnlc_lookup_end:%S %d vp %x name %s", "miss", ncstats.misses, + "dnlc_lookup_end:%S %d vp %x name %s", "miss", ncstats.misses, NULL, name); return (NULL); } @@ -659,8 +681,8 @@ dnlc_remove(vnode_t *dp, char *name) */ nc_rmhash(ncp); mutex_exit(&hp->hash_lock); - VN_RELE(ncp->vp); - VN_RELE(ncp->dp); + VN_RELE_DNLC(ncp->vp); + VN_RELE_DNLC(ncp->dp); dnlc_free(ncp); return; } @@ -707,7 +729,7 @@ dnlc_purge() /* Release holds on all the vnodes now that we have no locks */ for (i = 0; i < index; i++) { - VN_RELE(nc_rele[i]); + VN_RELE_DNLC(nc_rele[i]); } if (ncp != (ncache_t *)nch) { nch--; /* Do current hash chain again */ @@ -716,9 +738,8 @@ dnlc_purge() } /* - * Purge any cache entries referencing a vnode. - * Exit as soon as the vnode reference count goes to 1, as the caller - * must hold a reference, and the dnlc can therefore have no more. + * Purge any cache entries referencing a vnode. 
Exit as soon as the dnlc + * reference count goes to zero (the caller still holds a reference). */ void dnlc_purge_vp(vnode_t *vp) @@ -729,7 +750,7 @@ dnlc_purge_vp(vnode_t *vp) vnode_t *nc_rele[DNLC_MAX_RELE]; ASSERT(vp->v_count > 0); - if (vp->v_count == 1) { + if (vp->v_count_dnlc == 0) { return; } @@ -764,11 +785,11 @@ dnlc_purge_vp(vnode_t *vp) /* Release holds on all the vnodes now that we have no locks */ while (index) { - VN_RELE(nc_rele[--index]); + VN_RELE_DNLC(nc_rele[--index]); } - if (vp->v_count == 1) { - return; /* no more dnlc references */ + if (vp->v_count_dnlc == 0) { + return; } if (ncp != (ncache_t *)nch) { @@ -830,7 +851,7 @@ dnlc_purge_vfsp(vfs_t *vfsp, int count) mutex_exit(&nch->hash_lock); /* Release holds on all the vnodes now that we have no locks */ for (i = 0; i < index; i++) { - VN_RELE(nc_rele[i]); + VN_RELE_DNLC(nc_rele[i]); } if (count != 0 && n >= count) { return (n); @@ -889,8 +910,8 @@ dnlc_fs_purge1(vnodeops_t *vop) if (ncp != (ncache_t *)hp) { nc_rmhash(ncp); mutex_exit(&hp->hash_lock); - VN_RELE(ncp->dp); - VN_RELE(vp) + VN_RELE_DNLC(ncp->dp); + VN_RELE_DNLC(vp) dnlc_free(ncp); ncs.ncs_purge_total.value.ui64++; return (1); @@ -932,7 +953,8 @@ dnlc_reverse_lookup(vnode_t *vp, char *buf, size_t buflen) bcopy(ncp->name, buf, ncp->namlen); buf[ncp->namlen] = '\0'; pvp = ncp->dp; - VN_HOLD(pvp); + /* VN_HOLD 2 of 2 in this file */ + VN_HOLD_CALLER(pvp); mutex_exit(&nch->hash_lock); return (pvp); } @@ -1101,8 +1123,8 @@ found: */ nc_rmhash(ncp); mutex_exit(&hp->hash_lock); - VN_RELE(vp); - VN_RELE(ncp->dp); + VN_RELE_DNLC(vp); + VN_RELE_DNLC(ncp->dp); dnlc_free(ncp); } while (dnlc_nentries > low_water); @@ -1329,7 +1351,7 @@ ok: * then free this cache */ if ((dcp->dc_num_entries + dcp->dc_num_free) > - dnlc_dir_max_size) { + dnlc_dir_max_size) { mutex_exit(&dcap->dca_lock); dnlc_dir_purge(dcap); kmem_free(dep, sizeof (dcentry_t) - 1 + namlen); @@ -1434,7 +1456,7 @@ ok: dcp = (dircache_t *)dcap->dca_dircache; if (VALID_DIR_CACHE(dcp)) { if ((dcp->dc_num_entries + dcp->dc_num_free) > - dnlc_dir_max_size) { + dnlc_dir_max_size) { mutex_exit(&dcap->dca_lock); dnlc_dir_purge(dcap); kmem_cache_free(dnlc_dir_space_cache, dfp); diff --git a/usr/src/uts/common/fs/fifofs/fifosubr.c b/usr/src/uts/common/fs/fifofs/fifosubr.c index 4739875919..c400652f5a 100644 --- a/usr/src/uts/common/fs/fifofs/fifosubr.c +++ b/usr/src/uts/common/fs/fifofs/fifosubr.c @@ -139,6 +139,8 @@ static void fifo_fastturnoff(fifonode_t *); static void fifo_reinit_vp(vnode_t *); +static void fnode_destructor(void *, void *); + /* * Constructor/destructor routines for fifos and pipes. * @@ -168,7 +170,6 @@ static void fifo_reinit_vp(vnode_t *); * deducing the number of fnodes from the total size. Thus, the fnode * constructor does most of the work for the pipe constructor. 
*/ -/*ARGSUSED1*/ static int fnode_constructor(void *buf, void *cdrarg, int kmflags) { @@ -185,7 +186,12 @@ fnode_constructor(void *buf, void *cdrarg, int kmflags) vnode_t *vp; - vp = vn_alloc(KM_SLEEP); + vp = vn_alloc(kmflags); + if (vp == NULL) { + fnp->fn_vnode = NULL; /* mark for destructor */ + fnode_destructor(buf, cdrarg); + return (-1); + } fnp->fn_vnode = vp; fnp->fn_lock = flp; @@ -233,6 +239,10 @@ fnode_destructor(void *buf, void *cdrarg) vnode_t *vp = FTOV(fnp); + if (vp == NULL) { + return; /* constructor failed here */ + } + ASSERT(fnp->fn_mp == NULL); ASSERT(fnp->fn_count == 0); ASSERT(fnp->fn_lock == flp); @@ -831,7 +841,7 @@ fiforemove(fifonode_t *fnp) */ if (fnode != NULL && fnode == fnp && !fnode->fn_nextp && !fnode->fn_backp) { - fifoalloc[idx] = NULL; + fifoalloc[idx] = NULL; } else { for (; fnode; fnode = fnode->fn_nextp) { @@ -919,8 +929,7 @@ fifo_connld(struct vnode **vpp, int flag, cred_t *crp) 0 || (error = fifo_stropen(&vp2, flag, filep->f_cred, 0, 0)) != 0) { #if DEBUG - cmn_err(CE_NOTE, "fifo stropen failed error 0x%x", - error); + cmn_err(CE_NOTE, "fifo stropen failed error 0x%x", error); #endif /* * this will call fifo_close and VN_RELE on vp1 diff --git a/usr/src/uts/common/fs/sockfs/socksctp.c b/usr/src/uts/common/fs/sockfs/socksctp.c index 8f9ca22255..6037b33a3c 100644 --- a/usr/src/uts/common/fs/sockfs/socksctp.c +++ b/usr/src/uts/common/fs/sockfs/socksctp.c @@ -164,6 +164,7 @@ sosctp_sock_constructor(void *buf, void *cdrarg, int kmflags) so->so_discon_ind_mp = NULL; so->so_ux_bound_vp = NULL; so->so_unbind_mp = NULL; + so->so_ops = NULL; so->so_accessvp = NULL; so->so_priv = NULL; @@ -219,7 +220,8 @@ sosctp_sock_destructor(void *buf, void *cdrarg) ASSERT(so->so_discon_ind_mp == NULL); ASSERT(so->so_ux_bound_vp == NULL); ASSERT(so->so_unbind_mp == NULL); - ASSERT(so->so_ops == &sosctp_sonodeops || + ASSERT(so->so_ops == NULL || + so->so_ops == &sosctp_sonodeops || so->so_ops == &sosctp_seq_sonodeops); ASSERT(ss->ss_rxdata == NULL); diff --git a/usr/src/uts/common/fs/sockfs/socksdp.c b/usr/src/uts/common/fs/sockfs/socksdp.c index b8482b90b1..8ee0e808cf 100755 --- a/usr/src/uts/common/fs/sockfs/socksdp.c +++ b/usr/src/uts/common/fs/sockfs/socksdp.c @@ -126,6 +126,7 @@ sosdp_sock_constructor(void *buf, void *cdrarg, int kmflags) so->so_discon_ind_mp = NULL; so->so_ux_bound_vp = NULL; so->so_unbind_mp = NULL; + so->so_ops = NULL; so->so_accessvp = NULL; so->so_priv = NULL; @@ -174,7 +175,7 @@ sosdp_sock_destructor(void *buf, void *cdrarg) ASSERT(so->so_discon_ind_mp == NULL); ASSERT(so->so_ux_bound_vp == NULL); ASSERT(so->so_unbind_mp == NULL); - ASSERT(so->so_ops == &sosdp_sonodeops); + ASSERT(so->so_ops == NULL || so->so_ops == &sosdp_sonodeops); ASSERT(vn_matchops(vp, socksdp_vnodeops)); ASSERT(vp->v_data == (caddr_t)so); @@ -303,7 +304,7 @@ sosdp_create(vnode_t *accessvp, int domain, int type, int protocol, cred_t *cr; dprint(4, ("Inside sosdp_create: domain:%d proto:%d type:%d", - domain, protocol, type)); + domain, protocol, type)); if (is_system_labeled()) { *errorp = EOPNOTSUPP; @@ -782,7 +783,7 @@ sosdp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) } if (uiop->uio_fmode & (FNDELAY|FNONBLOCK)) - flags |= MSG_DONTWAIT; + flags |= MSG_DONTWAIT; count = uiop->uio_resid; @@ -1109,7 +1110,7 @@ sosdp_setsockopt(struct sonode *so, int level, int option_name, } dprint(2, ("sosdp_setsockopt (%d) - conn %p %d %d \n", - so->so_type, conn, level, option_name)); + so->so_type, conn, level, option_name)); if (conn != NULL) { 
mutex_exit(&so->so_lock); error = sdp_set_opt(conn, level, option_name, optval, optlen); @@ -1191,12 +1192,12 @@ sosdp_setsockopt(struct sonode *so, int level, int option_name, if (intvalue != 0) { dprintso(so, 1, ("sosdp_setsockopt: setting 0x%x\n", - option_name)); + option_name)); so->so_options |= option_name; } else { dprintso(so, 1, ("sosdp_setsockopt: clearing 0x%x\n", - option_name)); + option_name)); so->so_options &= ~option_name; } break; @@ -1231,7 +1232,7 @@ sosdp_setsockopt(struct sonode *so, int level, int option_name, error == EINVAL) && handled) { dprintso(so, 1, ("sosdp_setsockopt: ignoring error %d " - "for 0x%x\n", error, option_name)); + "for 0x%x\n", error, option_name)); error = 0; } } @@ -1426,7 +1427,7 @@ sdp_sock_xmitted(void *handle, int writeable) struct sonode *so = &ss->ss_so; dprint(4, ("sosdp_sock_xmitted: so:%p priv:%p txq:%d", - (void *)so, so->so_priv, writeable)); + (void *)so, so->so_priv, writeable)); mutex_enter(&so->so_lock); ASSERT(so->so_priv != NULL); /* closed conn */ @@ -1491,7 +1492,7 @@ sdp_sock_connfail(void *handle, int error) struct sonode *so = &ss->ss_so; dprint(3, ("sosdp_conn Failed: so:%p priv:%p", (void *)so, - so->so_priv)); + so->so_priv)); mutex_enter(&so->so_lock); ASSERT(so->so_priv != NULL); /* closed conn */ so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c index c857c34225..e32334b0c7 100644 --- a/usr/src/uts/common/fs/sockfs/socksubr.c +++ b/usr/src/uts/common/fs/sockfs/socksubr.c @@ -580,6 +580,13 @@ socktpi_constructor(void *buf, void *cdrarg, int kmflags) struct sonode *so = buf; struct vnode *vp; + vp = so->so_vnode = vn_alloc(kmflags); + if (vp == NULL) { + return (-1); + } + vn_setops(vp, socktpi_vnodeops); + vp->v_data = so; + so->so_direct = NULL; so->so_nl7c_flags = 0; @@ -598,12 +605,6 @@ socktpi_constructor(void *buf, void *cdrarg, int kmflags) so->so_faddr_sa = NULL; so->so_ops = &sotpi_sonodeops; - vp = vn_alloc(KM_SLEEP); - so->so_vnode = vp; - - vn_setops(vp, socktpi_vnodeops); - vp->v_data = (caddr_t)so; - mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&so->so_plumb_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL); @@ -637,7 +638,7 @@ socktpi_destructor(void *buf, void *cdrarg) ASSERT(so->so_ops == &sotpi_sonodeops); ASSERT(vn_matchops(vp, socktpi_vnodeops)); - ASSERT(vp->v_data == (caddr_t)so); + ASSERT(vp->v_data == so); vn_free(vp); @@ -921,8 +922,7 @@ so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess, struct sonode *so2; int error; - dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", - so, soun->sun_path)); + dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", so, soun->sun_path)); error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); if (error) { @@ -1129,8 +1129,7 @@ so_ux_addr_xlate(struct sonode *so, struct sockaddr *name, so->so_ux_faddr.soua_magic = SOU_MAGIC_EXPLICIT; addr = &so->so_ux_faddr; addrlen = (socklen_t)sizeof (so->so_ux_faddr); - dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n", - addrlen, vp)); + dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n", addrlen, vp)); VN_RELE(vp); *addrp = addr; *addrlenp = (socklen_t)addrlen; @@ -2038,8 +2037,7 @@ pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen) bcopy(addr, &sin, sizeof (sin)); (void) sprintf(buf, "(len %d) %x/%d", - addrlen, ntohl(sin.sin_addr.s_addr), - ntohs(sin.sin_port)); + addrlen, ntohl(sin.sin_addr.s_addr), 
ntohs(sin.sin_port)); break; } case AF_INET6: { @@ -2059,8 +2057,7 @@ pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen) case AF_UNIX: { struct sockaddr_un *soun = (struct sockaddr_un *)addr; - (void) sprintf(buf, "(len %d) %s", - addrlen, + (void) sprintf(buf, "(len %d) %s", addrlen, (soun == NULL) ? "(none)" : soun->sun_path); break; } diff --git a/usr/src/uts/common/fs/specfs/specsubr.c b/usr/src/uts/common/fs/specfs/specsubr.c index b7158425b7..9ae689c696 100644 --- a/usr/src/uts/common/fs/specfs/specsubr.c +++ b/usr/src/uts/common/fs/specfs/specsubr.c @@ -758,12 +758,12 @@ snode_constructor(void *buf, void *cdrarg, int kmflags) struct snode *sp = buf; struct vnode *vp; - vp = vn_alloc(KM_SLEEP); - - sp->s_vnode = vp; - + vp = sp->s_vnode = vn_alloc(kmflags); + if (vp == NULL) { + return (-1); + } vn_setops(vp, spec_getvnodeops()); - vp->v_data = (caddr_t)sp; + vp->v_data = sp; mutex_init(&sp->s_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&sp->s_cv, NULL, CV_DEFAULT, NULL); diff --git a/usr/src/uts/common/fs/ufs/ufs_inode.c b/usr/src/uts/common/fs/ufs/ufs_inode.c index 09f0d70648..9565079c80 100644 --- a/usr/src/uts/common/fs/ufs/ufs_inode.c +++ b/usr/src/uts/common/fs/ufs/ufs_inode.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -257,6 +257,13 @@ ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags) struct inode *ip = buf; struct vnode *vp; + vp = ip->i_vnode = vn_alloc(kmflags); + if (vp == NULL) { + return (-1); + } + vn_setops(vp, ufs_vnodeops); + vp->v_data = ip; + rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL); rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL); mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL); @@ -264,12 +271,6 @@ ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL); - vp = vn_alloc(KM_SLEEP); - ip->i_vnode = vp; - - vn_setops(vp, ufs_vnodeops); - vp->v_data = (caddr_t)ip; - return (0); } @@ -284,7 +285,6 @@ ufs_inode_cache_destructor(void *buf, void *cdrarg) rw_destroy(&ip->i_rwlock); rw_destroy(&ip->i_contents); - mutex_destroy(&ip->i_tlock); if (vp->v_type == VDIR) { dnlc_dir_fini(&ip->i_danchor); diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 568df766cf..1a885bf32e 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -814,16 +814,37 @@ done: void vn_rele(vnode_t *vp) { - if (vp->v_count == 0) - cmn_err(CE_PANIC, "vn_rele: vnode ref count 0"); + VERIFY(vp->v_count > 0); mutex_enter(&vp->v_lock); if (vp->v_count == 1) { mutex_exit(&vp->v_lock); VOP_INACTIVE(vp, CRED(), NULL); - } else { + return; + } + vp->v_count--; + mutex_exit(&vp->v_lock); +} + +/* + * Release a vnode referenced by the DNLC. Multiple DNLC references are treated + * as a single reference, so v_count is not decremented until the last DNLC hold + * is released. This makes it possible to distinguish vnodes that are referenced + * only by the DNLC. 
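+ *
+ * The DNLC itself takes its references with VN_HOLD_DNLC() and drops them
+ * with VN_RELE_DNLC() (see dnlc.c), so a vnode referenced by nothing but the
+ * name cache can be recognized with, for example:
+ *
+ *	if (vp->v_count == 1 && vn_in_dnlc(vp))
+ *		... only the DNLC holds this vnode ...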
+ */ +void +vn_rele_dnlc(vnode_t *vp) +{ + VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0)); + mutex_enter(&vp->v_lock); + if (--vp->v_count_dnlc == 0) { + if (vp->v_count == 1) { + mutex_exit(&vp->v_lock); + VOP_INACTIVE(vp, CRED(), NULL); + return; + } vp->v_count--; - mutex_exit(&vp->v_lock); } + mutex_exit(&vp->v_lock); } /* @@ -836,17 +857,16 @@ vn_rele(vnode_t *vp) void vn_rele_stream(vnode_t *vp) { - if (vp->v_count == 0) - cmn_err(CE_PANIC, "vn_rele: vnode ref count 0"); + VERIFY(vp->v_count > 0); mutex_enter(&vp->v_lock); vp->v_stream = NULL; if (vp->v_count == 1) { mutex_exit(&vp->v_lock); VOP_INACTIVE(vp, CRED(), NULL); - } else { - vp->v_count--; - mutex_exit(&vp->v_lock); + return; } + vp->v_count--; + mutex_exit(&vp->v_lock); } int @@ -2190,7 +2210,6 @@ vn_cache_constructor(void *buf, void *cdrarg, int kmflags) mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL); rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL); - rw_init(&vp->v_mslock, NULL, RW_DEFAULT, NULL); vp->v_femhead = NULL; /* Must be done before vn_reinit() */ vp->v_path = NULL; vp->v_mpssdata = NULL; @@ -2208,7 +2227,6 @@ vn_cache_destructor(void *buf, void *cdrarg) vp = buf; - rw_destroy(&vp->v_mslock); rw_destroy(&vp->v_nbllock); cv_destroy(&vp->v_cv); mutex_destroy(&vp->v_lock); @@ -2284,6 +2302,7 @@ void vn_reinit(vnode_t *vp) { vp->v_count = 1; + vp->v_count_dnlc = 0; vp->v_vfsp = NULL; vp->v_stream = NULL; vp->v_vfsmountedhere = NULL; @@ -2294,17 +2313,8 @@ vn_reinit(vnode_t *vp) vp->v_filocks = NULL; vp->v_shrlocks = NULL; vp->v_pages = NULL; - vp->v_npages = 0; - vp->v_msnpages = 0; - vp->v_scanfront = NULL; - vp->v_scanback = NULL; vp->v_locality = NULL; - vp->v_scantime = 0; - vp->v_mset = 0; - vp->v_msflags = 0; - vp->v_msnext = NULL; - vp->v_msprev = NULL; vp->v_xattrdir = NULL; /* Handles v_femhead, v_path, and the r/w/map counts */ @@ -2339,6 +2349,7 @@ vn_free(vnode_t *vp) * never be anything else. */ ASSERT((vp->v_count == 0) || (vp->v_count == 1)); + ASSERT(vp->v_count_dnlc == 0); if (vp->v_path != NULL) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); vp->v_path = NULL; @@ -2573,6 +2584,15 @@ vn_mountedvfs(vnode_t *vp) } /* + * Return nonzero if the vnode is referenced by the dnlc, zero if not. + */ +int +vn_in_dnlc(vnode_t *vp) +{ + return (vp->v_count_dnlc > 0); +} + +/* * vn_has_other_opens() checks whether a particular file is opened by more than * just the caller and whether the open is for read and/or write. * This routine is for calling after the caller has already called VOP_OPEN() diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h index 438dcbdd55..d02177bce9 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -257,19 +257,19 @@ typedef struct znode { * Macros for dealing with dmu_buf_hold */ #define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1)) -#define ZFS_OBJ_MUTEX(zp) \ - (&(zp)->z_zfsvfs->z_hold_mtx[ZFS_OBJ_HASH((zp)->z_id)]) +#define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \ + (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) #define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ - mutex_enter(&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]); + mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) #define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ - mutex_exit(&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) + mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) /* * Macros to encode/decode ZFS stored time values from/to struct timespec */ #define ZFS_TIME_ENCODE(tp, stmp) \ { \ - (stmp)[0] = (uint64_t)(tp)->tv_sec; \ + (stmp)[0] = (uint64_t)(tp)->tv_sec; \ (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ } diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c index 949ef4a1fe..68be58e8a3 100644 --- a/usr/src/uts/common/fs/zfs/zfs_znode.c +++ b/usr/src/uts/common/fs/zfs/zfs_znode.c @@ -67,12 +67,29 @@ #include "zfs_prop.h" /* + * Define ZNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. + */ +#ifdef DEBUG +#define ZNODE_STATS +#endif /* DEBUG */ + +#ifdef ZNODE_STATS +#define ZNODE_STAT_ADD(stat) ((stat)++) +#else +#define ZNODE_STAT_ADD(stat) /* nothing */ +#endif /* ZNODE_STATS */ + +#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) +#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) + +/* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies * (such as VFS logic) that will not compile easily in userland. 
*/ #ifdef _KERNEL -struct kmem_cache *znode_cache = NULL; +static kmem_cache_t *znode_cache = NULL; /*ARGSUSED*/ static void @@ -87,12 +104,20 @@ znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) /*ARGSUSED*/ static int -zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags) +zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { znode_t *zp = buf; - zp->z_vnode = vn_alloc(KM_SLEEP); - zp->z_vnode->v_data = (caddr_t)zp; + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + + zp->z_vnode = vn_alloc(kmflags); + if (zp->z_vnode == NULL) { + return (-1); + } + ZTOV(zp)->v_data = zp; + + list_link_init(&zp->z_link_node); + mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); @@ -104,17 +129,20 @@ zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags) sizeof (rl_t), offsetof(rl_t, r_node)); zp->z_dbuf = NULL; - zp->z_dirlocks = 0; + zp->z_dirlocks = NULL; return (0); } /*ARGSUSED*/ static void -zfs_znode_cache_destructor(void *buf, void *cdarg) +zfs_znode_cache_destructor(void *buf, void *arg) { znode_t *zp = buf; - ASSERT(zp->z_dirlocks == 0); + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + ASSERT(ZTOV(zp)->v_data == zp); + vn_free(ZTOV(zp)); + ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); rw_destroy(&zp->z_map_lock); rw_destroy(&zp->z_parent_lock); @@ -124,8 +152,158 @@ zfs_znode_cache_destructor(void *buf, void *cdarg) mutex_destroy(&zp->z_range_lock); ASSERT(zp->z_dbuf == NULL); - ASSERT(ZTOV(zp)->v_count == 0); - vn_free(ZTOV(zp)); + ASSERT(zp->z_dirlocks == NULL); +} + +#ifdef ZNODE_STATS +static struct { + uint64_t zms_zfsvfs_invalid; + uint64_t zms_zfsvfs_unmounted; + uint64_t zms_zfsvfs_recheck_invalid; + uint64_t zms_vnode_locked; + uint64_t zms_znode_in_use; + uint64_t zms_yes; + uint64_t zms_later; + uint64_t zms_dont_know; +} znode_move_stats; +#endif /* ZNODE_STATS */ + +static void +zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) +{ + vnode_t *vp; + + /* Copy fields. */ + nzp->z_zfsvfs = ozp->z_zfsvfs; + + /* Swap vnodes. */ + vp = nzp->z_vnode; + nzp->z_vnode = ozp->z_vnode; + ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ + ZTOV(ozp)->v_data = ozp; + ZTOV(nzp)->v_data = nzp; + + nzp->z_id = ozp->z_id; + ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ + ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); + nzp->z_unlinked = ozp->z_unlinked; + nzp->z_atime_dirty = ozp->z_atime_dirty; + nzp->z_zn_prefetch = ozp->z_zn_prefetch; + nzp->z_blksz = ozp->z_blksz; + nzp->z_seq = ozp->z_seq; + nzp->z_mapcnt = ozp->z_mapcnt; + nzp->z_last_itx = ozp->z_last_itx; + nzp->z_gen = ozp->z_gen; + nzp->z_sync_cnt = ozp->z_sync_cnt; + nzp->z_phys = ozp->z_phys; + nzp->z_dbuf = ozp->z_dbuf; + + /* Update back pointers. */ + (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, + znode_evict_error); + + /* + * Invalidate the original znode by clearing fields that provide a + * pointer back to the znode. Set the low bit of the vfs pointer to + * ensure that zfs_znode_move() recognizes the znode as invalid in any + * subsequent callback. + */ + ozp->z_dbuf = NULL; + POINTER_INVALIDATE(&ozp->z_zfsvfs); +} + +/* + * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise + * returns a non-zero error code. 
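+ * (ZFS_ENTER() cannot be used directly where an error return value is needed,
+ * because on failure the macro returns from the calling function; wrapping it
+ * lets zfs_znode_move() simply test the result.)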
+ */ +static int +zfs_enter(zfsvfs_t *zfsvfs) +{ + ZFS_ENTER(zfsvfs); + return (0); +} + +/*ARGSUSED*/ +static kmem_cbrc_t +zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) +{ + znode_t *ozp = buf, *nzp = newbuf; + zfsvfs_t *zfsvfs; + vnode_t *vp; + + /* + * The znode is on the file system's list of known znodes if the vfs + * pointer is valid. We set the low bit of the vfs pointer when freeing + * the znode to invalidate it, and the memory patterns written by kmem + * (baddcafe and deadbeef) set at least one of the two low bits. A newly + * created znode sets the vfs pointer last of all to indicate that the + * znode is known and in a valid state to be moved by this function. + */ + zfsvfs = ozp->z_zfsvfs; + if (!POINTER_IS_VALID(zfsvfs)) { + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); + ZNODE_STAT_ADD(znode_move_stats.zms_dont_know); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * Ensure that the filesystem is not unmounted during the move. + */ + if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */ + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); + ZNODE_STAT_ADD(znode_move_stats.zms_dont_know); + return (KMEM_CBRC_DONT_KNOW); + } + + mutex_enter(&zfsvfs->z_znodes_lock); + /* + * Recheck the vfs pointer in case the znode was removed just before + * acquiring the lock. + */ + if (zfsvfs != ozp->z_zfsvfs) { + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid); + ZNODE_STAT_ADD(znode_move_stats.zms_dont_know); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * At this point we know that as long as we hold z_znodes_lock, the + * znode cannot be freed and fields within the znode can be safely + * accessed. + */ + vp = ZTOV(ozp); + if (mutex_tryenter(&vp->v_lock) == 0) { + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); + ZNODE_STAT_ADD(znode_move_stats.zms_later); + return (KMEM_CBRC_LATER); + } + /* Only move znodes that are referenced _only_ by the DNLC. */ + if (vp->v_count != 1 || !vn_in_dnlc(vp)) { + mutex_exit(&vp->v_lock); + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + ZNODE_STAT_ADD(znode_move_stats.zms_znode_in_use); + ZNODE_STAT_ADD(znode_move_stats.zms_later); + return (KMEM_CBRC_LATER); + } + + /* + * The znode is known and in a valid state to move. We're holding the + * locks needed to execute the critical section. 
+ */ + zfs_znode_move_impl(ozp, nzp); + mutex_exit(&vp->v_lock); + + list_link_replace(&ozp->z_link_node, &nzp->z_link_node); + mutex_exit(&zfsvfs->z_znodes_lock); + ZFS_EXIT(zfsvfs); + + ZNODE_STAT_ADD(znode_move_stats.zms_yes); + return (KMEM_CBRC_YES); } void @@ -138,6 +316,7 @@ zfs_znode_init(void) znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, 0); + kmem_cache_set_move(znode_cache, zfs_znode_move); } void @@ -419,12 +598,12 @@ zfs_cmpldev(uint64_t dev) } static void -zfs_znode_dmu_init(znode_t *zp, dmu_buf_t *db) +zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db) { znode_t *nzp; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp))); + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); mutex_enter(&zp->z_lock); @@ -453,7 +632,8 @@ void zfs_znode_dmu_fini(znode_t *zp) { dmu_buf_t *db = zp->z_dbuf; - ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp)) || zp->z_unlinked || + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || + zp->z_unlinked || RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); ASSERT(zp->z_dbuf != NULL); zp->z_dbuf = NULL; @@ -478,9 +658,13 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) ASSERT(zp->z_dirlocks == NULL); ASSERT(zp->z_dbuf == NULL); + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + /* + * Defer setting z_zfsvfs until the znode is ready to be a candidate for + * the zfs_znode_move() callback. + */ zp->z_phys = NULL; - zp->z_zfsvfs = zfsvfs; zp->z_unlinked = 0; zp->z_atime_dirty = 0; zp->z_mapcnt = 0; @@ -493,14 +677,10 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) vp = ZTOV(zp); vn_reinit(vp); - zfs_znode_dmu_init(zp, db); + zfs_znode_dmu_init(zfsvfs, zp, db); zp->z_gen = zp->z_phys->zp_gen; - mutex_enter(&zfsvfs->z_znodes_lock); - list_insert_tail(&zfsvfs->z_all_znodes, zp); - mutex_exit(&zfsvfs->z_znodes_lock); - vp->v_vfsp = zfsvfs->z_parent->z_vfs; vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); @@ -535,6 +715,16 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) break; } + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + membar_producer(); + /* + * Everything else must be valid before assigning z_zfsvfs makes the + * znode eligible for zfs_znode_move(). 
+ */ + zp->z_zfsvfs = zfsvfs; + mutex_exit(&zfsvfs->z_znodes_lock); + VFS_HOLD(zfsvfs->z_vfs); return (zp); } @@ -675,7 +865,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); if (!(flag & IS_ROOT_NODE)) { - ZFS_OBJ_HOLD_ENTER(zfsvfs, obj) + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); *zpp = zfs_znode_alloc(zfsvfs, db, 0); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); } else { @@ -843,7 +1033,7 @@ zfs_rezget(znode_t *zp) return (EIO); } - zfs_znode_dmu_init(zp, db); + zfs_znode_dmu_init(zfsvfs, zp, db); zp->z_unlinked = (zp->z_phys->zp_links == 0); zp->z_blksz = doi.doi_data_block_size; @@ -928,7 +1118,10 @@ zfs_znode_free(znode_t *zp) vn_invalid(ZTOV(zp)); + ASSERT(ZTOV(zp)->v_count == 0); + mutex_enter(&zfsvfs->z_znodes_lock); + POINTER_INVALIDATE(&zp->z_zfsvfs); list_remove(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); @@ -1273,7 +1466,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) vattr.va_gid = crgetgid(cr); rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); - rootzp->z_zfsvfs = &zfsvfs; rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; @@ -1300,10 +1492,14 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); + rootzp->z_zfsvfs = &zfsvfs; zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL); ASSERT3P(zp, ==, rootzp); + ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); + POINTER_INVALIDATE(&rootzp->z_zfsvfs); ZTOV(rootzp)->v_count = 0; dmu_buf_rele(rootzp->z_dbuf, NULL); diff --git a/usr/src/uts/common/inet/sctp/sctp_common.c b/usr/src/uts/common/inet/sctp/sctp_common.c index afa613603d..529510fc18 100644 --- a/usr/src/uts/common/inet/sctp/sctp_common.c +++ b/usr/src/uts/common/inet/sctp/sctp_common.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -1852,11 +1852,8 @@ sctp_init_faddr(sctp_t *sctp, sctp_faddr_t *fp, in6_addr_t *addr, fp->timer_mp = timer_mp; fp->hb_pending = B_FALSE; fp->hb_enabled = B_TRUE; - fp->timer_running = 0; fp->df = 1; fp->pmtu_discovered = 0; - fp->rc_timer_mp = NULL; - fp->rc_timer_running = 0; fp->next = NULL; fp->ire = NULL; fp->T3expire = 0; @@ -1869,8 +1866,23 @@ sctp_init_faddr(sctp_t *sctp, sctp_faddr_t *fp, in6_addr_t *addr, } /*ARGSUSED*/ +static int +faddr_constructor(void *buf, void *arg, int flags) +{ + sctp_faddr_t *fp = buf; + + fp->timer_mp = NULL; + fp->timer_running = 0; + + fp->rc_timer_mp = NULL; + fp->rc_timer_running = 0; + + return (0); +} + +/*ARGSUSED*/ static void -faddr_destructor(void *buf, void *cdrarg) +faddr_destructor(void *buf, void *arg) { sctp_faddr_t *fp = buf; @@ -1885,7 +1897,7 @@ void sctp_faddr_init(void) { sctp_kmem_faddr_cache = kmem_cache_create("sctp_faddr_cache", - sizeof (sctp_faddr_t), 0, NULL, faddr_destructor, + sizeof (sctp_faddr_t), 0, faddr_constructor, faddr_destructor, NULL, NULL, NULL, 0); } diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c index 90fbf3cbf1..d8dad37e58 100644 --- a/usr/src/uts/common/io/stream.c +++ b/usr/src/uts/common/io/stream.c @@ -151,13 +151,13 @@ static size_t dblk_sizes[] = { #ifdef _LP64 - 16, 80, 144, 208, 272, 336, 528, 1040, 1488, 1936, 2576, 3920, - 8192, 12112, 16384, 20304, 24576, 28496, 32768, 36688, - 40960, 44880, 49152, 53072, 57344, 61264, 65536, 69456, + 16, 80, 144, 208, 272, 336, 528, 1040, 1488, 1936, 2576, 3856, + 8192, 12048, 16384, 20240, 24576, 28432, 32768, 36624, + 40960, 44816, 49152, 53008, 57344, 61200, 65536, 69392, #else - 64, 128, 320, 576, 1088, 1536, 1984, 2624, 3968, - 8192, 12160, 16384, 20352, 24576, 28544, 32768, 36736, - 40960, 44928, 49152, 53120, 57344, 61312, 65536, 69504, + 64, 128, 320, 576, 1088, 1536, 1984, 2624, 3904, + 8192, 12096, 16384, 20288, 24576, 28480, 32768, 36672, + 40960, 44864, 49152, 53056, 57344, 61248, 65536, 69440, #endif DBLK_MAX_CACHE, 0 }; diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index 8ca8ac861c..ea627b218a 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -26,7 +26,8 @@ #pragma ident "%Z%%M% %I% %E% SMI" /* - * Kernel memory allocator, as described in the following two papers: + * Kernel memory allocator, as described in the following two papers and a + * statement about the consolidator: * * Jeff Bonwick, * The Slab Allocator: An Object-Caching Kernel Memory Allocator. @@ -38,6 +39,768 @@ * Arbitrary Resources. * Proceedings of the 2001 Usenix Conference. * Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf. + * + * kmem Slab Consolidator Big Theory Statement: + * + * 1. Motivation + * + * As stated in Bonwick94, slabs provide the following advantages over other + * allocation structures in terms of memory fragmentation: + * + * - Internal fragmentation (per-buffer wasted space) is minimal. + * - Severe external fragmentation (unused buffers on the free list) is + * unlikely. + * + * Segregating objects by size eliminates one source of external fragmentation, + * and according to Bonwick: + * + * The other reason that slabs reduce external fragmentation is that all + * objects in a slab are of the same type, so they have the same lifetime + * distribution. The resulting segregation of short-lived and long-lived + * objects at slab granularity reduces the likelihood of an entire page being + * held hostage due to a single long-lived allocation [Barrett93, Hanson90]. 
+ * + * While unlikely, severe external fragmentation remains possible. Clients that + * allocate both short- and long-lived objects from the same cache cannot + * anticipate the distribution of long-lived objects within the allocator's slab + * implementation. Even a small percentage of long-lived objects distributed + * randomly across many slabs can lead to a worst case scenario where the client + * frees the majority of its objects and the system gets back almost none of the + * slabs. Despite the client doing what it reasonably can to help the system + * reclaim memory, the allocator cannot shake free enough slabs because of + * lonely allocations stubbornly hanging on. Although the allocator is in a + * position to diagnose the fragmentation, there is nothing that the allocator + * by itself can do about it. It only takes a single allocated object to prevent + * an entire slab from being reclaimed, and any object handed out by + * kmem_cache_alloc() is by definition in the client's control. Conversely, + * although the client is in a position to move a long-lived object, it has no + * way of knowing if the object is causing fragmentation, and if so, where to + * move it. A solution necessarily requires further cooperation between the + * allocator and the client. + * + * 2. Move Callback + * + * The kmem slab consolidator therefore adds a move callback to the + * allocator/client interface, improving worst-case external fragmentation in + * kmem caches that supply a function to move objects from one memory location + * to another. In a situation of low memory kmem attempts to consolidate all of + * a cache's slabs at once; otherwise it works slowly to bring external + * fragmentation within the 1/8 limit guaranteed for internal fragmentation, + * thereby helping to avoid a low memory situation in the future. + * + * The callback has the following signature: + * + * kmem_cbrc_t move(void *old, void *new, size_t size, void *user_arg) + * + * It supplies the kmem client with two addresses: the allocated object that + * kmem wants to move and a buffer selected by kmem for the client to use as the + * copy destination. The callback is kmem's way of saying "Please get off of + * this buffer and use this one instead." kmem knows where it wants to move the + * object in order to best reduce fragmentation. All the client needs to know + * about the second argument (void *new) is that it is an allocated, constructed + * object ready to take the contents of the old object. When the move function + * is called, the system is likely to be low on memory, and the new object + * spares the client from having to worry about allocating memory for the + * requested move. The third argument supplies the size of the object, in case a + * single move function handles multiple caches whose objects differ only in + * size (such as zio_buf_512, zio_buf_1024, etc). Finally, the same optional + * user argument passed to the constructor, destructor, and reclaim functions is + * also passed to the move callback. + * + * 2.1 Setting the Move Callback + * + * The client sets the move callback after creating the cache and before + * allocating from it: + * + * object_cache = kmem_cache_create(...); + * kmem_cache_set_move(object_cache, object_move); + * + * 2.2 Move Callback Return Values + * + * Only the client knows about its own data and when is a good time to move it. 
+ * The client is cooperating with kmem to return unused memory to the system, + * and kmem respectfully accepts this help at the client's convenience. When + * asked to move an object, the client can respond with any of the following: + * + * typedef enum kmem_cbrc { + * KMEM_CBRC_YES, + * KMEM_CBRC_NO, + * KMEM_CBRC_LATER, + * KMEM_CBRC_DONT_NEED, + * KMEM_CBRC_DONT_KNOW + * } kmem_cbrc_t; + * + * The client must not explicitly kmem_cache_free() either of the objects passed + * to the callback, since kmem wants to free them directly to the slab layer + * (bypassing the per-CPU magazine layer). The response tells kmem which of the + * objects to free: + * + * YES: (Did it) The client moved the object, so kmem frees the old one. + * NO: (Never) The client refused, so kmem frees the new object (the + * unused copy destination). kmem also marks the slab of the old + * object so as not to bother the client with further callbacks for + * that object as long as the slab remains on the partial slab list. + * (The system won't be getting the slab back as long as the + * immovable object holds it hostage, so there's no point in moving + * any of its objects.) + * LATER: The client is using the object and cannot move it now, so kmem + * frees the new object (the unused copy destination). kmem still + * attempts to move other objects off the slab, since it expects to + * succeed in clearing the slab in a later callback. The client + * should use LATER instead of NO if the object is likely to become + * movable very soon. + * DONT_NEED: The client no longer needs the object, so kmem frees the old along + * with the new object (the unused copy destination). This response + * is the client's opportunity to be a model citizen and give back as + * much as it can. + * DONT_KNOW: The client does not know about the object because + * a) the client has just allocated the object and not yet put it + * wherever it expects to find known objects + * b) the client has removed the object from wherever it expects to + * find known objects and is about to free it, or + * c) the client has freed the object. + * In all these cases (a, b, and c) kmem frees the new object (the + * unused copy destination) and searches for the old object in the + * magazine layer. If found, the object is removed from the magazine + * layer and freed to the slab layer so it will no longer hold the + * slab hostage. + * + * 2.3 Object States + * + * Neither kmem nor the client can be assumed to know the object's whereabouts + * at the time of the callback. An object belonging to a kmem cache may be in + * any of the following states: + * + * 1. Uninitialized on the slab + * 2. Allocated from the slab but not constructed (still uninitialized) + * 3. Allocated from the slab, constructed, but not yet ready for business + * (not in a valid state for the move callback) + * 4. In use (valid and known to the client) + * 5. About to be freed (no longer in a valid state for the move callback) + * 6. Freed to a magazine (still constructed) + * 7. Allocated from a magazine, not yet ready for business (not in a valid + * state for the move callback), and about to return to state #4 + * 8. Deconstructed on a magazine that is about to be freed + * 9. Freed to the slab + * + * Since the move callback may be called at any time while the object is in any + * of the above states (except state #1), the client needs a safe way to + * determine whether or not it knows about the object. 
Specifically, the client + * needs to know whether or not the object is in state #4, the only state in + * which a move is valid. If the object is in any other state, the client should + * immediately return KMEM_CBRC_DONT_KNOW, since it is unsafe to access any of + * the object's fields. + * + * Note that although an object may be in state #4 when kmem initiates the move + * request, the object may no longer be in that state by the time kmem actually + * calls the move function. Not only does the client free objects + * asynchronously, kmem itself puts move requests on a queue where they are + * pending until kmem processes them from another context. Also, objects freed + * to a magazine appear allocated from the point of view of the slab layer, so + * kmem may even initiate requests for objects in a state other than state #4. + * + * 2.3.1 Magazine Layer + * + * An important insight revealed by the states listed above is that the magazine + * layer is populated only by kmem_cache_free(). Magazines of constructed + * objects are never populated directly from the slab layer (which contains raw, + * unconstructed objects). Whenever an allocation request cannot be satisfied + * from the magazine layer, the magazines are bypassed and the request is + * satisfied from the slab layer (creating a new slab if necessary). kmem calls + * the object constructor only when allocating from the slab layer, and only in + * response to kmem_cache_alloc() or to prepare the destination buffer passed in + * the move callback. kmem does not preconstruct objects in anticipation of + * kmem_cache_alloc(). + * + * 2.3.2 Object Constructor and Destructor + * + * If the client supplies a destructor, it must be valid to call the destructor + * on a newly created object (immediately after the constructor). + * + * 2.4 Recognizing Known Objects + * + * There is a simple test to determine safely whether or not the client knows + * about a given object in the move callback. It relies on the fact that kmem + * guarantees that the object of the move callback has only been touched by the + * client itself or else by kmem. kmem does this by ensuring that none of the + * cache's slabs are freed to the virtual memory (VM) subsystem while a move + * callback is pending. When the last object on a slab is freed, if there is a + * pending move, kmem puts the slab on a per-cache dead list and defers freeing + * slabs on that list until all pending callbacks are completed. That way, + * clients can be certain that the object of a move callback is in one of the + * states listed above, making it possible to distinguish known objects (in + * state #4) using the two low order bits of any pointer member (with the + * exception of 'char *' or 'short *' which may not be 4-byte aligned on some + * platforms). + * + * The test works as long as the client always transitions objects from state #4 + * (known, in use) to state #5 (about to be freed, invalid) by setting the low + * order bit of the client-designated pointer member. Since kmem only writes + * invalid memory patterns, such as 0xbaddcafe to uninitialized memory and + * 0xdeadbeef to freed memory, any scribbling on the object done by kmem is + * guaranteed to set at least one of the two low order bits. 
Therefore, given an + * object with a back pointer to a 'container_t *o_container', the client can + * test + * + * container_t *container = object->o_container; + * if ((uintptr_t)container & 0x3) { + * return (KMEM_CBRC_DONT_KNOW); + * } + * + * Typically, an object will have a pointer to some structure with a list or + * hash where objects from the cache are kept while in use. Assuming that the + * client has some way of knowing that the container structure is valid and will + * not go away during the move, and assuming that the structure includes a lock + * to protect whatever collection is used, then the client would continue as + * follows: + * + * // Ensure that the container structure does not go away. + * if (container_hold(container) == 0) { + * return (KMEM_CBRC_DONT_KNOW); + * } + * mutex_enter(&container->c_objects_lock); + * if (container != object->o_container) { + * mutex_exit(&container->c_objects_lock); + * container_rele(container); + * return (KMEM_CBRC_DONT_KNOW); + * } + * + * At this point the client knows that the object cannot be freed as long as + * c_objects_lock is held. Note that after acquiring the lock, the client must + * recheck the o_container pointer in case the object was removed just before + * acquiring the lock. + * + * When the client is about to free an object, it must first remove that object + * from the list, hash, or other structure where it is kept. At that time, to + * mark the object so it can be distinguished from the remaining, known objects, + * the client sets the designated low order bit: + * + * mutex_enter(&container->c_objects_lock); + * object->o_container = (void *)((uintptr_t)object->o_container | 0x1); + * list_remove(&container->c_objects, object); + * mutex_exit(&container->c_objects_lock); + * + * In the common case, the object is freed to the magazine layer, where it may + * be reused on a subsequent allocation without the overhead of calling the + * constructor. While in the magazine it appears allocated from the point of + * view of the slab layer, making it a candidate for the move callback. Most + * objects unrecognized by the client in the move callback fall into this + * category and are cheaply distinguished from known objects by the test + * described earlier. Since recognition is cheap for the client, and searching + * magazines is expensive for kmem, kmem defers searching until the client first + * returns KMEM_CBRC_DONT_KNOW. As long as the needed effort is reasonable, kmem + * elsewhere does what it can to avoid bothering the client unnecessarily. + * + * Invalidating the designated pointer member before freeing the object marks + * the object to be avoided in the callback, and conversely, assigning a valid + * value to the designated pointer member after allocating the object makes the + * object fair game for the callback: + * + * ... allocate object ... + * ... set any initial state not set by the constructor ... + * + * mutex_enter(&container->c_objects_lock); + * list_insert_tail(&container->c_objects, object); + * membar_producer(); + * object->o_container = container; + * mutex_exit(&container->c_objects_lock); + * + * Note that everything else must be valid before setting o_container makes the + * object fair game for the move callback. The membar_producer() call ensures + * that all the object's state is written to memory before setting the pointer + * that transitions the object from state #3 or #7 (allocated, constructed, not + * yet in use) to state #4 (in use, valid). 
That's important because the move + * function has to check the validity of the pointer before it can safely + * acquire the lock protecting the collection where it expects to find known + * objects. + * + * This method of distinguishing known objects observes the usual symmetry: + * invalidating the designated pointer is the first thing the client does before + * freeing the object, and setting the designated pointer is the last thing the + * client does after allocating the object. Of course, the client is not + * required to use this method. Fundamentally, how the client recognizes known + * objects is completely up to the client, but this method is recommended as an + * efficient and safe way to take advantage of the guarantees made by kmem. If + * the entire object is arbitrary data without any markable bits from a suitable + * pointer member, then the client must find some other method, such as + * searching a hash table of known objects. + * + * 2.5 Preventing Objects From Moving + * + * Besides a way to distinguish known objects, the other thing that the client + * needs is a strategy to ensure that an object will not move while the client + * is actively using it. The details of satisfying this requirement tend to be + * highly cache-specific. It might seem that the same rules that let a client + * remove an object safely should also decide when an object can be moved + * safely. However, any object state that makes a removal attempt invalid is + * likely to be long-lasting for objects that the client does not expect to + * remove. kmem knows nothing about the object state and is equally likely (from + * the client's point of view) to request a move for any object in the cache, + * whether prepared for removal or not. Even a low percentage of objects stuck + * in place by unremovability will defeat the consolidator if the stuck objects + * are the same long-lived allocations likely to hold slabs hostage. + * Fundamentally, the consolidator is not aimed at common cases. Severe external + * fragmentation is a worst case scenario manifested as sparsely allocated + * slabs, by definition a low percentage of the cache's objects. When deciding + * what makes an object movable, keep in mind the goal of the consolidator: to + * bring worst-case external fragmentation within the limits guaranteed for + * internal fragmentation. Removability is a poor criterion if it is likely to + * exclude more than an insignificant percentage of objects for long periods of + * time. + * + * A tricky general solution exists, and it has the advantage of letting you + * move any object at almost any moment, practically eliminating the likelihood + * that an object can hold a slab hostage. However, if there is a cache-specific + * way to ensure that an object is not actively in use in the vast majority of + * cases, a simpler solution that leverages this cache-specific knowledge is + * preferred. + * + * 2.5.1 Cache-Specific Solution + * + * As an example of a cache-specific solution, the ZFS znode cache takes + * advantage of the fact that the vast majority of znodes are only being + * referenced from the DNLC. (A typical case might be a few hundred in active + * use and a hundred thousand in the DNLC.) In the move callback, after the ZFS + * client has established that it recognizes the znode and can access its fields + * safely (using the method described earlier), it then tests whether the znode + * is referenced by anything other than the DNLC. 
If so, it assumes that the + * znode may be in active use and is unsafe to move, so it drops its locks and + * returns KMEM_CBRC_LATER. The advantage of this strategy is that everywhere + * else znodes are used, no change is needed to protect against the possibility + * of the znode moving. The disadvantage is that it remains possible for an + * application to hold a znode slab hostage with an open file descriptor. + * However, this case ought to be rare and the consolidator has a way to deal + * with it: If the client responds KMEM_CBRC_LATER repeatedly for the same + * object, kmem eventually stops believing it and treats the slab as if the + * client had responded KMEM_CBRC_NO. Having marked the hostage slab, kmem can + * then focus on getting it off of the partial slab list by allocating rather + * than freeing all of its objects. (Either way of getting a slab off the + * free list reduces fragmentation.) + * + * 2.5.2 General Solution + * + * The general solution, on the other hand, requires an explicit hold everywhere + * the object is used to prevent it from moving. To keep the client locking + * strategy as uncomplicated as possible, kmem guarantees the simplifying + * assumption that move callbacks are sequential, even across multiple caches. + * Internally, a global queue processed by a single thread supports all caches + * implementing the callback function. No matter how many caches supply a move + * function, the consolidator never moves more than one object at a time, so the + * client does not have to worry about tricky lock ordering involving several + * related objects from different kmem caches. + * + * The general solution implements the explicit hold as a read-write lock, which + * allows multiple readers to access an object from the cache simultaneously + * while a single writer is excluded from moving it. A single rwlock for the + * entire cache would lock out all threads from using any of the cache's objects + * even though only a single object is being moved, so to reduce contention, + * the client can fan out the single rwlock into an array of rwlocks hashed by + * the object address, making it probable that moving one object will not + * prevent other threads from using a different object. The rwlock cannot be a + * member of the object itself, because the possibility of the object moving + * makes it unsafe to access any of the object's fields until the lock is + * acquired. + * + * Assuming a small, fixed number of locks, it's possible that multiple objects + * will hash to the same lock. A thread that needs to use multiple objects in + * the same function may acquire the same lock multiple times. Since rwlocks are + * reentrant for readers, and since there is never more than a single writer at + * a time (assuming that the client acquires the lock as a writer only when + * moving an object inside the callback), there would seem to be no problem. + * However, a client locking multiple objects in the same function must handle + * one case of potential deadlock: Assume that thread A needs to prevent both + * object 1 and object 2 from moving, and thread B, the callback, meanwhile + * tries to move object 3. It's possible, if objects 1, 2, and 3 all hash to the + * same lock, that thread A will acquire the lock for object 1 as a reader + * before thread B sets the lock's write-wanted bit, preventing thread A from + * reacquiring the lock for object 2 as a reader. 
Unable to make forward + * progress, thread A will never release the lock for object 1, resulting in + * deadlock. + * + * There are two ways of avoiding the deadlock just described. The first is to + * use rw_tryenter() rather than rw_enter() in the callback function when + * attempting to acquire the lock as a writer. If tryenter discovers that the + * same object (or another object hashed to the same lock) is already in use, it + * aborts the callback and returns KMEM_CBRC_LATER. The second way is to use + * rprwlock_t (declared in common/fs/zfs/sys/rprwlock.h) instead of rwlock_t, + * since it allows a thread to acquire the lock as a reader in spite of a + * waiting writer. This second approach insists on moving the object now, no + * matter how many readers the move function must wait for in order to do so, + * and could delay the completion of the callback indefinitely (blocking + * callbacks to other clients). In practice, a less insistent callback using + * rw_tryenter() returns KMEM_CBRC_LATER infrequently enough that there seems + * little reason to use anything else. + * + * Avoiding deadlock is not the only problem that an implementation using an + * explicit hold needs to solve. Locking the object in the first place (to + * prevent it from moving) remains a problem, since the object could move + * between the time you obtain a pointer to the object and the time you acquire + * the rwlock hashed to that pointer value. Therefore the client needs to + * recheck the value of the pointer after acquiring the lock, drop the lock if + * the value has changed, and try again. This requires a level of indirection: + * something that points to the object rather than the object itself, that the + * client can access safely while attempting to acquire the lock. (The object + * itself cannot be referenced safely because it can move at any time.) + * The following lock-acquisition function takes whatever is safe to reference + * (arg), follows its pointer to the object (using function f), and tries as + * often as necessary to acquire the hashed lock and verify that the object + * still has not moved: + * + * object_t * + * object_hold(object_f f, void *arg) + * { + * object_t *op; + * + * op = f(arg); + * if (op == NULL) { + * return (NULL); + * } + * + * rw_enter(OBJECT_RWLOCK(op), RW_READER); + * while (op != f(arg)) { + * rw_exit(OBJECT_RWLOCK(op)); + * op = f(arg); + * if (op == NULL) { + * break; + * } + * rw_enter(OBJECT_RWLOCK(op), RW_READER); + * } + * + * return (op); + * } + * + * The OBJECT_RWLOCK macro hashes the object address to obtain the rwlock. The + * lock reacquisition loop, while necessary, almost never executes. The function + * pointer f (used to obtain the object pointer from arg) has the following type + * definition: + * + * typedef object_t *(*object_f)(void *arg); + * + * An object_f implementation is likely to be as simple as accessing a structure + * member: + * + * object_t * + * s_object(void *arg) + * { + * something_t *sp = arg; + * return (sp->s_object); + * } + * + * The flexibility of a function pointer allows the path to the object to be + * arbitrarily complex and also supports the notion that depending on where you + * are using the object, you may need to get it from someplace different. 
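+ *
+ * The definition of OBJECT_RWLOCK is left to the client. One possible sketch,
+ * assuming a small fixed-size array of rwlocks (each initialized with
+ * rw_init()) hashed by object address; the names OBJECT_LOCKS and
+ * object_rwlock are illustrative only:
+ *
+ *	#define	OBJECT_LOCKS	64
+ *	static krwlock_t object_rwlock[OBJECT_LOCKS];
+ *	#define	OBJECT_RWLOCK(op)	\
+ *		(&object_rwlock[((uintptr_t)(op) >> 8) & (OBJECT_LOCKS - 1)])
+ *
+ * The shift simply discards low-order bits common to objects of the same
+ * alignment; any reasonable hash of the object address will do.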
+ * + * The function that releases the explicit hold is simpler because it does not + * have to worry about the object moving: + * + * void + * object_rele(object_t *op) + * { + * rw_exit(OBJECT_RWLOCK(op)); + * } + * + * The caller is spared these details so that obtaining and releasing an + * explicit hold feels like a simple mutex_enter()/mutex_exit() pair. The caller + * of object_hold() only needs to know that the returned object pointer is valid + * if not NULL and that the object will not move until released. + * + * Although object_hold() prevents an object from moving, it does not prevent it + * from being freed. The caller must take measures before calling object_hold() + * (afterwards is too late) to ensure that the held object cannot be freed. The + * caller must do so without accessing the unsafe object reference, so any lock + * or reference count used to ensure the continued existence of the object must + * live outside the object itself. + * + * Obtaining a new object is a special case where an explicit hold is impossible + * for the caller. Any function that returns a newly allocated object (either as + * a return value, or as an in-out parameter) must return it already held; after + * the caller gets it is too late, since the object cannot be safely accessed + * without the level of indirection described earlier. The following + * object_alloc() example uses the same code shown earlier to transition a new + * object into the state of being recognized (by the client) as a known object. + * The function must acquire the hold (rw_enter) before that state transition + * makes the object movable: + * + * static object_t * + * object_alloc(container_t *container) + * { + * object_t *object = kmem_cache_alloc(object_cache, 0); + * ... set any initial state not set by the constructor ... + * rw_enter(OBJECT_RWLOCK(object), RW_READER); + * mutex_enter(&container->c_objects_lock); + * list_insert_tail(&container->c_objects, object); + * membar_producer(); + * object->o_container = container; + * mutex_exit(&container->c_objects_lock); + * return (object); + * } + * + * Functions that implicitly acquire an object hold (any function that calls + * object_alloc() to supply an object for the caller) need to be carefully noted + * so that the matching object_rele() is not neglected. Otherwise, leaked holds + * prevent all objects hashed to the affected rwlocks from ever being moved. + * + * The pointer to a held object can be hashed to the holding rwlock even after + * the object has been freed. Although it is possible to release the hold + * after freeing the object, you may decide to release the hold implicitly in + * whatever function frees the object, so as to release the hold as soon as + * possible, and for the sake of symmetry with the function that implicitly + * acquires the hold when it allocates the object. Here, object_free() releases + * the hold acquired by object_alloc(). 
Its implicit object_rele() forms a + * matching pair with object_hold(): + * + * void + * object_free(object_t *object) + * { + * container_t *container; + * + * ASSERT(object_held(object)); + * container = object->o_container; + * mutex_enter(&container->c_objects_lock); + * object->o_container = + * (void *)((uintptr_t)object->o_container | 0x1); + * list_remove(&container->c_objects, object); + * mutex_exit(&container->c_objects_lock); + * object_rele(object); + * kmem_cache_free(object_cache, object); + * } + * + * Note that object_free() cannot safely accept an object pointer as an argument + * unless the object is already held. Any function that calls object_free() + * needs to be carefully noted since it similarly forms a matching pair with + * object_hold(). + * + * To complete the picture, the following callback function implements the + * general solution by moving objects only if they are currently unheld: + * + * static kmem_cbrc_t + * object_move(void *buf, void *newbuf, size_t size, void *arg) + * { + * object_t *op = buf, *np = newbuf; + * container_t *container; + * + * container = op->o_container; + * if ((uintptr_t)container & 0x3) { + * return (KMEM_CBRC_DONT_KNOW); + * } + * + * // Ensure that the container structure does not go away. + * if (container_hold(container) == 0) { + * return (KMEM_CBRC_DONT_KNOW); + * } + * + * mutex_enter(&container->c_objects_lock); + * if (container != op->o_container) { + * mutex_exit(&container->c_objects_lock); + * container_rele(container); + * return (KMEM_CBRC_DONT_KNOW); + * } + * + * if (rw_tryenter(OBJECT_RWLOCK(op), RW_WRITER) == 0) { + * mutex_exit(&container->c_objects_lock); + * container_rele(container); + * return (KMEM_CBRC_LATER); + * } + * + * object_move_impl(op, np); // critical section + * rw_exit(OBJECT_RWLOCK(op)); + * + * op->o_container = (void *)((uintptr_t)op->o_container | 0x1); + * list_link_replace(&op->o_link_node, &np->o_link_node); + * mutex_exit(&container->c_objects_lock); + * container_rele(container); + * return (KMEM_CBRC_YES); + * } + * + * Note that object_move() must invalidate the designated o_container pointer of + * the old object in the same way that object_free() does, since kmem will free + * the object in response to the KMEM_CBRC_YES return value. + * + * The lock order in object_move() differs from object_alloc(), which locks + * OBJECT_RWLOCK first and &container->c_objects_lock second, but as long as the + * callback uses rw_tryenter() (preventing the deadlock described earlier), it's + * not a problem. Holding the lock on the object list in the example above + * through the entire callback not only prevents the object from going away, it + * also allows you to lock the list elsewhere and know that none of its elements + * will move during iteration. + * + * Adding an explicit hold everywhere an object from the cache is used is tricky + * and involves much more change to client code than a cache-specific solution + * that leverages existing state to decide whether or not an object is + * movable. However, this approach has the advantage that no object remains + * immovable for any significant length of time, making it extremely unlikely + * that long-lived allocations can continue holding slabs hostage; and it works + * for any cache. + * + * 3. 
Consolidator Implementation + * + * Once the client supplies a move function that a) recognizes known objects and + * b) avoids moving objects that are actively in use, the remaining work is up + * to the consolidator to decide which objects to move and when to issue + * callbacks. + * + * The consolidator relies on the fact that a cache's slabs are ordered by + * usage. Each slab has a fixed number of objects. Depending on the slab's + * "color" (the offset of the first object from the beginning of the slab; + * offsets are staggered to mitigate false sharing of cache lines) it is either + * the maximum number of objects per slab determined at cache creation time or + * else the number closest to the maximum that fits within the space remaining + * after the initial offset. A completely allocated slab may contribute some + * internal fragmentation (per-slab overhead) but no external fragmentation, so + * it is of no interest to the consolidator. At the other extreme, slabs whose + * objects have all been freed to the slab are released to the virtual memory + * (VM) subsystem (objects freed to magazines are still allocated as far as the + * slab is concerned). External fragmentation exists when there are slabs + * somewhere between these extremes. A partial slab has at least one but not all + * of its objects allocated. The more partial slabs, and the fewer allocated + * objects on each of them, the higher the fragmentation. Hence the + * consolidator's overall strategy is to reduce the number of partial slabs by + * moving allocated objects from the least allocated slabs to the most allocated + * slabs. + * + * Partial slabs are kept in an AVL tree ordered by usage. Completely allocated + * slabs are kept separately in an unordered list. Since the majority of slabs + * tend to be completely allocated (a typical unfragmented cache may have + * thousands of complete slabs and only a single partial slab), separating + * complete slabs improves the efficiency of partial slab ordering, since the + * complete slabs do not affect the depth or balance of the AVL tree. This + * ordered sequence of partial slabs acts as a "free list" supplying objects for + * allocation requests. + * + * Objects are always allocated from the first partial slab in the free list, + * where the allocation is most likely to eliminate a partial slab (by + * completely allocating it). Conversely, when a single object from a completely + * allocated slab is freed to the slab, that slab is added to the front of the + * free list. Since most free list activity involves highly allocated slabs + * coming and going at the front of the list, slabs tend naturally toward the + * ideal order: highly allocated at the front, sparsely allocated at the back. + * Slabs with few allocated objects are likely to become completely free if they + * keep a safe distance away from the front of the free list. Slab misorders + * interfere with the natural tendency of slabs to become completely free or + * completely allocated. For example, a slab with a single allocated object + * needs only a single free to escape the cache; its natural desire is + * frustrated when it finds itself at the front of the list where a second + * allocation happens just before the free could have released it. Another slab + * with all but one object allocated might have supplied the buffer instead, so + * that both (as opposed to neither) of the slabs would have been taken off the + * free list. 
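+ *
+ * To make "ordered by usage" concrete, an ordering predicate for partial slabs
+ * could look roughly like the sketch below (illustration only, using the
+ * hypothetical name slab_usage_cmp; the comparator actually used,
+ * kmem_partial_slab_cmp() later in this file, refines the same idea with
+ * binning and a penalty for slabs marked unreclaimable):
+ *
+ *	// sketch: sort more-allocated slabs (higher slab_refcnt) toward the
+ *	// front of the free list, breaking ties by slab address
+ *	static int
+ *	slab_usage_cmp(const void *p0, const void *p1)
+ *	{
+ *		const kmem_slab_t *s0 = p0;
+ *		const kmem_slab_t *s1 = p1;
+ *
+ *		if (s0->slab_refcnt != s1->slab_refcnt)
+ *			return (s0->slab_refcnt > s1->slab_refcnt ? -1 : 1);
+ *		if ((uintptr_t)s0 != (uintptr_t)s1)
+ *			return ((uintptr_t)s0 < (uintptr_t)s1 ? -1 : 1);
+ *		return (0);
+ *	}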
+ * + * Although slabs tend naturally toward the ideal order, misorders allowed by a + * simple list implementation defeat the consolidator's strategy of merging + * least- and most-allocated slabs. Without an AVL tree to guarantee order, kmem + * needs another way to fix misorders to optimize its callback strategy. One + * approach is to periodically scan a limited number of slabs, advancing a + * marker to hold the current scan position, and to move extreme misorders to + * the front or back of the free list and to the front or back of the current + * scan range. By making consecutive scan ranges overlap by one slab, the least + * allocated slab in the current range can be carried along from the end of one + * scan to the start of the next. + * + * Maintaining partial slabs in an AVL tree relieves kmem of this additional + * task, however. Since most of the cache's activity is in the magazine layer, + * and allocations from the slab layer represent only a startup cost, the + * overhead of maintaining a balanced tree is not a significant concern compared + * to the opportunity of reducing complexity by eliminating the partial slab + * scanner just described. The overhead of an AVL tree is minimized by + * maintaining only partial slabs in the tree and keeping completely allocated + * slabs separately in a list. To avoid increasing the size of the slab + * structure the AVL linkage pointers are reused for the slab's list linkage, + * since the slab will always be either partial or complete, never stored both + * ways at the same time. To further minimize the overhead of the AVL tree the + * compare function that orders partial slabs by usage divides the range of + * allocated object counts into bins such that counts within the same bin are + * considered equal. Binning partial slabs makes it less likely that allocating + * or freeing a single object will change the slab's order, requiring a tree + * reinsertion (an avl_remove() followed by an avl_add(), both potentially + * requiring some rebalancing of the tree). Allocation counts closest to + * completely free and completely allocated are left unbinned (finely sorted) to + * better support the consolidator's strategy of merging slabs at either + * extreme. + * + * 3.1 Assessing Fragmentation and Selecting Candidate Slabs + * + * The consolidator piggybacks on the kmem maintenance thread and is called on + * the same interval as kmem_cache_update(), once per cache every fifteen + * seconds. kmem maintains a running count of unallocated objects in the slab + * layer (cache_bufslab). The consolidator checks whether that number exceeds + * 12.5% (1/8) of the total objects in the cache (cache_buftotal), and whether + * there is a significant number of slabs in the cache (arbitrarily a minimum + * 101 total slabs). Unused objects that have fallen out of the magazine layer's + * working set are included in the assessment, and magazines in the depot are + * reaped if those objects would lift cache_bufslab above the fragmentation + * threshold. Once the consolidator decides that a cache is fragmented, it looks + * for a candidate slab to reclaim, starting at the end of the partial slab free + * list and scanning backwards. At first the consolidator is choosy: only a slab + * with fewer than 12.5% (1/8) of its objects allocated qualifies (or else a + * single allocated object, regardless of percentage). 
If there is difficulty + * finding a candidate slab, kmem raises the allocation threshold incrementally, + * up to a maximum 87.5% (7/8), so that eventually the consolidator will reduce + * external fragmentation (unused objects on the free list) below 12.5% (1/8), + * even in the worst case of every slab in the cache being almost 7/8 allocated. + * The threshold can also be lowered incrementally when candidate slabs are easy + * to find, and the threshold is reset to the minimum 1/8 as soon as the cache + * is no longer fragmented. + * + * 3.2 Generating Callbacks + * + * Once an eligible slab is chosen, a callback is generated for every allocated + * object on the slab, in the hope that the client will move everything off the + * slab and make it reclaimable. Objects selected as move destinations are + * chosen from slabs at the front of the free list. Assuming slabs in the ideal + * order (most allocated at the front, least allocated at the back) and a + * cooperative client, the consolidator will succeed in removing slabs from both + * ends of the free list, completely allocating on the one hand and completely + * freeing on the other. Objects selected as move destinations are allocated in + * the kmem maintenance thread where move requests are enqueued. A separate + * callback thread removes pending callbacks from the queue and calls the + * client. The separate thread ensures that client code (the move function) does + * not interfere with internal kmem maintenance tasks. A map of pending + * callbacks keyed by object address (the object to be moved) is checked to + * ensure that duplicate callbacks are not generated for the same object. + * Allocating the move destination (the object to move to) prevents subsequent + * callbacks from selecting the same destination as an earlier pending callback. + * + * Move requests can also be generated by kmem_cache_reap() when the system is + * desperate for memory and by kmem_cache_move_notify(), called by the client to + * notify kmem that a move refused earlier with KMEM_CBRC_LATER is now possible. + * The map of pending callbacks is protected by the same lock that protects the + * slab layer. + * + * When the system is desperate for memory, kmem does not bother to determine + * whether or not the cache exceeds the fragmentation threshold, but tries to + * consolidate as many slabs as possible. Normally, the consolidator chews + * slowly, one sparsely allocated slab at a time during each maintenance + * interval that the cache is fragmented. When desperate, the consolidator + * starts at the last partial slab and enqueues callbacks for every allocated + * object on every partial slab, working backwards until it reaches the first + * partial slab. The first partial slab, meanwhile, advances in pace with the + * consolidator as allocations to supply move destinations for the enqueued + * callbacks use up the highly allocated slabs at the front of the free list. + * Ideally, the overgrown free list collapses like an accordion, starting at + * both ends and ending at the center with a single partial slab. + * + * 3.3 Client Responses + * + * When the client returns KMEM_CBRC_NO in response to the move callback, kmem + * marks the slab that supplied the stuck object non-reclaimable and moves it to + * the front of the free list. The slab remains marked as long as it remains on + * the free list, and it appears more allocated to the partial slab compare + * function + * than any unmarked slab, no matter how many of its objects are allocated.
+ * Since even one immovable object ties up the entire slab, the goal is to + * completely allocate any slab that cannot be completely freed. kmem does not + * bother generating callbacks to move objects from a marked slab unless the + * system is desperate. + * + * When the client responds KMEM_CBRC_LATER, kmem increments a count for the + * slab. If the client responds LATER too many times, kmem disbelieves and + * treats the response as a NO. The count is cleared when the slab is taken off + * the partial slab list or when the client moves one of the slab's objects. + * + * 4. Observability + * + * A kmem cache's external fragmentation is best observed with 'mdb -k' using + * the ::kmem_slabs dcmd. For a complete description of the command, enter + * '::help kmem_slabs' at the mdb prompt. */ #include <sys/kmem_impl.h> @@ -50,6 +813,7 @@ #include <sys/systm.h> #include <sys/cmn_err.h> #include <sys/debug.h> +#include <sys/sdt.h> #include <sys/mutex.h> #include <sys/bitmap.h> #include <sys/atomic.h> @@ -64,6 +828,9 @@ #include <sys/id32.h> #include <sys/zone.h> #include <sys/netstack.h> +#ifdef DEBUG +#include <sys/random.h> +#endif extern void streams_msg_init(void); extern int segkp_fromheap; @@ -96,6 +863,13 @@ struct kmem_cache_kstat { kstat_named_t kmc_full_magazines; kstat_named_t kmc_empty_magazines; kstat_named_t kmc_magazine_size; + kstat_named_t kmc_move_callbacks; + kstat_named_t kmc_move_yes; + kstat_named_t kmc_move_no; + kstat_named_t kmc_move_later; + kstat_named_t kmc_move_dont_need; + kstat_named_t kmc_move_dont_know; + kstat_named_t kmc_move_hunt_found; } kmem_cache_kstat = { { "buf_size", KSTAT_DATA_UINT64 }, { "align", KSTAT_DATA_UINT64 }, @@ -123,6 +897,13 @@ struct kmem_cache_kstat { { "full_magazines", KSTAT_DATA_UINT64 }, { "empty_magazines", KSTAT_DATA_UINT64 }, { "magazine_size", KSTAT_DATA_UINT64 }, + { "move_callbacks", KSTAT_DATA_UINT64 }, + { "move_yes", KSTAT_DATA_UINT64 }, + { "move_no", KSTAT_DATA_UINT64 }, + { "move_later", KSTAT_DATA_UINT64 }, + { "move_dont_need", KSTAT_DATA_UINT64 }, + { "move_dont_know", KSTAT_DATA_UINT64 }, + { "move_hunt_found", KSTAT_DATA_UINT64 }, }; static kmutex_t kmem_cache_kstat_lock; @@ -210,7 +991,7 @@ static kmem_cache_t *kmem_bufctl_cache; static kmem_cache_t *kmem_bufctl_audit_cache; static kmutex_t kmem_cache_lock; /* inter-cache linkage only */ -kmem_cache_t kmem_null_cache; +static list_t kmem_caches; static taskq_t *kmem_taskq; static kmutex_t kmem_flags_lock; @@ -225,6 +1006,101 @@ static vmem_t *kmem_default_arena; static vmem_t *kmem_firewall_va_arena; static vmem_t *kmem_firewall_arena; +/* + * Define KMEM_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. + */ +#ifdef DEBUG +#define KMEM_STATS +#endif /* DEBUG */ + +#ifdef KMEM_STATS +#define KMEM_STAT_ADD(stat) ((stat)++) +#define KMEM_STAT_COND_ADD(cond, stat) ((void) (!(cond) || (stat)++)) +#else +#define KMEM_STAT_ADD(stat) /* nothing */ +#define KMEM_STAT_COND_ADD(cond, stat) /* nothing */ +#endif /* KMEM_STATS */ + +/* + * kmem slab consolidator thresholds (tunables) + */ +static size_t kmem_frag_minslabs = 101; /* minimum total slabs */ +static size_t kmem_frag_numer = 1; /* free buffers (numerator) */ +static size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */ +/* + * Maximum number of slabs from which to move buffers during a single + * maintenance interval while the system is not low on memory. 
+ */ +static size_t kmem_reclaim_max_slabs = 1; +/* + * Number of slabs to scan backwards from the end of the partial slab list + * when searching for buffers to relocate. + */ +static size_t kmem_reclaim_scan_range = 12; + +#ifdef KMEM_STATS +static struct { + uint64_t kms_callbacks; + uint64_t kms_yes; + uint64_t kms_no; + uint64_t kms_later; + uint64_t kms_dont_need; + uint64_t kms_dont_know; + uint64_t kms_hunt_found_slab; + uint64_t kms_hunt_found_mag; + uint64_t kms_hunt_notfound; + uint64_t kms_hunt_alloc_fail; + uint64_t kms_hunt_lucky; + uint64_t kms_notify; + uint64_t kms_notify_callbacks; + uint64_t kms_disbelief; + uint64_t kms_already_pending; + uint64_t kms_callback_alloc_fail; + uint64_t kms_endscan_slab_destroyed; + uint64_t kms_endscan_nomem; + uint64_t kms_endscan_slab_all_used; + uint64_t kms_endscan_refcnt_changed; + uint64_t kms_endscan_nomove_changed; + uint64_t kms_endscan_freelist; + uint64_t kms_avl_update; + uint64_t kms_avl_noupdate; + uint64_t kms_no_longer_reclaimable; + uint64_t kms_notify_no_longer_reclaimable; + uint64_t kms_alloc_fail; + uint64_t kms_constructor_fail; + uint64_t kms_dead_slabs_freed; + uint64_t kms_defrags; + uint64_t kms_scan_depot_ws_reaps; + uint64_t kms_debug_reaps; + uint64_t kms_debug_move_scans; +} kmem_move_stats; +#endif /* KMEM_STATS */ + +/* consolidator knobs */ +static boolean_t kmem_move_noreap; +static boolean_t kmem_move_blocked; +static boolean_t kmem_move_fulltilt; +static boolean_t kmem_move_any_partial; + +#ifdef DEBUG +/* + * Ensure code coverage by occasionally running the consolidator even when the + * caches are not fragmented (they may never be). These intervals are mean time + * in cache maintenance intervals (kmem_cache_update). + */ +static int kmem_mtb_move = 60; /* defrag 1 slab (~15min) */ +static int kmem_mtb_reap = 1800; /* defrag all slabs (~7.5hrs) */ +#endif /* DEBUG */ + +static kmem_cache_t *kmem_defrag_cache; +static kmem_cache_t *kmem_move_cache; +static taskq_t *kmem_move_taskq; + +static void kmem_cache_scan(kmem_cache_t *); +static void kmem_cache_defrag(kmem_cache_t *); + + kmem_log_header_t *kmem_transaction_log; kmem_log_header_t *kmem_content_log; kmem_log_header_t *kmem_failure_log; @@ -310,8 +1186,8 @@ kmem_cache_applyall(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag) kmem_cache_t *cp; mutex_enter(&kmem_cache_lock); - for (cp = kmem_null_cache.cache_next; cp != &kmem_null_cache; - cp = cp->cache_next) + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) if (tq != NULL) (void) taskq_dispatch(tq, (task_func_t *)func, cp, tqflag); @@ -326,8 +1202,8 @@ kmem_cache_applyall_id(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag) kmem_cache_t *cp; mutex_enter(&kmem_cache_lock); - for (cp = kmem_null_cache.cache_next; cp != &kmem_null_cache; - cp = cp->cache_next) { + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { if (!(cp->cache_cflags & KMC_IDENTIFIER)) continue; if (tq != NULL) @@ -348,8 +1224,15 @@ kmem_findslab(kmem_cache_t *cp, void *buf) kmem_slab_t *sp; mutex_enter(&cp->cache_lock); - for (sp = cp->cache_nullslab.slab_next; - sp != &cp->cache_nullslab; sp = sp->slab_next) { + for (sp = list_head(&cp->cache_complete_slabs); sp != NULL; + sp = list_next(&cp->cache_complete_slabs, sp)) { + if (KMEM_SLAB_MEMBER(sp, buf)) { + mutex_exit(&cp->cache_lock); + return (sp); + } + } + for (sp = avl_first(&cp->cache_partial_slabs); sp != NULL; + sp = AVL_NEXT(&cp->cache_partial_slabs, sp)) { if (KMEM_SLAB_MEMBER(sp, buf)) { 
mutex_exit(&cp->cache_lock); return (sp); @@ -376,8 +1259,8 @@ kmem_error(int error, kmem_cache_t *cparg, void *bufarg) sp = kmem_findslab(cp, buf); if (sp == NULL) { - for (cp = kmem_null_cache.cache_prev; cp != &kmem_null_cache; - cp = cp->cache_prev) { + for (cp = list_tail(&kmem_caches); cp != NULL; + cp = list_prev(&kmem_caches, cp)) { if ((sp = kmem_findslab(cp, buf)) != NULL) break; } @@ -615,6 +1498,8 @@ kmem_slab_create(kmem_cache_t *cp, int kmflag) kmem_bufctl_t *bcp; vmem_t *vmp = cp->cache_arena; + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + color = cp->cache_color + cp->cache_align; if (color > cp->cache_maxcolor) color = cp->cache_mincolor; @@ -627,6 +1512,13 @@ kmem_slab_create(kmem_cache_t *cp, int kmflag) ASSERT(P2PHASE((uintptr_t)slab, vmp->vm_quantum) == 0); + /* + * Reverify what was already checked in kmem_cache_set_move(), since the + * consolidator depends (for correctness) on slabs being initialized + * with the 0xbaddcafe memory pattern (setting a low order bit usable by + * clients to distinguish uninitialized memory from known objects). + */ + ASSERT((cp->cache_move == NULL) || !(cp->cache_cflags & KMC_NOTOUCH)); if (!(cp->cache_cflags & KMC_NOTOUCH)) copy_pattern(KMEM_UNINITIALIZED_PATTERN, slab, slabsize); @@ -644,6 +1536,9 @@ kmem_slab_create(kmem_cache_t *cp, int kmflag) sp->slab_refcnt = 0; sp->slab_base = buf = slab + color; sp->slab_chunks = chunks; + sp->slab_stuck_offset = (uint32_t)-1; + sp->slab_later_count = 0; + sp->slab_flags = 0; ASSERT(chunks > 0); while (chunks-- != 0) { @@ -710,6 +1605,9 @@ kmem_slab_destroy(kmem_cache_t *cp, kmem_slab_t *sp) vmem_t *vmp = cp->cache_arena; void *slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, vmp->vm_quantum); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + ASSERT(sp->slab_refcnt == 0); + if (cp->cache_flags & KMF_HASH) { kmem_bufctl_t *bcp; while ((bcp = sp->slab_head) != NULL) { @@ -721,53 +1619,51 @@ kmem_slab_destroy(kmem_cache_t *cp, kmem_slab_t *sp) vmem_free(vmp, slab, cp->cache_slabsize); } -/* - * Allocate a raw (unconstructed) buffer from cp's slab layer. - */ static void * -kmem_slab_alloc(kmem_cache_t *cp, int kmflag) +kmem_slab_alloc_impl(kmem_cache_t *cp, kmem_slab_t *sp) { kmem_bufctl_t *bcp, **hash_bucket; - kmem_slab_t *sp; void *buf; - mutex_enter(&cp->cache_lock); - cp->cache_slab_alloc++; - sp = cp->cache_freelist; + ASSERT(MUTEX_HELD(&cp->cache_lock)); + /* + * kmem_slab_alloc() drops cache_lock when it creates a new slab, so we + * can't ASSERT(avl_is_empty(&cp->cache_partial_slabs)) here when the + * slab is newly created (sp->slab_refcnt == 0). + */ + ASSERT((sp->slab_refcnt == 0) || (KMEM_SLAB_IS_PARTIAL(sp) && + (sp == avl_first(&cp->cache_partial_slabs)))); ASSERT(sp->slab_cache == cp); - if (sp->slab_head == NULL) { - ASSERT(cp->cache_bufslab == 0); - - /* - * The freelist is empty. Create a new slab. 
- */ - mutex_exit(&cp->cache_lock); - if ((sp = kmem_slab_create(cp, kmflag)) == NULL) - return (NULL); - mutex_enter(&cp->cache_lock); - cp->cache_slab_create++; - if ((cp->cache_buftotal += sp->slab_chunks) > cp->cache_bufmax) - cp->cache_bufmax = cp->cache_buftotal; - cp->cache_bufslab += sp->slab_chunks; - sp->slab_next = cp->cache_freelist; - sp->slab_prev = cp->cache_freelist->slab_prev; - sp->slab_next->slab_prev = sp; - sp->slab_prev->slab_next = sp; - cp->cache_freelist = sp; - } + cp->cache_slab_alloc++; cp->cache_bufslab--; sp->slab_refcnt++; - ASSERT(sp->slab_refcnt <= sp->slab_chunks); - /* - * If we're taking the last buffer in the slab, - * remove the slab from the cache's freelist. - */ bcp = sp->slab_head; if ((sp->slab_head = bcp->bc_next) == NULL) { - cp->cache_freelist = sp->slab_next; - ASSERT(sp->slab_refcnt == sp->slab_chunks); + ASSERT(KMEM_SLAB_IS_ALL_USED(sp)); + if (sp->slab_refcnt == 1) { + ASSERT(sp->slab_chunks == 1); + } else { + ASSERT(sp->slab_chunks > 1); /* the slab was partial */ + avl_remove(&cp->cache_partial_slabs, sp); + sp->slab_later_count = 0; /* clear history */ + sp->slab_flags &= ~KMEM_SLAB_NOMOVE; + sp->slab_stuck_offset = (uint32_t)-1; + } + list_insert_head(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count++; + } else { + ASSERT(KMEM_SLAB_IS_PARTIAL(sp)); + if (sp->slab_refcnt == 1) { + avl_add(&cp->cache_partial_slabs, sp); + } else { + /* + * The slab is now more allocated than it was, so the + * order remains unchanged. + */ + ASSERT(!avl_update(&cp->cache_partial_slabs, sp)); + } } if (cp->cache_flags & KMF_HASH) { @@ -786,12 +1682,49 @@ kmem_slab_alloc(kmem_cache_t *cp, int kmflag) } ASSERT(KMEM_SLAB_MEMBER(sp, buf)); + return (buf); +} + +/* + * Allocate a raw (unconstructed) buffer from cp's slab layer. + */ +static void * +kmem_slab_alloc(kmem_cache_t *cp, int kmflag) +{ + kmem_slab_t *sp; + void *buf; + + mutex_enter(&cp->cache_lock); + sp = avl_first(&cp->cache_partial_slabs); + if (sp == NULL) { + ASSERT(cp->cache_bufslab == 0); + + /* + * The freelist is empty. Create a new slab. + */ + mutex_exit(&cp->cache_lock); + if ((sp = kmem_slab_create(cp, kmflag)) == NULL) { + return (NULL); + } + mutex_enter(&cp->cache_lock); + cp->cache_slab_create++; + if ((cp->cache_buftotal += sp->slab_chunks) > cp->cache_bufmax) + cp->cache_bufmax = cp->cache_buftotal; + cp->cache_bufslab += sp->slab_chunks; + } + buf = kmem_slab_alloc_impl(cp, sp); + ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) == + (cp->cache_complete_slab_count + + avl_numnodes(&cp->cache_partial_slabs) + + (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount))); mutex_exit(&cp->cache_lock); return (buf); } +static void kmem_slab_move_yes(kmem_cache_t *, kmem_slab_t *, void *); + /* * Free a raw (unconstructed) buffer to cp's slab layer. */ @@ -831,6 +1764,17 @@ kmem_slab_free(kmem_cache_t *cp, void *buf) return; } + if (KMEM_SLAB_OFFSET(sp, buf) == sp->slab_stuck_offset) { + /* + * If this is the buffer that prevented the consolidator from + * clearing the slab, we can reset the slab flags now that the + * buffer is freed. (It makes sense to do this in + * kmem_cache_free(), where the client gives up ownership of the + * buffer, but on the hot path the test is too expensive.) 
+ */ + kmem_slab_move_yes(cp, sp, buf); + } + if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) { if (cp->cache_flags & KMF_CONTENTS) ((kmem_bufctl_audit_t *)bcp)->bc_contents = @@ -839,45 +1783,93 @@ kmem_slab_free(kmem_cache_t *cp, void *buf) KMEM_AUDIT(kmem_transaction_log, cp, bcp); } - /* - * If this slab isn't currently on the freelist, put it there. - */ - if (sp->slab_head == NULL) { - ASSERT(sp->slab_refcnt == sp->slab_chunks); - ASSERT(cp->cache_freelist != sp); - sp->slab_next->slab_prev = sp->slab_prev; - sp->slab_prev->slab_next = sp->slab_next; - sp->slab_next = cp->cache_freelist; - sp->slab_prev = cp->cache_freelist->slab_prev; - sp->slab_next->slab_prev = sp; - sp->slab_prev->slab_next = sp; - cp->cache_freelist = sp; - } - bcp->bc_next = sp->slab_head; sp->slab_head = bcp; cp->cache_bufslab++; ASSERT(sp->slab_refcnt >= 1); + if (--sp->slab_refcnt == 0) { /* * There are no outstanding allocations from this slab, * so we can reclaim the memory. */ - sp->slab_next->slab_prev = sp->slab_prev; - sp->slab_prev->slab_next = sp->slab_next; - if (sp == cp->cache_freelist) - cp->cache_freelist = sp->slab_next; - cp->cache_slab_destroy++; + if (sp->slab_chunks == 1) { + list_remove(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count--; + } else { + avl_remove(&cp->cache_partial_slabs, sp); + } + cp->cache_buftotal -= sp->slab_chunks; cp->cache_bufslab -= sp->slab_chunks; - mutex_exit(&cp->cache_lock); - kmem_slab_destroy(cp, sp); + /* + * Defer releasing the slab to the virtual memory subsystem + * while there is a pending move callback, since we guarantee + * that buffers passed to the move callback have only been + * touched by kmem or by the client itself. Since the memory + * patterns baddcafe (uninitialized) and deadbeef (freed) both + * set at least one of the two lowest order bits, the client can + * test those bits in the move callback to determine whether or + * not it knows about the buffer (assuming that the client also + * sets one of those low order bits whenever it frees a buffer). + */ + if (cp->cache_defrag == NULL || + (avl_is_empty(&cp->cache_defrag->kmd_moves_pending) && + !(sp->slab_flags & KMEM_SLAB_MOVE_PENDING))) { + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + } else { + list_t *deadlist = &cp->cache_defrag->kmd_deadlist; + /* + * Slabs are inserted at both ends of the deadlist to + * distinguish between slabs freed while move callbacks + * are pending (list head) and a slab freed while the + * lock is dropped in kmem_move_buffers() (list tail) so + * that in both cases slab_destroy() is called from the + * right context. + */ + if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) { + list_insert_tail(deadlist, sp); + } else { + list_insert_head(deadlist, sp); + } + cp->cache_defrag->kmd_deadcount++; + mutex_exit(&cp->cache_lock); + } return; } + + if (bcp->bc_next == NULL) { + /* Transition the slab from completely allocated to partial. 
*/ + ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1)); + ASSERT(sp->slab_chunks > 1); + list_remove(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count--; + avl_add(&cp->cache_partial_slabs, sp); + } else { +#ifdef DEBUG + if (avl_update_gt(&cp->cache_partial_slabs, sp)) { + KMEM_STAT_ADD(kmem_move_stats.kms_avl_update); + } else { + KMEM_STAT_ADD(kmem_move_stats.kms_avl_noupdate); + } +#else + (void) avl_update_gt(&cp->cache_partial_slabs, sp); +#endif + } + + ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) == + (cp->cache_complete_slab_count + + avl_numnodes(&cp->cache_partial_slabs) + + (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount))); mutex_exit(&cp->cache_lock); } +/* + * Return -1 if kmem_error, 1 if constructor fails, 0 if successful. + */ static int kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct, caddr_t caller) @@ -937,7 +1929,7 @@ kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct, if (cp->cache_flags & KMF_DEADBEEF) copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify); kmem_slab_free(cp, buf); - return (-1); + return (1); } if (cp->cache_flags & KMF_AUDIT) { @@ -1016,7 +2008,8 @@ kmem_magazine_destroy(kmem_cache_t *cp, kmem_magazine_t *mp, int nrounds) { int round; - ASSERT(cp->cache_next == NULL || taskq_member(kmem_taskq, curthread)); + ASSERT(!list_link_active(&cp->cache_link) || + taskq_member(kmem_taskq, curthread)); for (round = 0; round < nrounds; round++) { void *buf = mp->mag_round[round]; @@ -1113,7 +2106,8 @@ kmem_depot_ws_reap(kmem_cache_t *cp) long reap; kmem_magazine_t *mp; - ASSERT(cp->cache_next == NULL || taskq_member(kmem_taskq, curthread)); + ASSERT(!list_link_active(&cp->cache_link) || + taskq_member(kmem_taskq, curthread)); reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min); while (reap-- && (mp = kmem_depot_alloc(cp, &cp->cache_full)) != NULL) @@ -1159,7 +2153,7 @@ kmem_cache_alloc(kmem_cache_t *cp, int kmflag) mutex_exit(&ccp->cc_lock); if ((ccp->cc_flags & KMF_BUFTAG) && kmem_cache_alloc_debug(cp, buf, kmflag, 0, - caller()) == -1) { + caller()) != 0) { if (kmflag & KM_NOSLEEP) return (NULL); mutex_enter(&ccp->cc_lock); @@ -1216,14 +2210,17 @@ kmem_cache_alloc(kmem_cache_t *cp, int kmflag) /* * Make kmem_cache_alloc_debug() apply the constructor for us. */ - if (kmem_cache_alloc_debug(cp, buf, kmflag, 1, - caller()) == -1) { + int rc = kmem_cache_alloc_debug(cp, buf, kmflag, 1, caller()); + if (rc != 0) { if (kmflag & KM_NOSLEEP) return (NULL); /* * kmem_cache_alloc_debug() detected corruption - * but didn't panic (kmem_panic <= 0). Try again. + * but didn't panic (kmem_panic <= 0). We should not be + * here because the constructor failed (indicated by a + * return code of 1). Try again. */ + ASSERT(rc == -1); return (kmem_cache_alloc(cp, kmflag)); } return (buf); @@ -1240,6 +2237,38 @@ kmem_cache_alloc(kmem_cache_t *cp, int kmflag) } /* + * The freed argument tells whether or not kmem_cache_free_debug() has already + * been called so that we can avoid the duplicate free error. For example, a + * buffer on a magazine has already been freed by the client but is still + * constructed. + */ +static void +kmem_slab_free_constructed(kmem_cache_t *cp, void *buf, boolean_t freed) +{ + if (!freed && (cp->cache_flags & KMF_BUFTAG)) + if (kmem_cache_free_debug(cp, buf, caller()) == -1) + return; + + /* + * Note that if KMF_DEADBEEF is in effect and KMF_LITE is not, + * kmem_cache_free_debug() will have already applied the destructor. 
+ */ + if ((cp->cache_flags & (KMF_DEADBEEF | KMF_LITE)) != KMF_DEADBEEF && + cp->cache_destructor != NULL) { + if (cp->cache_flags & KMF_DEADBEEF) { /* KMF_LITE implied */ + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + *(uint64_t *)buf = btp->bt_redzone; + cp->cache_destructor(buf, cp->cache_private); + *(uint64_t *)buf = KMEM_FREE_PATTERN; + } else { + cp->cache_destructor(buf, cp->cache_private); + } + } + + kmem_slab_free(cp, buf); +} + +/* * Free a constructed object to cache cp. */ void @@ -1249,6 +2278,15 @@ kmem_cache_free(kmem_cache_t *cp, void *buf) kmem_magazine_t *emp; kmem_magtype_t *mtp; + /* + * The client must not free either of the buffers passed to the move + * callback function. + */ + ASSERT(cp->cache_defrag == NULL || + cp->cache_defrag->kmd_thread != curthread || + (buf != cp->cache_defrag->kmd_from_buf && + buf != cp->cache_defrag->kmd_to_buf)); + if (ccp->cc_flags & KMF_BUFTAG) if (kmem_cache_free_debug(cp, buf, caller()) == -1) return; @@ -1337,22 +2375,8 @@ kmem_cache_free(kmem_cache_t *cp, void *buf) /* * We couldn't free our constructed object to the magazine layer, * so apply its destructor and free it to the slab layer. - * Note that if KMF_DEADBEEF is in effect and KMF_LITE is not, - * kmem_cache_free_debug() will have already applied the destructor. */ - if ((cp->cache_flags & (KMF_DEADBEEF | KMF_LITE)) != KMF_DEADBEEF && - cp->cache_destructor != NULL) { - if (cp->cache_flags & KMF_DEADBEEF) { /* KMF_LITE implied */ - kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); - *(uint64_t *)buf = btp->bt_redzone; - cp->cache_destructor(buf, cp->cache_private); - *(uint64_t *)buf = KMEM_FREE_PATTERN; - } else { - cp->cache_destructor(buf, cp->cache_private); - } - } - - kmem_slab_free(cp, buf); + kmem_slab_free_constructed(cp, buf, B_TRUE); } void * @@ -1527,6 +2551,8 @@ kmem_alloc_tryhard(size_t size, size_t *asize, int kmflag) static void kmem_cache_reap(kmem_cache_t *cp) { + ASSERT(taskq_member(kmem_taskq, curthread)); + /* * Ask the cache's owner to free some memory if possible. * The idea is to handle things like the inode cache, which @@ -1534,10 +2560,29 @@ kmem_cache_reap(kmem_cache_t *cp) * *need*. Reclaim policy is entirely up to the owner; this * callback is just an advisory plea for help. */ - if (cp->cache_reclaim != NULL) + if (cp->cache_reclaim != NULL) { + long delta; + + /* + * Reclaimed memory should be reapable (not included in the + * depot's working set). 
+ */ + delta = cp->cache_full.ml_total; cp->cache_reclaim(cp->cache_private); + delta = cp->cache_full.ml_total - delta; + if (delta > 0) { + mutex_enter(&cp->cache_depot_lock); + cp->cache_full.ml_reaplimit += delta; + cp->cache_full.ml_min += delta; + mutex_exit(&cp->cache_depot_lock); + } + } kmem_depot_ws_reap(cp); + + if (cp->cache_defrag != NULL && !kmem_move_noreap) { + kmem_cache_defrag(cp); + } } static void @@ -1634,7 +2679,8 @@ kmem_cache_magazine_purge(kmem_cache_t *cp) kmem_magazine_t *mp, *pmp; int rounds, prounds, cpu_seqid; - ASSERT(cp->cache_next == NULL || taskq_member(kmem_taskq, curthread)); + ASSERT(!list_link_active(&cp->cache_link) || + taskq_member(kmem_taskq, curthread)); ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { @@ -1696,6 +2742,8 @@ kmem_cache_magazine_enable(kmem_cache_t *cp) void kmem_cache_reap_now(kmem_cache_t *cp) { + ASSERT(list_link_active(&cp->cache_link)); + kmem_depot_ws_update(cp); kmem_depot_ws_update(cp); @@ -1785,8 +2833,8 @@ kmem_hash_rescale(kmem_cache_t *cp) } /* - * Perform periodic maintenance on a cache: hash rescaling, - * depot working-set update, and magazine resizing. + * Perform periodic maintenance on a cache: hash rescaling, depot working-set + * update, magazine resizing, and slab consolidation. */ static void kmem_cache_update(kmem_cache_t *cp) @@ -1837,6 +2885,10 @@ kmem_cache_update(kmem_cache_t *cp) if (need_magazine_resize) (void) taskq_dispatch(kmem_taskq, (task_func_t *)kmem_cache_magazine_resize, cp, TQ_NOSLEEP); + + if (cp->cache_defrag != NULL) + (void) taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_cache_scan, cp, TQ_NOSLEEP); } static void @@ -1936,6 +2988,25 @@ kmem_cache_kstat_update(kstat_t *ksp, int rw) kmcp->kmc_hash_rescale.value.ui64 = cp->cache_rescale; kmcp->kmc_vmem_source.value.ui64 = cp->cache_arena->vm_id; + if (cp->cache_defrag == NULL) { + kmcp->kmc_move_callbacks.value.ui64 = 0; + kmcp->kmc_move_yes.value.ui64 = 0; + kmcp->kmc_move_no.value.ui64 = 0; + kmcp->kmc_move_later.value.ui64 = 0; + kmcp->kmc_move_dont_need.value.ui64 = 0; + kmcp->kmc_move_dont_know.value.ui64 = 0; + kmcp->kmc_move_hunt_found.value.ui64 = 0; + } else { + kmem_defrag_t *kd = cp->cache_defrag; + kmcp->kmc_move_callbacks.value.ui64 = kd->kmd_callbacks; + kmcp->kmc_move_yes.value.ui64 = kd->kmd_yes; + kmcp->kmc_move_no.value.ui64 = kd->kmd_no; + kmcp->kmc_move_later.value.ui64 = kd->kmd_later; + kmcp->kmc_move_dont_need.value.ui64 = kd->kmd_dont_need; + kmcp->kmc_move_dont_know.value.ui64 = kd->kmd_dont_know; + kmcp->kmc_move_hunt_found.value.ui64 = kd->kmd_hunt_found; + } + mutex_exit(&cp->cache_lock); return (0); } @@ -2007,6 +3078,109 @@ kmem_debugging(void) return (kmem_flags & (KMF_AUDIT | KMF_REDZONE)); } +/* binning function, sorts finely at the two extremes */ +#define KMEM_PARTIAL_SLAB_WEIGHT(sp, binshift) \ + ((((sp)->slab_refcnt <= (binshift)) || \ + (((sp)->slab_chunks - (sp)->slab_refcnt) <= (binshift))) \ + ? -(sp)->slab_refcnt \ + : -((binshift) + ((sp)->slab_refcnt >> (binshift)))) + +/* + * Minimizing the number of partial slabs on the freelist minimizes + * fragmentation (the ratio of unused buffers held by the slab layer). There are + * two ways to get a slab off of the freelist: 1) free all the buffers on the + * slab, and 2) allocate all the buffers on the slab. 
It follows that we want + * the most-used slabs at the front of the list where they have the best chance + * of being completely allocated, and the least-used slabs at a safe distance + * from the front to improve the odds that the few remaining buffers will all be + * freed before another allocation can tie up the slab. For that reason a slab + * with a higher slab_refcnt sorts less than than a slab with a lower + * slab_refcnt. + * + * However, if a slab has at least one buffer that is deemed unfreeable, we + * would rather have that slab at the front of the list regardless of + * slab_refcnt, since even one unfreeable buffer makes the entire slab + * unfreeable. If the client returns KMEM_CBRC_NO in response to a cache_move() + * callback, the slab is marked unfreeable for as long as it remains on the + * freelist. + */ +static int +kmem_partial_slab_cmp(const void *p0, const void *p1) +{ + const kmem_cache_t *cp; + const kmem_slab_t *s0 = p0; + const kmem_slab_t *s1 = p1; + int w0, w1; + size_t binshift; + + ASSERT(KMEM_SLAB_IS_PARTIAL(s0)); + ASSERT(KMEM_SLAB_IS_PARTIAL(s1)); + ASSERT(s0->slab_cache == s1->slab_cache); + cp = s1->slab_cache; + ASSERT(MUTEX_HELD(&cp->cache_lock)); + binshift = cp->cache_partial_binshift; + + /* weight of first slab */ + w0 = KMEM_PARTIAL_SLAB_WEIGHT(s0, binshift); + if (s0->slab_flags & KMEM_SLAB_NOMOVE) { + w0 -= cp->cache_maxchunks; + } + + /* weight of second slab */ + w1 = KMEM_PARTIAL_SLAB_WEIGHT(s1, binshift); + if (s1->slab_flags & KMEM_SLAB_NOMOVE) { + w1 -= cp->cache_maxchunks; + } + + if (w0 < w1) + return (-1); + if (w0 > w1) + return (1); + + /* compare pointer values */ + if ((uintptr_t)s0 < (uintptr_t)s1) + return (-1); + if ((uintptr_t)s0 > (uintptr_t)s1) + return (1); + + return (0); +} + +static void +kmem_check_destructor(kmem_cache_t *cp) +{ + if (cp->cache_destructor == NULL) + return; + + /* + * Assert that it is valid to call the destructor on a newly constructed + * object without any intervening client code using the object. + * Allocate from the slab layer to ensure that the client has not + * touched the buffer. + */ + void *buf = kmem_slab_alloc(cp, KM_NOSLEEP); + if (buf == NULL) + return; + + if (cp->cache_flags & KMF_BUFTAG) { + if (kmem_cache_alloc_debug(cp, buf, KM_NOSLEEP, 1, + caller()) != 0) + return; + } else if (cp->cache_constructor != NULL && + cp->cache_constructor(buf, cp->cache_private, KM_NOSLEEP) != 0) { + atomic_add_64(&cp->cache_alloc_fail, 1); + kmem_slab_free(cp, buf); + return; + } + + kmem_slab_free_constructed(cp, buf, B_FALSE); +} + +/* + * It must be valid to call the destructor (if any) on a newly created object. + * That is, the constructor (if any) must leave the object in a valid state for + * the destructor. + */ kmem_cache_t * kmem_cache_create( char *name, /* descriptive name for this cache */ @@ -2021,7 +3195,7 @@ kmem_cache_create( { int cpu_seqid; size_t chunksize; - kmem_cache_t *cp, *cnext, *cprev; + kmem_cache_t *cp; kmem_magtype_t *mtp; size_t csize = KMEM_CACHE_SIZE(max_ncpus); @@ -2056,6 +3230,7 @@ kmem_cache_create( cp = vmem_xalloc(kmem_cache_arena, csize, KMEM_CPU_CACHE_SIZE, P2NPHASE(csize, KMEM_CPU_CACHE_SIZE), 0, NULL, NULL, VM_SLEEP); bzero(cp, csize); + list_link_init(&cp->cache_link); if (align == 0) align = KMEM_ALIGN; @@ -2136,7 +3311,7 @@ kmem_cache_create( * Set cache properties. 
*/ (void) strncpy(cp->cache_name, name, KMEM_CACHE_NAMELEN); - strident_canon(cp->cache_name, KMEM_CACHE_NAMELEN); + strident_canon(cp->cache_name, KMEM_CACHE_NAMELEN + 1); cp->cache_bufsize = bufsize; cp->cache_align = align; cp->cache_constructor = constructor; @@ -2215,6 +3390,9 @@ kmem_cache_create( cp->cache_flags |= KMF_HASH; } + cp->cache_maxchunks = (cp->cache_slabsize / cp->cache_chunksize); + cp->cache_partial_binshift = highbit(cp->cache_maxchunks / 16) + 1; + if (cp->cache_flags & KMF_HASH) { ASSERT(!(cflags & KMC_NOHASH)); cp->cache_bufctl_cache = (cp->cache_flags & KMF_AUDIT) ? @@ -2231,11 +3409,13 @@ kmem_cache_create( */ mutex_init(&cp->cache_lock, NULL, MUTEX_DEFAULT, NULL); - cp->cache_freelist = &cp->cache_nullslab; - cp->cache_nullslab.slab_cache = cp; - cp->cache_nullslab.slab_refcnt = -1; - cp->cache_nullslab.slab_next = &cp->cache_nullslab; - cp->cache_nullslab.slab_prev = &cp->cache_nullslab; + avl_create(&cp->cache_partial_slabs, kmem_partial_slab_cmp, + sizeof (kmem_slab_t), offsetof(kmem_slab_t, slab_link)); + /* LINTED: E_TRUE_LOGICAL_EXPR */ + ASSERT(sizeof (list_node_t) <= sizeof (avl_node_t)); + /* reuse partial slab AVL linkage for complete slab list linkage */ + list_create(&cp->cache_complete_slabs, + sizeof (kmem_slab_t), offsetof(kmem_slab_t, slab_link)); if (cp->cache_flags & KMF_HASH) { cp->cache_hash_table = vmem_alloc(kmem_hash_arena, @@ -2286,18 +3466,117 @@ kmem_cache_create( * to kmem_update(), so the cache must be ready for business. */ mutex_enter(&kmem_cache_lock); - cp->cache_next = cnext = &kmem_null_cache; - cp->cache_prev = cprev = kmem_null_cache.cache_prev; - cnext->cache_prev = cp; - cprev->cache_next = cp; + list_insert_tail(&kmem_caches, cp); mutex_exit(&kmem_cache_lock); if (kmem_ready) kmem_cache_magazine_enable(cp); + if (kmem_move_taskq != NULL && cp->cache_destructor != NULL) { + (void) taskq_dispatch(kmem_move_taskq, + (task_func_t *)kmem_check_destructor, cp, + TQ_NOSLEEP); + } + return (cp); } +static int +kmem_move_cmp(const void *buf, const void *p) +{ + const kmem_move_t *kmm = p; + uintptr_t v1 = (uintptr_t)buf; + uintptr_t v2 = (uintptr_t)kmm->kmm_from_buf; + return (v1 < v2 ? -1 : (v1 > v2 ? 1 : 0)); +} + +static void +kmem_reset_reclaim_threshold(kmem_defrag_t *kmd) +{ + kmd->kmd_reclaim_numer = 1; +} + +/* + * Initially, when choosing candidate slabs for buffers to move, we want to be + * very selective and take only slabs that are less than + * (1 / KMEM_VOID_FRACTION) allocated. If we have difficulty finding candidate + * slabs, then we raise the allocation ceiling incrementally. The reclaim + * threshold is reset to (1 / KMEM_VOID_FRACTION) as soon as the cache is no + * longer fragmented. + */ +static void +kmem_adjust_reclaim_threshold(kmem_defrag_t *kmd, int direction) +{ + if (direction > 0) { + /* make it easier to find a candidate slab */ + if (kmd->kmd_reclaim_numer < (KMEM_VOID_FRACTION - 1)) { + kmd->kmd_reclaim_numer++; + } + } else { + /* be more selective */ + if (kmd->kmd_reclaim_numer > 1) { + kmd->kmd_reclaim_numer--; + } + } +} + +void +kmem_cache_set_move(kmem_cache_t *cp, + kmem_cbrc_t (*move)(void *, void *, size_t, void *)) +{ + kmem_defrag_t *defrag; + + ASSERT(move != NULL); + /* + * The consolidator does not support NOTOUCH caches because kmem cannot + * initialize their slabs with the 0xbaddcafe memory pattern, which sets + * a low order bit usable by clients to distinguish uninitialized memory + * from known objects (see kmem_slab_create). 
+ */ + ASSERT(!(cp->cache_cflags & KMC_NOTOUCH)); + ASSERT(!(cp->cache_cflags & KMC_IDENTIFIER)); + + /* + * We should not be holding anyone's cache lock when calling + * kmem_cache_alloc(), so allocate in all cases before acquiring the + * lock. + */ + defrag = kmem_cache_alloc(kmem_defrag_cache, KM_SLEEP); + + mutex_enter(&cp->cache_lock); + + if (KMEM_IS_MOVABLE(cp)) { + if (cp->cache_move == NULL) { + /* + * The client must not have allocated any objects from + * this cache before setting a move callback function. + */ + ASSERT(cp->cache_bufmax == 0); + + cp->cache_defrag = defrag; + defrag = NULL; /* nothing to free */ + bzero(cp->cache_defrag, sizeof (kmem_defrag_t)); + avl_create(&cp->cache_defrag->kmd_moves_pending, + kmem_move_cmp, sizeof (kmem_move_t), + offsetof(kmem_move_t, kmm_entry)); + /* LINTED: E_TRUE_LOGICAL_EXPR */ + ASSERT(sizeof (list_node_t) <= sizeof (avl_node_t)); + /* reuse the slab's AVL linkage for deadlist linkage */ + list_create(&cp->cache_defrag->kmd_deadlist, + sizeof (kmem_slab_t), + offsetof(kmem_slab_t, slab_link)); + kmem_reset_reclaim_threshold(cp->cache_defrag); + } + cp->cache_move = move; + } + + mutex_exit(&cp->cache_lock); + + if (defrag != NULL) { + kmem_cache_free(kmem_defrag_cache, defrag); /* unused */ + } +} + void kmem_cache_destroy(kmem_cache_t *cp) { @@ -2309,13 +3588,13 @@ kmem_cache_destroy(kmem_cache_t *cp) * complete, purge the cache, and then destroy it. */ mutex_enter(&kmem_cache_lock); - cp->cache_prev->cache_next = cp->cache_next; - cp->cache_next->cache_prev = cp->cache_prev; - cp->cache_prev = cp->cache_next = NULL; + list_remove(&kmem_caches, cp); mutex_exit(&kmem_cache_lock); if (kmem_taskq != NULL) taskq_wait(kmem_taskq); + if (kmem_move_taskq != NULL) + taskq_wait(kmem_move_taskq); kmem_cache_magazine_purge(cp); @@ -2323,14 +3602,22 @@ kmem_cache_destroy(kmem_cache_t *cp) if (cp->cache_buftotal != 0) cmn_err(CE_WARN, "kmem_cache_destroy: '%s' (%p) not empty", cp->cache_name, (void *)cp); - cp->cache_reclaim = NULL; + if (cp->cache_defrag != NULL) { + avl_destroy(&cp->cache_defrag->kmd_moves_pending); + list_destroy(&cp->cache_defrag->kmd_deadlist); + kmem_cache_free(kmem_defrag_cache, cp->cache_defrag); + cp->cache_defrag = NULL; + } /* - * The cache is now dead. There should be no further activity. - * We enforce this by setting land mines in the constructor and - * destructor routines that induce a kernel text fault if invoked. + * The cache is now dead. There should be no further activity. We + * enforce this by setting land mines in the constructor, destructor, + * reclaim, and move routines that induce a kernel text fault if + * invoked. 
*/ cp->cache_constructor = (int (*)(void *, void *, int))1; cp->cache_destructor = (void (*)(void *, void *))2; + cp->cache_reclaim = (void (*)(void *))3; + cp->cache_move = (kmem_cbrc_t (*)(void *, void *, size_t, void *))4; mutex_exit(&cp->cache_lock); kstat_delete(cp->cache_kstat); @@ -2473,8 +3760,8 @@ kmem_init(void) /* LINTED */ ASSERT(sizeof (kmem_cpu_cache_t) == KMEM_CPU_CACHE_SIZE); - kmem_null_cache.cache_next = &kmem_null_cache; - kmem_null_cache.cache_prev = &kmem_null_cache; + list_create(&kmem_caches, sizeof (kmem_cache_t), + offsetof(kmem_cache_t, cache_link)); kmem_metadata_arena = vmem_create("kmem_metadata", NULL, 0, PAGESIZE, vmem_alloc, vmem_free, heap_arena, 8 * PAGESIZE, @@ -2505,9 +3792,6 @@ kmem_init(void) kmem_oversize_arena = vmem_create("kmem_oversize", NULL, 0, PAGESIZE, segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP); - kmem_null_cache.cache_next = &kmem_null_cache; - kmem_null_cache.cache_prev = &kmem_null_cache; - kmem_reap_interval = 15 * hz; /* @@ -2522,7 +3806,7 @@ kmem_init(void) mod_read_system_file(boothowto & RB_ASKNAME); - while ((cp = kmem_null_cache.cache_prev) != &kmem_null_cache) + while ((cp = list_tail(&kmem_caches)) != NULL) kmem_cache_destroy(cp); vmem_destroy(kmem_oversize_arena); @@ -2660,11 +3944,34 @@ kmem_init(void) netstack_init(); } +static void +kmem_move_init(void) +{ + kmem_defrag_cache = kmem_cache_create("kmem_defrag_cache", + sizeof (kmem_defrag_t), 0, NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + kmem_move_cache = kmem_cache_create("kmem_move_cache", + sizeof (kmem_move_t), 0, NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + + /* + * kmem guarantees that move callbacks are sequential and that even + * across multiple caches no two moves ever execute simultaneously. + * Move callbacks are processed on a separate taskq so that client code + * does not interfere with internal maintenance tasks. + */ + kmem_move_taskq = taskq_create_instance("kmem_move_taskq", 0, 1, + minclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE); +} + void kmem_thread_init(void) { + kmem_move_init(); kmem_taskq = taskq_create_instance("kmem_taskq", 0, 1, minclsyspri, 300, INT_MAX, TASKQ_PREPOPULATE); + kmem_cache_applyall(kmem_check_destructor, kmem_move_taskq, + TQ_NOSLEEP); } void @@ -2676,3 +3983,934 @@ kmem_mp_init(void) kmem_update_timeout(NULL); } + +/* + * Return the slab of the allocated buffer, or NULL if the buffer is not + * allocated. This function may be called with a known slab address to determine + * whether or not the buffer is allocated, or with a NULL slab address to obtain + * an allocated buffer's slab. + */ +static kmem_slab_t * +kmem_slab_allocated(kmem_cache_t *cp, kmem_slab_t *sp, void *buf) +{ + kmem_bufctl_t *bcp, *bufbcp; + + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(sp == NULL || KMEM_SLAB_MEMBER(sp, buf)); + + if (cp->cache_flags & KMF_HASH) { + for (bcp = *KMEM_HASH(cp, buf); + (bcp != NULL) && (bcp->bc_addr != buf); + bcp = bcp->bc_next) { + continue; + } + ASSERT(sp != NULL && bcp != NULL ? sp == bcp->bc_slab : 1); + return (bcp == NULL ? NULL : bcp->bc_slab); + } + + if (sp == NULL) { + sp = KMEM_SLAB(cp, buf); + } + bufbcp = KMEM_BUFCTL(cp, buf); + for (bcp = sp->slab_head; + (bcp != NULL) && (bcp != bufbcp); + bcp = bcp->bc_next) { + continue; + } + return (bcp == NULL ? sp : NULL); +} + +static boolean_t +kmem_slab_is_reclaimable(kmem_cache_t *cp, kmem_slab_t *sp, int flags) +{ + long refcnt; + + ASSERT(cp->cache_defrag != NULL); + + /* If we're desperate, we don't care if the client said NO. 
*/ + refcnt = sp->slab_refcnt; + if (flags & KMM_DESPERATE) { + return (refcnt < sp->slab_chunks); /* any partial */ + } + + if (sp->slab_flags & KMEM_SLAB_NOMOVE) { + return (B_FALSE); + } + + if (kmem_move_any_partial) { + return (refcnt < sp->slab_chunks); + } + + if ((refcnt == 1) && (refcnt < sp->slab_chunks)) { + return (B_TRUE); + } + + /* + * The reclaim threshold is adjusted at each kmem_cache_scan() so that + * slabs with a progressively higher percentage of used buffers can be + * reclaimed until the cache as a whole is no longer fragmented. + * + * sp->slab_refcnt kmd_reclaim_numer + * --------------- < ------------------ + * sp->slab_chunks KMEM_VOID_FRACTION + */ + return ((refcnt * KMEM_VOID_FRACTION) < + (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer)); +} + +static void * +kmem_hunt_mag(kmem_cache_t *cp, kmem_magazine_t *m, int n, void *buf, + void *tbuf) +{ + int i; /* magazine round index */ + + for (i = 0; i < n; i++) { + if (buf == m->mag_round[i]) { + if (cp->cache_flags & KMF_BUFTAG) { + (void) kmem_cache_free_debug(cp, tbuf, + caller()); + } + m->mag_round[i] = tbuf; + return (buf); + } + } + + return (NULL); +} + +/* + * Hunt the magazine layer for the given buffer. If found, the buffer is + * removed from the magazine layer and returned, otherwise NULL is returned. + * The state of the returned buffer is freed and constructed. + */ +static void * +kmem_hunt_mags(kmem_cache_t *cp, void *buf) +{ + kmem_cpu_cache_t *ccp; + kmem_magazine_t *m; + int cpu_seqid; + int n; /* magazine rounds */ + void *tbuf; /* temporary swap buffer */ + + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + + /* + * Allocated a buffer to swap with the one we hope to pull out of a + * magazine when found. + */ + tbuf = kmem_cache_alloc(cp, KM_NOSLEEP); + if (tbuf == NULL) { + KMEM_STAT_ADD(kmem_move_stats.kms_hunt_alloc_fail); + return (NULL); + } + if (tbuf == buf) { + KMEM_STAT_ADD(kmem_move_stats.kms_hunt_lucky); + if (cp->cache_flags & KMF_BUFTAG) { + (void) kmem_cache_free_debug(cp, buf, caller()); + } + return (buf); + } + + /* Hunt the depot. */ + mutex_enter(&cp->cache_depot_lock); + n = cp->cache_magtype->mt_magsize; + for (m = cp->cache_full.ml_list; m != NULL; m = m->mag_next) { + if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { + mutex_exit(&cp->cache_depot_lock); + return (buf); + } + } + mutex_exit(&cp->cache_depot_lock); + + /* Hunt the per-CPU magazines. */ + for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) { + ccp = &cp->cache_cpu[cpu_seqid]; + + mutex_enter(&ccp->cc_lock); + m = ccp->cc_loaded; + n = ccp->cc_rounds; + if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { + mutex_exit(&ccp->cc_lock); + return (buf); + } + m = ccp->cc_ploaded; + n = ccp->cc_prounds; + if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { + mutex_exit(&ccp->cc_lock); + return (buf); + } + mutex_exit(&ccp->cc_lock); + } + + kmem_cache_free(cp, tbuf); + return (NULL); +} + +/* + * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(), + * or when the buffer is freed. 
+ */ +static void +kmem_slab_move_yes(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf) +{ + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(KMEM_SLAB_MEMBER(sp, from_buf)); + + if (!KMEM_SLAB_IS_PARTIAL(sp)) { + return; + } + + if (sp->slab_flags & KMEM_SLAB_NOMOVE) { + if (KMEM_SLAB_OFFSET(sp, from_buf) == sp->slab_stuck_offset) { + avl_remove(&cp->cache_partial_slabs, sp); + sp->slab_flags &= ~KMEM_SLAB_NOMOVE; + sp->slab_stuck_offset = (uint32_t)-1; + avl_add(&cp->cache_partial_slabs, sp); + } + } else { + sp->slab_later_count = 0; + sp->slab_stuck_offset = (uint32_t)-1; + } +} + +static void +kmem_slab_move_no(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf) +{ + ASSERT(taskq_member(kmem_move_taskq, curthread)); + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(KMEM_SLAB_MEMBER(sp, from_buf)); + + if (!KMEM_SLAB_IS_PARTIAL(sp)) { + return; + } + + avl_remove(&cp->cache_partial_slabs, sp); + sp->slab_later_count = 0; + sp->slab_flags |= KMEM_SLAB_NOMOVE; + sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, from_buf); + avl_add(&cp->cache_partial_slabs, sp); +} + +static void kmem_move_end(kmem_cache_t *, kmem_move_t *); + +/* + * The move callback takes two buffer addresses, the buffer to be moved, and a + * newly allocated and constructed buffer selected by kmem as the destination. + * It also takes the size of the buffer and an optional user argument specified + * at cache creation time. kmem guarantees that the buffer to be moved has not + * been unmapped by the virtual memory subsystem. Beyond that, it cannot + * guarantee the present whereabouts of the buffer to be moved, so it is up to + * the client to safely determine whether or not it is still using the buffer. + * The client must not free either of the buffers passed to the move callback, + * since kmem wants to free them directly to the slab layer. The client response + * tells kmem which of the two buffers to free: + * + * YES kmem frees the old buffer (the move was successful) + * NO kmem frees the new buffer, marks the slab of the old buffer + * non-reclaimable to avoid bothering the client again + * LATER kmem frees the new buffer, increments slab_later_count + * DONT_KNOW kmem frees the new buffer, searches mags for the old buffer + * DONT_NEED kmem frees both the old buffer and the new buffer + * + * The pending callback argument now being processed contains both of the + * buffers (old and new) passed to the move callback function, the slab of the + * old buffer, and flags related to the move request, such as whether or not the + * system was desperate for memory. + */ +static void +kmem_move_buffer(kmem_move_t *callback) +{ + kmem_cbrc_t response; + kmem_slab_t *sp = callback->kmm_from_slab; + kmem_cache_t *cp = sp->slab_cache; + boolean_t free_on_slab; + + ASSERT(taskq_member(kmem_move_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + ASSERT(KMEM_SLAB_MEMBER(sp, callback->kmm_from_buf)); + + /* + * The number of allocated buffers on the slab may have changed since we + * last checked the slab's reclaimability (when the pending move was + * enqueued), or the client may have responded NO when asked to move + * another buffer on the same slab. 
+ */ + if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) { + KMEM_STAT_ADD(kmem_move_stats.kms_no_longer_reclaimable); + KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY), + kmem_move_stats.kms_notify_no_longer_reclaimable); + kmem_slab_free(cp, callback->kmm_to_buf); + kmem_move_end(cp, callback); + return; + } + + /* + * Hunting magazines is expensive, so we'll wait to do that until the + * client responds KMEM_CBRC_DONT_KNOW. However, checking the slab layer + * is cheap, so we might as well do that here in case we can avoid + * bothering the client. + */ + mutex_enter(&cp->cache_lock); + free_on_slab = (kmem_slab_allocated(cp, sp, + callback->kmm_from_buf) == NULL); + mutex_exit(&cp->cache_lock); + + if (free_on_slab) { + KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_slab); + kmem_slab_free(cp, callback->kmm_to_buf); + kmem_move_end(cp, callback); + return; + } + + if (cp->cache_flags & KMF_BUFTAG) { + /* + * Make kmem_cache_alloc_debug() apply the constructor for us. + */ + if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf, + KM_NOSLEEP, 1, caller()) != 0) { + KMEM_STAT_ADD(kmem_move_stats.kms_alloc_fail); + kmem_move_end(cp, callback); + return; + } + } else if (cp->cache_constructor != NULL && + cp->cache_constructor(callback->kmm_to_buf, cp->cache_private, + KM_NOSLEEP) != 0) { + atomic_add_64(&cp->cache_alloc_fail, 1); + KMEM_STAT_ADD(kmem_move_stats.kms_constructor_fail); + kmem_slab_free(cp, callback->kmm_to_buf); + kmem_move_end(cp, callback); + return; + } + + KMEM_STAT_ADD(kmem_move_stats.kms_callbacks); + KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY), + kmem_move_stats.kms_notify_callbacks); + cp->cache_defrag->kmd_callbacks++; + cp->cache_defrag->kmd_thread = curthread; + cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf; + cp->cache_defrag->kmd_to_buf = callback->kmm_to_buf; + DTRACE_PROBE2(kmem__move__start, kmem_cache_t *, cp, kmem_move_t *, + callback); + + response = cp->cache_move(callback->kmm_from_buf, + callback->kmm_to_buf, cp->cache_bufsize, cp->cache_private); + + DTRACE_PROBE3(kmem__move__end, kmem_cache_t *, cp, kmem_move_t *, + callback, kmem_cbrc_t, response); + cp->cache_defrag->kmd_thread = NULL; + cp->cache_defrag->kmd_from_buf = NULL; + cp->cache_defrag->kmd_to_buf = NULL; + + if (response == KMEM_CBRC_YES) { + KMEM_STAT_ADD(kmem_move_stats.kms_yes); + cp->cache_defrag->kmd_yes++; + kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE); + mutex_enter(&cp->cache_lock); + kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); + mutex_exit(&cp->cache_lock); + kmem_move_end(cp, callback); + return; + } + + switch (response) { + case KMEM_CBRC_NO: + KMEM_STAT_ADD(kmem_move_stats.kms_no); + cp->cache_defrag->kmd_no++; + mutex_enter(&cp->cache_lock); + kmem_slab_move_no(cp, sp, callback->kmm_from_buf); + mutex_exit(&cp->cache_lock); + break; + case KMEM_CBRC_LATER: + KMEM_STAT_ADD(kmem_move_stats.kms_later); + cp->cache_defrag->kmd_later++; + mutex_enter(&cp->cache_lock); + if (!KMEM_SLAB_IS_PARTIAL(sp)) { + mutex_exit(&cp->cache_lock); + break; + } + + if (++sp->slab_later_count >= KMEM_DISBELIEF) { + KMEM_STAT_ADD(kmem_move_stats.kms_disbelief); + kmem_slab_move_no(cp, sp, callback->kmm_from_buf); + } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) { + sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, + callback->kmm_from_buf); + } + mutex_exit(&cp->cache_lock); + break; + case KMEM_CBRC_DONT_NEED: + KMEM_STAT_ADD(kmem_move_stats.kms_dont_need); + cp->cache_defrag->kmd_dont_need++; + kmem_slab_free_constructed(cp, 
callback->kmm_from_buf, B_FALSE); + mutex_enter(&cp->cache_lock); + kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); + mutex_exit(&cp->cache_lock); + break; + case KMEM_CBRC_DONT_KNOW: + KMEM_STAT_ADD(kmem_move_stats.kms_dont_know); + cp->cache_defrag->kmd_dont_know++; + if (kmem_hunt_mags(cp, callback->kmm_from_buf) != NULL) { + KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_mag); + cp->cache_defrag->kmd_hunt_found++; + kmem_slab_free_constructed(cp, callback->kmm_from_buf, + B_TRUE); + mutex_enter(&cp->cache_lock); + kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); + mutex_exit(&cp->cache_lock); + } else { + KMEM_STAT_ADD(kmem_move_stats.kms_hunt_notfound); + } + break; + default: + panic("'%s' (%p) unexpected move callback response %d\n", + cp->cache_name, (void *)cp, response); + } + + kmem_slab_free_constructed(cp, callback->kmm_to_buf, B_FALSE); + kmem_move_end(cp, callback); +} + +/* Return B_FALSE if there is insufficient memory for the move request. */ +static boolean_t +kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) +{ + void *to_buf; + avl_index_t index; + kmem_move_t *callback, *pending; + + ASSERT(taskq_member(kmem_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); + + callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP); + if (callback == NULL) { + KMEM_STAT_ADD(kmem_move_stats.kms_callback_alloc_fail); + return (B_FALSE); + } + + callback->kmm_from_slab = sp; + callback->kmm_from_buf = buf; + callback->kmm_flags = flags; + + mutex_enter(&cp->cache_lock); + + if (avl_numnodes(&cp->cache_partial_slabs) <= 1) { + mutex_exit(&cp->cache_lock); + kmem_cache_free(kmem_move_cache, callback); + return (B_TRUE); /* there is no need for the move request */ + } + + pending = avl_find(&cp->cache_defrag->kmd_moves_pending, buf, &index); + if (pending != NULL) { + /* + * If the move is already pending and we're desperate now, + * update the move flags. + */ + if (flags & KMM_DESPERATE) { + pending->kmm_flags |= KMM_DESPERATE; + } + mutex_exit(&cp->cache_lock); + KMEM_STAT_ADD(kmem_move_stats.kms_already_pending); + kmem_cache_free(kmem_move_cache, callback); + return (B_TRUE); + } + + to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs)); + callback->kmm_to_buf = to_buf; + avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index); + + mutex_exit(&cp->cache_lock); + + if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer, + callback, TQ_NOSLEEP)) { + mutex_enter(&cp->cache_lock); + avl_remove(&cp->cache_defrag->kmd_moves_pending, callback); + mutex_exit(&cp->cache_lock); + kmem_slab_free_constructed(cp, to_buf, B_FALSE); + kmem_cache_free(kmem_move_cache, callback); + return (B_FALSE); + } + + return (B_TRUE); +} + +static void +kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback) +{ + avl_index_t index; + + ASSERT(cp->cache_defrag != NULL); + ASSERT(taskq_member(kmem_move_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + + mutex_enter(&cp->cache_lock); + VERIFY(avl_find(&cp->cache_defrag->kmd_moves_pending, + callback->kmm_from_buf, &index) != NULL); + avl_remove(&cp->cache_defrag->kmd_moves_pending, callback); + if (avl_is_empty(&cp->cache_defrag->kmd_moves_pending)) { + list_t *deadlist = &cp->cache_defrag->kmd_deadlist; + kmem_slab_t *sp; + + /* + * The last pending move completed. Release all slabs from the + * front of the dead list except for any slab at the tail that + * needs to be released from the context of kmem_move_buffers(). 
+ * kmem deferred unmapping the buffers on these slabs in order + * to guarantee that buffers passed to the move callback have + * been touched only by kmem or by the client itself. + */ + while ((sp = list_remove_head(deadlist)) != NULL) { + if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) { + list_insert_tail(deadlist, sp); + break; + } + cp->cache_defrag->kmd_deadcount--; + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed); + mutex_enter(&cp->cache_lock); + } + } + mutex_exit(&cp->cache_lock); + kmem_cache_free(kmem_move_cache, callback); +} + +/* + * Move buffers from least used slabs first by scanning backwards from the end + * of the partial slab list. Scan at most max_scan candidate slabs and move + * buffers from at most max_slabs slabs (0 for all partial slabs in both cases). + * If desperate to reclaim memory, move buffers from any partial slab, otherwise + * skip slabs with a ratio of allocated buffers at or above the current + * threshold. Return the number of unskipped slabs (at most max_slabs, -1 if the + * scan is aborted) so that the caller can adjust the reclaimability threshold + * depending on how many reclaimable slabs it finds. + * + * kmem_move_buffers() drops and reacquires cache_lock every time it issues a + * move request, since it is not valid for kmem_move_begin() to call + * kmem_cache_alloc() or taskq_dispatch() with cache_lock held. + */ +static int +kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs, + int flags) +{ + kmem_slab_t *sp; + void *buf; + int i, j; /* slab index, buffer index */ + int s; /* reclaimable slabs */ + int b; /* allocated (movable) buffers on reclaimable slab */ + boolean_t success; + int refcnt; + int nomove; + + ASSERT(taskq_member(kmem_taskq, curthread)); + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(kmem_move_cache != NULL); + ASSERT(cp->cache_move != NULL && cp->cache_defrag != NULL); + ASSERT(avl_numnodes(&cp->cache_partial_slabs) > 1); + + if (kmem_move_blocked) { + return (0); + } + + if (kmem_move_fulltilt) { + max_slabs = 0; + flags |= KMM_DESPERATE; + } + + if (max_scan == 0 || (flags & KMM_DESPERATE)) { + /* + * Scan as many slabs as needed to find the desired number of + * candidate slabs. + */ + max_scan = (size_t)-1; + } + + if (max_slabs == 0 || (flags & KMM_DESPERATE)) { + /* Find as many candidate slabs as possible. */ + max_slabs = (size_t)-1; + } + + sp = avl_last(&cp->cache_partial_slabs); + ASSERT(sp != NULL && KMEM_SLAB_IS_PARTIAL(sp)); + for (i = 0, s = 0; (i < max_scan) && (s < max_slabs) && + (sp != avl_first(&cp->cache_partial_slabs)); + sp = AVL_PREV(&cp->cache_partial_slabs, sp), i++) { + + if (!kmem_slab_is_reclaimable(cp, sp, flags)) { + continue; + } + s++; + + /* Look for allocated buffers to move. */ + for (j = 0, b = 0, buf = sp->slab_base; + (j < sp->slab_chunks) && (b < sp->slab_refcnt); + buf = (((char *)buf) + cp->cache_chunksize), j++) { + + if (kmem_slab_allocated(cp, sp, buf) == NULL) { + continue; + } + + b++; + + /* + * Prevent the slab from being destroyed while we drop + * cache_lock and while the pending move is not yet + * registered. Flag the pending move while + * kmd_moves_pending may still be empty, since we can't + * yet rely on a non-zero pending move count to prevent + * the slab from being destroyed. 
+ */ + ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING)); + sp->slab_flags |= KMEM_SLAB_MOVE_PENDING; + /* + * Recheck refcnt and nomove after reacquiring the lock, + * since these control the order of partial slabs, and + * we want to know if we can pick up the scan where we + * left off. + */ + refcnt = sp->slab_refcnt; + nomove = (sp->slab_flags & KMEM_SLAB_NOMOVE); + mutex_exit(&cp->cache_lock); + + success = kmem_move_begin(cp, sp, buf, flags); + + /* + * Now, before the lock is reacquired, kmem could + * process all pending move requests and purge the + * deadlist, so that upon reacquiring the lock, sp has + * been remapped. Therefore, the KMEM_SLAB_MOVE_PENDING + * flag causes the slab to be put at the end of the + * deadlist and prevents it from being purged, since we + * plan to destroy it here after reacquiring the lock. + */ + mutex_enter(&cp->cache_lock); + ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); + sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING; + + /* + * Destroy the slab now if it was completely freed while + * we dropped cache_lock. + */ + if (sp->slab_refcnt == 0) { + list_t *deadlist = + &cp->cache_defrag->kmd_deadlist; + + ASSERT(!list_is_empty(deadlist)); + ASSERT(list_link_active((list_node_t *) + &sp->slab_link)); + + list_remove(deadlist, sp); + cp->cache_defrag->kmd_deadcount--; + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + KMEM_STAT_ADD(kmem_move_stats. + kms_dead_slabs_freed); + KMEM_STAT_ADD(kmem_move_stats. + kms_endscan_slab_destroyed); + mutex_enter(&cp->cache_lock); + /* + * Since we can't pick up the scan where we left + * off, abort the scan and say nothing about the + * number of reclaimable slabs. + */ + return (-1); + } + + if (!success) { + /* + * Abort the scan if there is not enough memory + * for the request and say nothing about the + * number of reclaimable slabs. + */ + KMEM_STAT_ADD( + kmem_move_stats.kms_endscan_nomem); + return (-1); + } + + /* + * The slab may have been completely allocated while the + * lock was dropped. + */ + if (KMEM_SLAB_IS_ALL_USED(sp)) { + KMEM_STAT_ADD( + kmem_move_stats.kms_endscan_slab_all_used); + return (-1); + } + + /* + * The slab's position changed while the lock was + * dropped, so we don't know where we are in the + * sequence any more. + */ + if (sp->slab_refcnt != refcnt) { + KMEM_STAT_ADD( + kmem_move_stats.kms_endscan_refcnt_changed); + return (-1); + } + if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) { + KMEM_STAT_ADD( + kmem_move_stats.kms_endscan_nomove_changed); + return (-1); + } + + /* + * Generating a move request allocates a destination + * buffer from the slab layer, bumping the first slab if + * it is completely allocated. 
+ */ + ASSERT(!avl_is_empty(&cp->cache_partial_slabs)); + if (sp == avl_first(&cp->cache_partial_slabs)) { + goto end_scan; + } + } + } +end_scan: + + KMEM_STAT_COND_ADD(sp == avl_first(&cp->cache_partial_slabs), + kmem_move_stats.kms_endscan_freelist); + + return (s); +} + +typedef struct kmem_move_notify_args { + kmem_cache_t *kmna_cache; + void *kmna_buf; +} kmem_move_notify_args_t; + +static void +kmem_cache_move_notify_task(void *arg) +{ + kmem_move_notify_args_t *args = arg; + kmem_cache_t *cp = args->kmna_cache; + void *buf = args->kmna_buf; + kmem_slab_t *sp; + + ASSERT(taskq_member(kmem_taskq, curthread)); + ASSERT(list_link_active(&cp->cache_link)); + + kmem_free(args, sizeof (kmem_move_notify_args_t)); + mutex_enter(&cp->cache_lock); + sp = kmem_slab_allocated(cp, NULL, buf); + + /* Ignore the notification if the buffer is no longer allocated. */ + if (sp == NULL) { + mutex_exit(&cp->cache_lock); + return; + } + + /* Ignore the notification if there's no reason to move the buffer. */ + if (avl_numnodes(&cp->cache_partial_slabs) > 1) { + /* + * So far the notification is not ignored. Ignore the + * notification if the slab is not marked by an earlier refusal + * to move a buffer. + */ + if (!(sp->slab_flags & KMEM_SLAB_NOMOVE) && + (sp->slab_later_count == 0)) { + mutex_exit(&cp->cache_lock); + return; + } + + kmem_slab_move_yes(cp, sp, buf); + ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING)); + sp->slab_flags |= KMEM_SLAB_MOVE_PENDING; + mutex_exit(&cp->cache_lock); + /* see kmem_move_buffers() about dropping the lock */ + (void) kmem_move_begin(cp, sp, buf, KMM_NOTIFY); + mutex_enter(&cp->cache_lock); + ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); + sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING; + if (sp->slab_refcnt == 0) { + list_t *deadlist = &cp->cache_defrag->kmd_deadlist; + + ASSERT(!list_is_empty(deadlist)); + ASSERT(list_link_active((list_node_t *) + &sp->slab_link)); + + list_remove(deadlist, sp); + cp->cache_defrag->kmd_deadcount--; + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed); + return; + } + } else { + kmem_slab_move_yes(cp, sp, buf); + } + mutex_exit(&cp->cache_lock); +} + +void +kmem_cache_move_notify(kmem_cache_t *cp, void *buf) +{ + kmem_move_notify_args_t *args; + + KMEM_STAT_ADD(kmem_move_stats.kms_notify); + args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP); + if (args != NULL) { + args->kmna_cache = cp; + args->kmna_buf = buf; + (void) taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_cache_move_notify_task, args, + TQ_NOSLEEP); + } +} + +static void +kmem_cache_defrag(kmem_cache_t *cp) +{ + size_t n; + + ASSERT(cp->cache_defrag != NULL); + + mutex_enter(&cp->cache_lock); + n = avl_numnodes(&cp->cache_partial_slabs); + if (n > 1) { + /* kmem_move_buffers() drops and reacquires cache_lock */ + (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE); + KMEM_STAT_ADD(kmem_move_stats.kms_defrags); + } + mutex_exit(&cp->cache_lock); +} + +/* Is this cache above the fragmentation threshold? 
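/*
 * Illustrative sketch (not from this changeset): how a client might use
 * kmem_cache_move_notify() above. When an object that previously earned a
 * KMEM_CBRC_LATER (or NO) response is unpinned, the client tips off kmem so
 * the consolidator can retry the move. my_obj_cache and the o_hold counter
 * are hypothetical.
 */
static void
my_obj_unpin(my_obj_t *op)
{
	if (atomic_dec_32_nv(&op->o_hold) == 0)
		kmem_cache_move_notify(my_obj_cache, op);
}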
*/ +static boolean_t +kmem_cache_frag_threshold(kmem_cache_t *cp, uint64_t nfree) +{ + if (avl_numnodes(&cp->cache_partial_slabs) <= 1) + return (B_FALSE); + + /* + * nfree kmem_frag_numer + * ------------------ > --------------- + * cp->cache_buftotal kmem_frag_denom + */ + return ((nfree * kmem_frag_denom) > + (cp->cache_buftotal * kmem_frag_numer)); +} + +static boolean_t +kmem_cache_is_fragmented(kmem_cache_t *cp, boolean_t *doreap) +{ + boolean_t fragmented; + uint64_t nfree; + + ASSERT(MUTEX_HELD(&cp->cache_lock)); + *doreap = B_FALSE; + + if (!kmem_move_fulltilt && ((cp->cache_complete_slab_count + + avl_numnodes(&cp->cache_partial_slabs)) < kmem_frag_minslabs)) + return (B_FALSE); + + nfree = cp->cache_bufslab; + fragmented = kmem_cache_frag_threshold(cp, nfree); + /* + * Free buffers in the magazine layer appear allocated from the point of + * view of the slab layer. We want to know if the slab layer would + * appear fragmented if we included free buffers from magazines that + * have fallen out of the working set. + */ + if (!fragmented) { + long reap; + + mutex_enter(&cp->cache_depot_lock); + reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min); + reap = MIN(reap, cp->cache_full.ml_total); + mutex_exit(&cp->cache_depot_lock); + + nfree += ((uint64_t)reap * cp->cache_magtype->mt_magsize); + if (kmem_cache_frag_threshold(cp, nfree)) { + *doreap = B_TRUE; + } + } + + return (fragmented); +} + +/* Called periodically from kmem_taskq */ +static void +kmem_cache_scan(kmem_cache_t *cp) +{ + boolean_t reap = B_FALSE; + + ASSERT(taskq_member(kmem_taskq, curthread)); + ASSERT(cp->cache_defrag != NULL); + + mutex_enter(&cp->cache_lock); + + if (kmem_cache_is_fragmented(cp, &reap)) { + kmem_defrag_t *kmd = cp->cache_defrag; + size_t slabs_found; + + /* + * Consolidate reclaimable slabs from the end of the partial + * slab list (scan at most kmem_reclaim_scan_range slabs to find + * reclaimable slabs). Keep track of how many candidate slabs we + * looked for and how many we actually found so we can adjust + * the definition of a candidate slab if we're having trouble + * finding them. + * + * kmem_move_buffers() drops and reacquires cache_lock. + */ + slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range, + kmem_reclaim_max_slabs, 0); + if (slabs_found >= 0) { + kmd->kmd_slabs_sought += kmem_reclaim_max_slabs; + kmd->kmd_slabs_found += slabs_found; + } + + if (++kmd->kmd_scans >= kmem_reclaim_scan_range) { + kmd->kmd_scans = 0; + + /* + * If we had difficulty finding candidate slabs in + * previous scans, adjust the threshold so that + * candidates are easier to find. + */ + if (kmd->kmd_slabs_found == kmd->kmd_slabs_sought) { + kmem_adjust_reclaim_threshold(kmd, -1); + } else if ((kmd->kmd_slabs_found * 2) < + kmd->kmd_slabs_sought) { + kmem_adjust_reclaim_threshold(kmd, 1); + } + kmd->kmd_slabs_sought = 0; + kmd->kmd_slabs_found = 0; + } + } else { + kmem_reset_reclaim_threshold(cp->cache_defrag); +#ifdef DEBUG + if (avl_numnodes(&cp->cache_partial_slabs) > 1) { + /* + * In a debug kernel we want the consolidator to + * run occasionally even when there is plenty of + * memory. 
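/*
 * Illustrative worked example (not from this changeset) of the test in
 * kmem_cache_frag_threshold() above, assuming purely for illustration that
 * the tunables are kmem_frag_numer = 1 and kmem_frag_denom = 8, i.e. the
 * cache counts as fragmented when more than 1/8 of its buffers are free in
 * the slab layer:
 *
 *	cache_buftotal = 1000, nfree = 150:
 *	    150 * 8 = 1200 > 1000 * 1 = 1000	=> B_TRUE (fragmented)
 *	cache_buftotal = 1000, nfree = 100:
 *	    100 * 8 =  800 <= 1000 * 1 = 1000	=> B_FALSE
 */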
+ */ + uint32_t debug_rand; + + (void) random_get_bytes((uint8_t *)&debug_rand, 4); + if (!kmem_move_noreap && + ((debug_rand % kmem_mtb_reap) == 0)) { + mutex_exit(&cp->cache_lock); + kmem_cache_reap(cp); + KMEM_STAT_ADD(kmem_move_stats.kms_debug_reaps); + return; + } else if ((debug_rand % kmem_mtb_move) == 0) { + (void) kmem_move_buffers(cp, + kmem_reclaim_scan_range, 1, 0); + KMEM_STAT_ADD(kmem_move_stats. + kms_debug_move_scans); + } + } +#endif /* DEBUG */ + } + + mutex_exit(&cp->cache_lock); + + if (reap) { + KMEM_STAT_ADD(kmem_move_stats.kms_scan_depot_ws_reaps); + kmem_depot_ws_reap(cp); + } +} diff --git a/usr/src/uts/common/os/list.c b/usr/src/uts/common/os/list.c index 0288c580e8..e8db13a5cf 100644 --- a/usr/src/uts/common/os/list.c +++ b/usr/src/uts/common/os/list.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,20 +41,25 @@ #define list_insert_after_node(list, node, object) { \ list_node_t *lnew = list_d2l(list, object); \ - lnew->list_prev = node; \ - lnew->list_next = node->list_next; \ - node->list_next->list_prev = lnew; \ - node->list_next = lnew; \ + lnew->list_prev = (node); \ + lnew->list_next = (node)->list_next; \ + (node)->list_next->list_prev = lnew; \ + (node)->list_next = lnew; \ } #define list_insert_before_node(list, node, object) { \ list_node_t *lnew = list_d2l(list, object); \ - lnew->list_next = node; \ - lnew->list_prev = node->list_prev; \ - node->list_prev->list_next = lnew; \ - node->list_prev = lnew; \ + lnew->list_next = (node); \ + lnew->list_prev = (node)->list_prev; \ + (node)->list_prev->list_next = lnew; \ + (node)->list_prev = lnew; \ } +#define list_remove_node(node) \ + (node)->list_prev->list_next = (node)->list_next; \ + (node)->list_next->list_prev = (node)->list_prev; \ + (node)->list_next = (node)->list_prev = NULL + void list_create(list_t *list, size_t size, size_t offset) { @@ -83,15 +88,23 @@ list_destroy(list_t *list) void list_insert_after(list_t *list, void *object, void *nobject) { - list_node_t *lold = list_d2l(list, object); - list_insert_after_node(list, lold, nobject); + if (object == NULL) { + list_insert_head(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_after_node(list, lold, nobject); + } } void list_insert_before(list_t *list, void *object, void *nobject) { - list_node_t *lold = list_d2l(list, object); - list_insert_before_node(list, lold, nobject) + if (object == NULL) { + list_insert_tail(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_before_node(list, lold, nobject); + } } void @@ -114,9 +127,27 @@ list_remove(list_t *list, void *object) list_node_t *lold = list_d2l(list, object); ASSERT(!list_empty(list)); ASSERT(lold->list_next != NULL); - lold->list_prev->list_next = lold->list_next; - lold->list_next->list_prev = lold->list_prev; - lold->list_next = lold->list_prev = NULL; + list_remove_node(lold); +} + +void * +list_remove_head(list_t *list) +{ + list_node_t *head = list->list_head.list_next; + if (head == &list->list_head) + return (NULL); + list_remove_node(head); + return (list_object(list, head)); +} + +void * +list_remove_tail(list_t *list) +{ + list_node_t *tail = list->list_head.list_prev; + if (tail == &list->list_head) + return (NULL); + list_remove_node(tail); + return (list_object(list, tail)); } void * @@ -181,6 +212,26 @@ list_move_tail(list_t *dst, list_t 
*src) srcnode->list_next = srcnode->list_prev = srcnode; } +void +list_link_replace(list_node_t *lold, list_node_t *lnew) +{ + ASSERT(list_link_active(lold)); + ASSERT(!list_link_active(lnew)); + + lnew->list_next = lold->list_next; + lnew->list_prev = lold->list_prev; + lold->list_prev->list_next = lnew; + lold->list_next->list_prev = lnew; + lold->list_next = lold->list_prev = NULL; +} + +void +list_link_init(list_node_t *link) +{ + link->list_next = NULL; + link->list_prev = NULL; +} + int list_link_active(list_node_t *link) { diff --git a/usr/src/uts/common/os/mutex.c b/usr/src/uts/common/os/mutex.c index a6a19c869e..f812aeb6bc 100644 --- a/usr/src/uts/common/os/mutex.c +++ b/usr/src/uts/common/os/mutex.c @@ -529,9 +529,9 @@ mutex_vector_exit(mutex_impl_t *lp) } int -mutex_owned(kmutex_t *mp) +mutex_owned(const kmutex_t *mp) { - mutex_impl_t *lp = (mutex_impl_t *)mp; + const mutex_impl_t *lp = (const mutex_impl_t *)mp; if (panicstr) return (1); @@ -542,9 +542,9 @@ mutex_owned(kmutex_t *mp) } kthread_t * -mutex_owner(kmutex_t *mp) +mutex_owner(const kmutex_t *mp) { - mutex_impl_t *lp = (mutex_impl_t *)mp; + const mutex_impl_t *lp = (const mutex_impl_t *)mp; kthread_id_t t; if (MUTEX_TYPE_ADAPTIVE(lp) && (t = MUTEX_OWNER(lp)) != MUTEX_NO_OWNER) diff --git a/usr/src/uts/common/sys/avl.h b/usr/src/uts/common/sys/avl.h index bf9af8948a..02263a5a0c 100644 --- a/usr/src/uts/common/sys/avl.h +++ b/usr/src/uts/common/sys/avl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,6 +37,7 @@ extern "C" { #endif +#include <sys/types.h> #include <sys/avl_impl.h> /* @@ -128,7 +128,6 @@ typedef uintptr_t avl_index_t; #define AVL_AFTER (1) - /* * Prototypes * @@ -182,7 +181,7 @@ extern void avl_insert(avl_tree_t *tree, void *node, avl_index_t where); * data to avoid doing avl_find() again for insertion. * * new_data - new data to insert - * here - existing node in "tree" + * here - existing node in "tree" * direction - either AVL_AFTER or AVL_BEFORE the data "here". */ extern void avl_insert_here(avl_tree_t *tree, void *new_data, void *here, @@ -251,12 +250,26 @@ extern void avl_add(avl_tree_t *tree, void *node); */ extern void avl_remove(avl_tree_t *tree, void *node); +/* + * Reinsert a node only if its order has changed relative to its nearest + * neighbors. To optimize performance avl_update_lt() checks only the previous + * node and avl_update_gt() checks only the next node. Use avl_update_lt() and + * avl_update_gt() only if you know the direction in which the order of the + * node may change. + */ +extern boolean_t avl_update(avl_tree_t *, void *); +extern boolean_t avl_update_lt(avl_tree_t *, void *); +extern boolean_t avl_update_gt(avl_tree_t *, void *); /* * Return the number of nodes in the tree */ extern ulong_t avl_numnodes(avl_tree_t *tree); +/* + * Return B_TRUE if there are zero nodes in the tree, B_FALSE otherwise. 
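/*
 * Illustrative sketch (not from this changeset): typical use of the
 * avl_update() family declared above. After modifying the key field that an
 * AVL tree is sorted on, the node is reinserted only if its order relative
 * to its neighbors actually changed. my_node_t and n_weight are
 * hypothetical; the tree is assumed to be sorted by n_weight.
 */
static void
my_node_set_weight(avl_tree_t *tree, my_node_t *node, uint64_t weight)
{
	node->n_weight = weight;

	/*
	 * If the weight were known to only increase, avl_update_gt() would
	 * need to compare against the next node only.
	 */
	(void) avl_update(tree, node);
}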
+ */ +extern boolean_t avl_is_empty(avl_tree_t *tree); /* * Used to destroy any remaining nodes in a tree. The cookie argument should diff --git a/usr/src/uts/common/sys/kmem.h b/usr/src/uts/common/sys/kmem.h index 097e92f2e5..7ed4c84005 100644 --- a/usr/src/uts/common/sys/kmem.h +++ b/usr/src/uts/common/sys/kmem.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -82,6 +82,15 @@ struct kmem_cache; /* cache structure is opaque to kmem clients */ typedef struct kmem_cache kmem_cache_t; +/* Client response to kmem move callback */ +typedef enum kmem_cbrc { + KMEM_CBRC_YES, + KMEM_CBRC_NO, + KMEM_CBRC_LATER, + KMEM_CBRC_DONT_NEED, + KMEM_CBRC_DONT_KNOW +} kmem_cbrc_t; + #ifdef _KERNEL extern int kmem_ready; @@ -99,11 +108,14 @@ extern size_t kmem_maxavail(void); extern kmem_cache_t *kmem_cache_create(char *, size_t, size_t, int (*)(void *, void *, int), void (*)(void *, void *), void (*)(void *), void *, vmem_t *, int); +extern void kmem_cache_set_move(kmem_cache_t *, + kmem_cbrc_t (*)(void *, void *, size_t, void *)); extern void kmem_cache_destroy(kmem_cache_t *); extern void *kmem_cache_alloc(kmem_cache_t *, int); extern void kmem_cache_free(kmem_cache_t *, void *); extern uint64_t kmem_cache_stat(kmem_cache_t *, char *); extern void kmem_cache_reap_now(kmem_cache_t *); +extern void kmem_cache_move_notify(kmem_cache_t *, void *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/kmem_impl.h b/usr/src/uts/common/sys/kmem_impl.h index 01d15640bb..96b88261b7 100644 --- a/usr/src/uts/common/sys/kmem_impl.h +++ b/usr/src/uts/common/sys/kmem_impl.h @@ -38,6 +38,8 @@ #include <sys/cpuvar.h> #include <sys/systm.h> #include <vm/page.h> +#include <sys/avl.h> +#include <sys/list.h> #ifdef __cplusplus extern "C" { @@ -45,6 +47,14 @@ extern "C" { /* * kernel memory allocator: implementation-private data structures + * + * Lock order: + * 1. cache_lock + * 2. cc_lock in order by CPU ID + * 3. cache_depot_lock + * + * Do not call kmem_cache_alloc() or taskq_dispatch() while holding any of the + * above locks. 
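/*
 * Illustrative sketch (not from this changeset): registering a move callback
 * with the new kmem_cache_set_move() interface declared above. my_obj_t,
 * my_obj_constructor, my_obj_destructor, and my_obj_move (sketched earlier)
 * are hypothetical client code.
 */
static kmem_cache_t *my_obj_cache;

void
my_obj_cache_init(void)
{
	my_obj_cache = kmem_cache_create("my_obj_cache", sizeof (my_obj_t), 0,
	    my_obj_constructor, my_obj_destructor, NULL, NULL, NULL, 0);
	kmem_cache_set_move(my_obj_cache, my_obj_move);
}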
*/ #define KMF_AUDIT 0x00000001 /* transaction auditing */ @@ -82,6 +92,16 @@ extern "C" { #define KMEM_SIZE_DECODE(x) ((x) / 251) #define KMEM_SIZE_VALID(x) ((x) % 251 == 1 && (x) != 1) + +#define KMEM_ALIGN 8 /* min guaranteed alignment */ +#define KMEM_ALIGN_SHIFT 3 /* log2(KMEM_ALIGN) */ +#define KMEM_VOID_FRACTION 8 /* never waste more than 1/8 of slab */ + +#define KMEM_SLAB_IS_PARTIAL(sp) \ + ((sp)->slab_refcnt > 0 && (sp)->slab_refcnt < (sp)->slab_chunks) +#define KMEM_SLAB_IS_ALL_USED(sp) \ + ((sp)->slab_refcnt == (sp)->slab_chunks) + /* * The bufctl (buffer control) structure keeps some minimal information * about each buffer: its address, its slab, and its current linkage, @@ -159,21 +179,32 @@ typedef struct kmem_buftag_lite { (((kmem_slab_t *)P2END((uintptr_t)(mp), PAGESIZE) - 1)->slab_cache == \ (cp)->cache_magtype->mt_cache) +#define KMEM_SLAB_OFFSET(sp, buf) \ + ((size_t)((uintptr_t)(buf) - (uintptr_t)((sp)->slab_base))) + #define KMEM_SLAB_MEMBER(sp, buf) \ - ((size_t)(buf) - (size_t)(sp)->slab_base < \ - (sp)->slab_cache->cache_slabsize) + (KMEM_SLAB_OFFSET(sp, buf) < (sp)->slab_cache->cache_slabsize) #define KMEM_BUFTAG_ALLOC 0xa110c8edUL #define KMEM_BUFTAG_FREE 0xf4eef4eeUL +/* slab_later_count thresholds */ +#define KMEM_DISBELIEF 3 + +/* slab_flags */ +#define KMEM_SLAB_NOMOVE 0x1 +#define KMEM_SLAB_MOVE_PENDING 0x2 + typedef struct kmem_slab { struct kmem_cache *slab_cache; /* controlling cache */ void *slab_base; /* base of allocated memory */ - struct kmem_slab *slab_next; /* next slab on freelist */ - struct kmem_slab *slab_prev; /* prev slab on freelist */ + avl_node_t slab_link; /* slab linkage */ struct kmem_bufctl *slab_head; /* first free buffer */ long slab_refcnt; /* outstanding allocations */ long slab_chunks; /* chunks (bufs) in this slab */ + uint32_t slab_stuck_offset; /* unmoved buffer offset */ + uint16_t slab_later_count; /* cf KMEM_CBRC_LATER */ + uint16_t slab_flags; /* bits to mark the slab */ } kmem_slab_t; #define KMEM_HASH_INITIAL 64 @@ -228,6 +259,38 @@ typedef struct kmem_maglist { uint64_t ml_alloc; /* allocations from this list */ } kmem_maglist_t; +typedef struct kmem_defrag { + /* + * Statistics + */ + uint64_t kmd_callbacks; /* move callbacks */ + uint64_t kmd_yes; /* KMEM_CBRC_YES responses */ + uint64_t kmd_no; /* NO responses */ + uint64_t kmd_later; /* LATER responses */ + uint64_t kmd_dont_need; /* DONT_NEED responses */ + uint64_t kmd_dont_know; /* DONT_KNOW responses */ + uint64_t kmd_hunt_found; /* DONT_KNOW: # found in mag */ + + /* + * Consolidator fields + */ + avl_tree_t kmd_moves_pending; /* buffer moves pending */ + list_t kmd_deadlist; /* deferred slab frees */ + size_t kmd_deadcount; /* # of slabs in kmd_deadlist */ + uint8_t kmd_reclaim_numer; /* slab usage threshold */ + uint8_t kmd_pad1; /* compiler padding */ + size_t kmd_slabs_sought; /* reclaimable slabs sought */ + size_t kmd_slabs_found; /* reclaimable slabs found */ + size_t kmd_scans; /* nth scan interval counter */ + /* + * Fields used to ASSERT that the client does not kmem_cache_free() + * objects passed to the move callback. 
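/*
 * Illustrative sketch (not from this changeset): the three fields above
 * permit a check of roughly this shape in the free path (the exact placement
 * in the allocator is outside this hunk) -- the thread currently running the
 * move callback must not free either buffer passed to it. The helper name is
 * hypothetical.
 */
static void
my_assert_not_move_buf(kmem_cache_t *cp, void *buf)
{
	kmem_defrag_t *kmd = cp->cache_defrag;

	ASSERT(kmd == NULL || kmd->kmd_thread != curthread ||
	    (buf != kmd->kmd_from_buf && buf != kmd->kmd_to_buf));
}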
+ */ + void *kmd_from_buf; /* object to move */ + void *kmd_to_buf; /* move destination */ + kthread_t *kmd_thread; /* thread calling move */ +} kmem_defrag_t; + #define KMEM_CACHE_NAMELEN 31 struct kmem_cache { @@ -256,15 +319,15 @@ struct kmem_cache { int (*cache_constructor)(void *, void *, int); void (*cache_destructor)(void *, void *); void (*cache_reclaim)(void *); + kmem_cbrc_t (*cache_move)(void *, void *, size_t, void *); void *cache_private; /* opaque arg to callbacks */ vmem_t *cache_arena; /* vmem source for slabs */ int cache_cflags; /* cache creation flags */ int cache_flags; /* various cache state info */ uint32_t cache_mtbf; /* induced alloc failure rate */ - uint32_t cache_pad1; /* to align cache_lock */ + uint32_t cache_pad1; /* compiler padding */ kstat_t *cache_kstat; /* exported statistics */ - kmem_cache_t *cache_next; /* forward cache linkage */ - kmem_cache_t *cache_prev; /* backward cache linkage */ + list_node_t cache_link; /* cache linkage */ /* * Slab layer @@ -272,6 +335,7 @@ struct kmem_cache { kmutex_t cache_lock; /* protects slab layer */ size_t cache_chunksize; /* buf + alignment [+ debug] */ size_t cache_slabsize; /* size of a slab */ + size_t cache_maxchunks; /* max buffers per slab */ size_t cache_bufctl; /* buf-to-bufctl distance */ size_t cache_buftag; /* buf-to-buftag distance */ size_t cache_verify; /* bytes to verify */ @@ -281,18 +345,19 @@ struct kmem_cache { size_t cache_maxcolor; /* maximum slab color */ size_t cache_hash_shift; /* get to interesting bits */ size_t cache_hash_mask; /* hash table mask */ - kmem_slab_t *cache_freelist; /* slab free list */ - kmem_slab_t cache_nullslab; /* end of freelist marker */ + list_t cache_complete_slabs; /* completely allocated slabs */ + size_t cache_complete_slab_count; + avl_tree_t cache_partial_slabs; /* partial slab freelist */ + size_t cache_partial_binshift; /* for AVL sort bins */ kmem_cache_t *cache_bufctl_cache; /* source of bufctls */ kmem_bufctl_t **cache_hash_table; /* hash table base */ - void *cache_pad2; /* to align depot_lock */ + kmem_defrag_t *cache_defrag; /* slab consolidator fields */ /* * Depot layer */ kmutex_t cache_depot_lock; /* protects depot */ kmem_magtype_t *cache_magtype; /* magazine type */ - void *cache_pad3; /* to align cache_cpu */ kmem_maglist_t cache_full; /* full magazines */ kmem_maglist_t cache_empty; /* empty magazines */ @@ -324,9 +389,24 @@ typedef struct kmem_log_header { kmem_cpu_log_header_t lh_cpu[1]; /* ncpus actually allocated */ } kmem_log_header_t; -#define KMEM_ALIGN 8 /* min guaranteed alignment */ -#define KMEM_ALIGN_SHIFT 3 /* log2(KMEM_ALIGN) */ -#define KMEM_VOID_FRACTION 8 /* never waste more than 1/8 of slab */ +/* kmem_move kmm_flags */ +#define KMM_DESPERATE 0x1 +#define KMM_NOTIFY 0x2 + +typedef struct kmem_move { + kmem_slab_t *kmm_from_slab; + void *kmm_from_buf; + void *kmm_to_buf; + avl_node_t kmm_entry; + int kmm_flags; +} kmem_move_t; + +/* + * In order to consolidate partial slabs, it must be possible for the cache to + * have partial slabs. + */ +#define KMEM_IS_MOVABLE(cp) \ + (((cp)->cache_chunksize * 2) <= (cp)->cache_slabsize) #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/list.h b/usr/src/uts/common/sys/list.h index 7e9d9aaaf7..8339b6226d 100644 --- a/usr/src/uts/common/sys/list.h +++ b/usr/src/uts/common/sys/list.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). 
You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -46,15 +45,20 @@ void list_insert_before(list_t *, void *, void *); void list_insert_head(list_t *, void *); void list_insert_tail(list_t *, void *); void list_remove(list_t *, void *); +void *list_remove_head(list_t *); +void *list_remove_tail(list_t *); void list_move_tail(list_t *, list_t *); void *list_head(list_t *); void *list_tail(list_t *); void *list_next(list_t *, void *); void *list_prev(list_t *, void *); +int list_is_empty(list_t *); + +void list_link_init(list_node_t *); +void list_link_replace(list_node_t *, list_node_t *); int list_link_active(list_node_t *); -int list_is_empty(list_t *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/mutex.h b/usr/src/uts/common/sys/mutex.h index 7cd1b5b72a..b271efced3 100644 --- a/usr/src/uts/common/sys/mutex.h +++ b/usr/src/uts/common/sys/mutex.h @@ -81,8 +81,8 @@ extern void mutex_destroy(kmutex_t *); extern void mutex_enter(kmutex_t *); extern int mutex_tryenter(kmutex_t *); extern void mutex_exit(kmutex_t *); -extern int mutex_owned(kmutex_t *); -extern struct _kthread *mutex_owner(kmutex_t *); +extern int mutex_owned(const kmutex_t *); +extern struct _kthread *mutex_owner(const kmutex_t *); extern ushort_t mutex_backoff_base; extern uint_t mutex_backoff_cap; diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h index 6381fd9852..c5e458513c 100644 --- a/usr/src/uts/common/sys/vnode.h +++ b/usr/src/uts/common/sys/vnode.h @@ -239,10 +239,6 @@ typedef struct vnode { struct vfs *v_vfsmountedhere; /* ptr to vfs mounted here */ struct vnodeops *v_op; /* vnode operations */ struct page *v_pages; /* vnode pages list */ - pgcnt_t v_npages; /* # pages on this vnode */ - pgcnt_t v_msnpages; /* # pages charged to v_mset */ - struct page *v_scanfront; /* scanner front hand */ - struct page *v_scanback; /* scanner back hand */ struct filock *v_filocks; /* ptr to filock list */ struct shrlocklist *v_shrlocks; /* ptr to shrlock list */ krwlock_t v_nbllock; /* sync for NBMAND locks */ @@ -255,15 +251,10 @@ typedef struct vnode { u_longlong_t v_mmap_read; /* mmap read count */ u_longlong_t v_mmap_write; /* mmap write count */ void *v_mpssdata; /* info for large page mappings */ - hrtime_t v_scantime; /* last time this vnode was scanned */ - ushort_t v_mset; /* memory set ID */ - uint_t v_msflags; /* memory set flags */ - struct vnode *v_msnext; /* list of vnodes on an mset */ - struct vnode *v_msprev; /* list of vnodes on an mset */ - krwlock_t v_mslock; /* protects v_mset */ void *v_fopdata; /* list of file ops event watches */ struct vsd_node *v_vsd; /* vnode specific data */ struct vnode *v_xattrdir; /* unnamed extended attr dir (GFS) */ + uint_t v_count_dnlc; /* dnlc reference count */ } vnode_t; #define IS_DEVVP(vp) \ @@ -1187,6 +1178,8 @@ int vn_ismntpt(vnode_t *); struct vfs *vn_mountedvfs(vnode_t *); +int vn_in_dnlc(vnode_t *); + void vn_create_cache(void); void vn_destroy_cache(void); @@ -1207,6 +1200,7 @@ int vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, ssize_t len, offset_t 
offset, enum uio_seg seg, int ioflag, rlim64_t ulimit, cred_t *cr, ssize_t *residp); void vn_rele(struct vnode *vp); +void vn_rele_dnlc(struct vnode *vp); void vn_rele_stream(struct vnode *vp); int vn_link(char *from, char *to, enum uio_seg seg); int vn_rename(char *from, char *to, enum uio_seg seg); |
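/*
 * Illustrative sketch (not from this changeset): one assumed use of the new
 * DNLC reference count declared above. A filesystem noticing that the name
 * cache holds the only remaining reference to a vnode can purge those
 * entries explicitly; dnlc_purge_vp() is expected to drop the DNLC's holds
 * via vn_rele_dnlc(). Locking around v_count is omitted and the exact
 * semantics are assumptions, not taken from this diff.
 */
static void
my_fs_try_purge(vnode_t *vp)
{
	if (vp->v_count == 1 && vn_in_dnlc(vp))
		dnlc_purge_vp(vp);
}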