6778289 vm locks need to scale with the size of system (strands/memory size)

author: Peter Rival <Frank.Rival@oracle.com> 2010-04-23 13:26:05 -0400
committer: Peter Rival <Frank.Rival@oracle.com> 2010-04-23 13:26:05 -0400
commit: cb15d5d96b3b2730714c28bfe06cfe7421758b8c (patch)
tree: 7fd5c3cf5bb49647be8b2eb022e8d75a7d78eab5
parent: 03c76a6ef5c04e818b6badeeb6155961505af45c (diff)
download: illumos-joyent-cb15d5d96b3b2730714c28bfe06cfe7421758b8c.tar.gz
21 files changed, 265 insertions, 117 deletions
diff --git a/usr/src/cmd/mdb/common/modules/mdb_ks/mdb_ks.c b/usr/src/cmd/mdb/common/modules/mdb_ks/mdb_ks.c
index c400b16ef3..ea9e747ba7 100644
--- a/usr/src/cmd/mdb/common/modules/mdb_ks/mdb_ks.c
+++ b/usr/src/cmd/mdb/common/modules/mdb_ks/mdb_ks.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -53,6 +52,7 @@
 #include <sys/cpuvar.h>
 #include <sys/dlpi.h>
 #include <sys/clock_impl.h>
+#include <sys/swap.h>
 #include <errno.h>
 
 #include <vm/seg_vn.h>
@@ -628,9 +628,11 @@ uintptr_t
 mdb_page_lookup(uintptr_t vp, u_offset_t offset)
 {
 	long page_hashsz, ndx;
+	int page_hashsz_shift;	/* Needed for PAGE_HASH_FUNC */
 	uintptr_t page_hash, pp;
 
 	if (mdb_readvar(&page_hashsz, "page_hashsz") == -1 ||
+	    mdb_readvar(&page_hashsz_shift, "page_hashsz_shift") == -1 ||
 	    mdb_readvar(&page_hash, "page_hash") == -1)
 		return (NULL);
 
diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c
index bb0cb4de1e..f13030b4cb 100644
--- a/usr/src/uts/common/conf/param.c
+++ b/usr/src/uts/common/conf/param.c
@@ -114,6 +114,7 @@ const unsigned int	_maxslp		= (unsigned int)MAXSLP;
 const unsigned long	_maxhandspreadpages = (unsigned long)MAXHANDSPREADPAGES;
 const int		_ncpu 		= (int)NCPU;
 const int		_ncpu_log2	= (int)NCPU_LOG2;
+const int		_ncpu_p2	= (int)NCPU_P2;
 const unsigned long	_defaultstksz	= (unsigned long)DEFAULTSTKSZ;
 const unsigned int	_nbpg		= (unsigned int)MMU_PAGESIZE;
 
diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c
index f1a69479b8..8ba373ad19 100644
--- a/usr/src/uts/common/fs/vnode.c
+++ b/usr/src/uts/common/fs/vnode.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
@@ -2284,8 +2283,11 @@ vn_cache_destructor(void *buf, void *cdrarg)
 void
 vn_create_cache(void)
 {
-	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode), 64,
-	    vn_cache_constructor, vn_cache_destructor, NULL, NULL,
+	/* LINTED */
+	ASSERT((1 << VNODE_ALIGN_LOG2) ==
+	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
+	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
+	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
 	    NULL, 0);
 }
 
diff --git a/usr/src/uts/common/sys/mutex.h b/usr/src/uts/common/sys/mutex.h
index 5c11b4afe0..db34243dcc 100644
--- a/usr/src/uts/common/sys/mutex.h
+++ b/usr/src/uts/common/sys/mutex.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_MUTEX_H
@@ -71,6 +70,18 @@ typedef struct mutex {
 
 #ifdef _KERNEL
 
+/*
+ * A mutex padded to a 64 byte cache line (pad present only under _LP64).  Use
+ * when false sharing is an issue but beware of the extra memory it uses.
+ * Consumers may want to align their pad_mutex_t's to a cache line boundary too.
+ */
+typedef struct pad_mutex {
+	kmutex_t	pad_mutex;
+#ifdef _LP64
+	char		pad_pad[64 - sizeof (kmutex_t)];
+#endif
+} pad_mutex_t;
+
 #define	MUTEX_HELD(x)		(mutex_owned(x))
 #define	MUTEX_NOT_HELD(x)	(!mutex_owned(x) || panicstr || quiesce_active)
 
diff --git a/usr/src/uts/common/sys/param.h b/usr/src/uts/common/sys/param.h
index 40af8ce04d..e795131fe4 100644
--- a/usr/src/uts/common/sys/param.h
+++ b/usr/src/uts/common/sys/param.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
@@ -380,6 +380,7 @@ extern const unsigned long _defaultstksz;
 extern const unsigned int _nbpg;
 extern const int _ncpu;
 extern const int _ncpu_log2;
+extern const int _ncpu_p2;
 extern const int _clsize;
 #endif	/* defined(_KERNEL) && !defined(_ASM) */
 
@@ -399,6 +400,7 @@ extern const int _clsize;
 #define	DEFAULTSTKSZ	_defaultstksz
 #define	NCPU		_ncpu
 #define	NCPU_LOG2	_ncpu_log2
+#define	NCPU_P2		_ncpu_p2
 
 #endif	/* defined(_MACHDEP) */
 
diff --git a/usr/src/uts/common/sys/swap.h b/usr/src/uts/common/sys/swap.h
index 9bba487ec1..bfe5fe3349 100644
--- a/usr/src/uts/common/sys/swap.h
+++ b/usr/src/uts/common/sys/swap.h
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1987, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
@@ -40,8 +38,6 @@
 #ifndef	_SYS_SWAP_H
 #define	_SYS_SWAP_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/isa_defs.h>
 #include <sys/feature_tests.h>
 #include <vm/anon.h>
@@ -159,23 +155,17 @@ struct	swapinfo {
 /*
  * Stuff to convert an anon slot pointer to a page name.
  * Because the address of the slot (ap) is a unique identifier, we
- * use it to generate a unique (vp,off), as shown below.
- *
- *  	|<-- 11 bits -->|<------32 - 11 --------->|
- *	   vp index bits	off bits
+ * use it to generate a unique (vp,off), as shown in the comment for
+ * swap_alloc().
  *
  * The off bits are shifted PAGESHIFT to directly form a page aligned
  * offset; the vp index bits map 1-1 to a vnode.
  *
- * Note: if we go to 64 bit offsets, we could use all the bits as the
- * unique offset and just have one vnode.
  */
-#define	AN_OFFSHIFT	11			/* vnum # bits */
-#define	AN_VPSHIFT	21 			/* 32 - 11 */
-#define	AN_VPSIZEMASK	0x7FF			/* vp index mask */
-#define	MAX_SWAP_VNODES	2048			/* 1 << AN_OFFSHIFT */
-#define	AN_CACHE_ALIGN	16			/* anon address aligned */
-						/* 16 bytes */
+#define	MAX_SWAP_VNODES_LOG2	11		/* log2(MAX_SWAP_VNODES) */
+#define	MAX_SWAP_VNODES	(1U << MAX_SWAP_VNODES_LOG2)	/* max # swap vnodes */
+#define	AN_VPMASK	(MAX_SWAP_VNODES - 1)	/* vp index mask */
+#define	AN_VPSHIFT	MAX_SWAP_VNODES_LOG2
 /*
  * Convert from an anon slot to associated vnode and offset.
  */
@@ -189,24 +179,24 @@ struct	swapinfo {
 /*
  * Get a vnode name for an anon slot.
  * The vnum, offset are derived from anon struct address which is
- * 16 bytes aligned. To get swap offset the anon address is shifted
- * by additional 11 bits which yields 32K aligned swap offset
- * (11 bits plus 4 bits alignment).
- * The vnum (vnode index) is created from bits 31-21.
- * The 64 bit swap offset is created from bits 63-32 and 20-4.
- * The 32 bit offset is created from bits 20-4.
+ * 16 bytes aligned.  anon structs may be kmem_cache_alloc'd concurrently by
+ * multiple threads and come from a small range of addresses (same slab), in
+ * which case high order AP bits do not vary much, so choose vnum from low
+ * order bits which vary the most.  Different threads will thus get different
+ * vnums and vnodes, which avoids vph_mutex contention on the subsequent
+ * page_hashin().
  *
- * +-----------...----------+--------+-----------------------+----+
- * |        swap offset     |  vnum  |       swap offset     |0000|
- * +-----------...----------+--------+-----------------------+----+
- *  63	                  32 31    21 20	            4 3  0
+ * +-----------...-------------------+-----------------------+----+
+ * |        swap offset              |           vnum        |0000|
+ * +-----------...-------------------+-----------------------+----+
+ *  63                             15 14                    4 3   0
  */
 #define	swap_alloc(AP)							\
 {									\
-	(AP)->an_vp = swapfs_getvp(((uintptr_t)(AP) >> AN_VPSHIFT)	\
-	    & AN_VPSIZEMASK); 						\
-	(AP)->an_off = (anoff_t)(((uintptr_t)(AP) & ~(uintptr_t)0xFFFFFFFF) \
-	    | (((uintptr_t)(AP) << AN_OFFSHIFT) & (uintptr_t)0xFFFFFFFF)); \
+	(AP)->an_vp = swapfs_getvp(((uintptr_t)(AP) >> AN_CACHE_ALIGN_LOG2) \
+	    & AN_VPMASK); 						\
+	(AP)->an_off = (anoff_t)((((uintptr_t)(AP)) >>			\
+	    AN_VPSHIFT + AN_CACHE_ALIGN_LOG2) << PAGESHIFT);		\
 }
 
 /*
diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h
index 8b75225a64..e9f247ae1a 100644
--- a/usr/src/uts/common/sys/vnode.h
+++ b/usr/src/uts/common/sys/vnode.h
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
@@ -261,6 +260,10 @@ typedef struct vnode {
 #define	IS_DEVVP(vp)	\
 	((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO)
 
+#define	VNODE_ALIGN	64
+/* Count of low-order 0 bits in a vnode *, based on size and alignment. */
+#define	VNODE_ALIGN_LOG2	8
+
 /*
  * vnode flags.
  */
diff --git a/usr/src/uts/common/vm/anon.h b/usr/src/uts/common/vm/anon.h
index 652fcc0951..a2e07d0b18 100644
--- a/usr/src/uts/common/vm/anon.h
+++ b/usr/src/uts/common/vm/anon.h
@@ -91,6 +91,11 @@ struct anon {
 	int an_refcnt;		/* # of people sharing slot */
 };
 
+#define	AN_CACHE_ALIGN_LOG2	4	/* log2(AN_CACHE_ALIGN) */
+#define	AN_CACHE_ALIGN	(1U << AN_CACHE_ALIGN_LOG2) /* anon address aligned */
+						/* 16 bytes */
+
+
 #ifdef _KERNEL
 /*
  * The swapinfo_lock protects:
@@ -121,11 +126,24 @@ extern kcondvar_t anon_array_cv[];
  * Global hash table to provide a function from (vp, off) -> ap
  */
 extern size_t anon_hash_size;
+extern unsigned int anon_hash_shift;
 extern struct anon **anon_hash;
 #define	ANON_HASH_SIZE	anon_hash_size
 #define	ANON_HASHAVELEN	4
-#define	ANON_HASH(VP, OFF)	\
-((((uintptr_t)(VP) >> 7)  ^ ((OFF) >> PAGESHIFT)) & (ANON_HASH_SIZE - 1))
+/*
+ * Try to use as many bits of randomness from both vp and off as we can.
+ * This should help spreading evenly for a variety of workloads.  See comments
+ * for PAGE_HASH_FUNC for more explanation.
+ */
+#define	ANON_HASH(vp, off)	\
+	(((((uintptr_t)(off) >> PAGESHIFT) ^ \
+		((uintptr_t)(off) >> (PAGESHIFT + anon_hash_shift))) ^ \
+		(((uintptr_t)(vp) >> 3) ^ \
+		((uintptr_t)(vp) >> (3 + anon_hash_shift)) ^ \
+		((uintptr_t)(vp) >> (3 + 2 * anon_hash_shift)) ^ \
+		((uintptr_t)(vp) << \
+		    (anon_hash_shift - AN_VPSHIFT - VNODE_ALIGN_LOG2)))) & \
+		(anon_hash_size - 1))
 
 #define	AH_LOCK_SIZE	(2 << NCPU_LOG2)
 
diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h
index 026ea7c29b..7fa4af9a4a 100644
--- a/usr/src/uts/common/vm/page.h
+++ b/usr/src/uts/common/vm/page.h
@@ -102,10 +102,37 @@ typedef int	selock_t;
 #ifdef _KERNEL
 
 /*
- * Macros to acquire and release the page logical lock.
+ * PAGE_LLOCK_SIZE is 2 * NCPU_P2, but no smaller than 128.
+ * PAGE_LLOCK_SHIFT is log2(PAGE_LLOCK_SIZE).
  */
-#define	page_struct_lock(pp)	mutex_enter(&page_llock)
-#define	page_struct_unlock(pp)	mutex_exit(&page_llock)
+#if ((2*NCPU_P2) > 128)
+#define	PAGE_LLOCK_SHIFT	((unsigned)(NCPU_LOG2 + 1))
+#else
+#define	PAGE_LLOCK_SHIFT	7U
+#endif
+#define	PAGE_LLOCK_SIZE (1 << PAGE_LLOCK_SHIFT)
+
+/*
+ * The number of low order 0 bits in the page_t address.
+ */
+#define	PP_SHIFT		7
+
+/*
+ * pp may be the root of a large page, and many low order bits will be 0.
+ * Shift and XOR multiple times to capture the good bits across the range of
+ * possible page sizes.
+ */
+#define	PAGE_LLOCK_HASH(pp)	\
+	(((((uintptr_t)(pp) >> PP_SHIFT) ^ \
+	((uintptr_t)(pp) >> (PAGE_LLOCK_SHIFT + PP_SHIFT))) ^ \
+	((uintptr_t)(pp) >> ((PAGE_LLOCK_SHIFT * 2) + PP_SHIFT)) ^ \
+	((uintptr_t)(pp) >> ((PAGE_LLOCK_SHIFT * 3) + PP_SHIFT))) & \
+	(PAGE_LLOCK_SIZE - 1))
+
+#define	page_struct_lock(pp)	\
+	mutex_enter(&page_llocks[PAGE_LLOCK_HASH(PP_PAGEROOT(pp))].pad_mutex)
+#define	page_struct_unlock(pp)	\
+	mutex_exit(&page_llocks[PAGE_LLOCK_HASH(PP_PAGEROOT(pp))].pad_mutex)
 
 #endif	/* _KERNEL */
 
@@ -171,7 +198,7 @@ struct as;
  *				p_next
  *				p_prev
  *
- * The following fields are protected by the global page_llock:
+ * The following fields are protected by the global page_llocks[]:
  *
  *				p_lckcnt
  *				p_cowcnt
@@ -348,8 +375,11 @@ struct as;
  *							    sleep while holding
  *							    this lock.
  *	=====================================================================
- *	p_lckcnt	p_selock(E,S)	p_selock(E) &&
- *	p_cowcnt			page_llock
+ *	p_lckcnt	p_selock(E,S)	p_selock(E)
+ *					    OR
+ *					p_selock(S) &&
+ *					page_llocks[]
+ *	p_cowcnt
  *	=====================================================================
  *	p_nrm		hat layer lock	hat layer lock
  *	p_mapping
@@ -535,44 +565,61 @@ typedef	page_t	devpage_t;
  * resulting hashed value.  Note that this will perform quickly, since the
  * shifting/summing are fast register to register operations with no additional
  * memory references).
+ *
+ * PH_SHIFT_SIZE is the amount to use for the successive shifts in the hash
+ * function below.  The actual value is LOG2(PH_TABLE_SIZE), so that as many
+ * bits as possible will filter thru PAGE_HASH_FUNC() and PAGE_HASH_MUTEX().
  */
 #if defined(_LP64)
 
 #if NCPU < 4
 #define	PH_TABLE_SIZE	128
-#define	VP_SHIFT	7
+#define	PH_SHIFT_SIZE	7
 #else
-#define	PH_TABLE_SIZE	1024
-#define	VP_SHIFT	9
+#define	PH_TABLE_SIZE	(2 * NCPU_P2)
+#define	PH_SHIFT_SIZE	(NCPU_LOG2 + 1)
 #endif
 
 #else	/* 32 bits */
 
 #if NCPU < 4
 #define	PH_TABLE_SIZE	16
-#define	VP_SHIFT	7
+#define	PH_SHIFT_SIZE	4
 #else
 #define	PH_TABLE_SIZE	128
-#define	VP_SHIFT	9
+#define	PH_SHIFT_SIZE	7
 #endif
 
 #endif	/* _LP64 */
 
 /*
- * The amount to use for the successive shifts in the hash function below.
- * The actual value is LOG2(PH_TABLE_SIZE), so that as many bits as
- * possible will filter thru PAGE_HASH_FUNC() and PAGE_HASH_MUTEX().
+ *
+ * We take care to get as much randomness as possible from both the vp and
+ * the offset.  Workloads can have few vnodes with many offsets, many vnodes
+ * with few offsets or a moderate mix of both.  This hash should perform
+ * equally well for each of these possibilities and for all types of memory
+ * allocations.
+ *
+ * vnodes representing files are created over a long period of time and
+ * have good variation in the upper vp bits, and the right shifts below
+ * capture these bits.  However, swap vnodes are created quickly in a
+ * narrow vp* range.  Refer to comments at swap_alloc: vnum has exactly
+ * AN_VPSHIFT bits, so the kmem_alloc'd vnode addresses have approximately
+ * AN_VPSHIFT bits of variation above their VNODE_ALIGN_LOG2 low order 0 bits.
+ * Spread swap vnodes widely in the hash table by XOR'ing a term with the
+ * vp bits of variation left shifted to the top of the range.
  */
-#define	PH_SHIFT_SIZE   (7)
 
 #define	PAGE_HASHSZ	page_hashsz
 #define	PAGE_HASHAVELEN		4
 #define	PAGE_HASH_FUNC(vp, off) \
-	((((uintptr_t)(off) >> PAGESHIFT) + \
-		((uintptr_t)(off) >> (PAGESHIFT + PH_SHIFT_SIZE)) + \
-		((uintptr_t)(vp) >> 3) + \
-		((uintptr_t)(vp) >> (3 + PH_SHIFT_SIZE)) + \
-		((uintptr_t)(vp) >> (3 + 2 * PH_SHIFT_SIZE))) & \
+	(((((uintptr_t)(off) >> PAGESHIFT) ^ \
+		((uintptr_t)(off) >> (PAGESHIFT + PH_SHIFT_SIZE))) ^ \
+		(((uintptr_t)(vp) >> 3) ^ \
+		((uintptr_t)(vp) >> (3 + PH_SHIFT_SIZE)) ^ \
+		((uintptr_t)(vp) >> (3 + 2 * PH_SHIFT_SIZE)) ^ \
+		((uintptr_t)(vp) << \
+		(page_hashsz_shift - AN_VPSHIFT - VNODE_ALIGN_LOG2)))) & \
 		(PAGE_HASHSZ - 1))
 #ifdef _KERNEL
 
@@ -588,16 +635,10 @@ typedef	page_t	devpage_t;
  * Since sizeof (kmutex_t) is 8, we shift an additional 3 to skew to a different
  * 64 byte sub-block.
  */
-typedef struct pad_mutex {
-	kmutex_t	pad_mutex;
-#ifdef _LP64
-	char		pad_pad[64 - sizeof (kmutex_t)];
-#endif
-} pad_mutex_t;
 extern pad_mutex_t ph_mutex[];
 
 #define	PAGE_HASH_MUTEX(x) \
-	&(ph_mutex[((x) + ((x) >> VP_SHIFT) + ((x) << 3)) & \
+	&(ph_mutex[((x) ^ ((x) >> PH_SHIFT_SIZE) + ((x) << 3)) & \
 		(PH_TABLE_SIZE - 1)].pad_mutex)
 
 /*
@@ -626,9 +667,10 @@ extern pad_mutex_t ph_mutex[];
 	((se) == SE_EXCL ? PAGE_EXCL(pp) : PAGE_SHARED(pp))
 
 extern	long page_hashsz;
+extern	unsigned int page_hashsz_shift;
 extern	page_t **page_hash;
 
-extern	kmutex_t page_llock;		/* page logical lock mutex */
+extern	pad_mutex_t page_llocks[];	/* page logical lock mutex */
 extern	kmutex_t freemem_lock;		/* freemem lock */
 
 extern	pgcnt_t	total_pages;		/* total pages in the system */
diff --git a/usr/src/uts/common/vm/page_lock.c b/usr/src/uts/common/vm/page_lock.c
index 8003884652..7e48602189 100644
--- a/usr/src/uts/common/vm/page_lock.c
+++ b/usr/src/uts/common/vm/page_lock.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 
@@ -42,14 +41,14 @@
 #include <vm/seg_kmem.h>
 
 /*
- * This global mutex is for logical page locking.
+ * This global mutex array is for logical page locking.
  * The following fields in the page structure are protected
  * by this lock:
  *
  *	p_lckcnt
  *	p_cowcnt
  */
-kmutex_t page_llock;
+pad_mutex_t page_llocks[8 * NCPU_P2];
 
 /*
  * This is a global lock for the logical page free list.  The
@@ -127,14 +126,10 @@ static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];
  * an address of a vnode.
  */
 
-/*
- * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
- *	Need to review again.
- */
 #if defined(_LP64)
-#define	VPH_TABLE_SIZE  (1 << (VP_SHIFT + 3))
+#define	VPH_TABLE_SIZE  (8 * NCPU_P2)
 #else	/* 32 bits */
-#define	VPH_TABLE_SIZE	(2 << VP_SHIFT)
+#define	VPH_TABLE_SIZE	(2 * NCPU_P2)
 #endif
 
 #define	VP_HASH_FUNC(vp) \
diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c
index 666b98f389..31c293d416 100644
--- a/usr/src/uts/common/vm/seg_vn.c
+++ b/usr/src/uts/common/vm/seg_vn.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -6483,10 +6482,26 @@ segvn_claim_pages(
 	ASSERT(pg_idx <= pgcnt);
 	ppa[pg_idx] = NULL;
 
-	if (prot & PROT_WRITE)
-		err = page_addclaim_pages(ppa);
-	else
-		err = page_subclaim_pages(ppa);
+
+	/* Find each large page within ppa, and adjust its claim */
+
+	/* Does ppa cover a single large page? */
+	if (ppa[0]->p_szc == seg->s_szc) {
+		if (prot & PROT_WRITE)
+			err = page_addclaim_pages(ppa);
+		else
+			err = page_subclaim_pages(ppa);
+	} else {
+		for (i = 0; ppa[i]; i += pgcnt) {
+			ASSERT(IS_P2ALIGNED(page_pptonum(ppa[i]), pgcnt));
+			if (prot & PROT_WRITE)
+				err = page_addclaim_pages(&ppa[i]);
+			else
+				err = page_subclaim_pages(&ppa[i]);
+			if (err == 0)
+				break;
+		}
+	}
 
 	for (i = 0; i < pg_idx; i++) {
 		ASSERT(ppa[i] != NULL);
diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c
index 6ded5d7192..4916f5d376 100644
--- a/usr/src/uts/common/vm/vm_anon.c
+++ b/usr/src/uts/common/vm/vm_anon.c
@@ -138,6 +138,7 @@ kcondvar_t	anon_array_cv[ANON_LOCKSIZE];
  */
 extern	int swap_maxcontig;
 size_t	anon_hash_size;
+unsigned int anon_hash_shift;
 struct anon **anon_hash;
 
 static struct kmem_cache *anon_cache;
@@ -199,7 +200,8 @@ anon_init(void)
 	pad_mutex_t *tmp;
 
 	/* These both need to be powers of 2 so round up to the next power */
-	anon_hash_size = 1L << highbit((physmem / ANON_HASHAVELEN) - 1);
+	anon_hash_shift = highbit((physmem / ANON_HASHAVELEN) - 1);
+	anon_hash_size = 1L << anon_hash_shift;
 
 	/*
 	 * We need to align the anonhash_lock and anonpages_hash_lock arrays
diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c
index a35f7cc196..169b9c84e7 100644
--- a/usr/src/uts/common/vm/vm_page.c
+++ b/usr/src/uts/common/vm/vm_page.c
@@ -3977,11 +3977,27 @@ page_pp_useclaim(
 	uint_t	write_perm) 	/* set if vpage has PROT_WRITE */
 {
 	int payback = 0;
+	int nidx, oidx;
 
 	ASSERT(PAGE_LOCKED(opp));
 	ASSERT(PAGE_LOCKED(npp));
 
-	page_struct_lock(opp);
+	/*
+	 * Since we have two pages we probably have two locks.  We need to take
+	 * them in a defined order to avoid deadlocks.  It's also possible they
+	 * both hash to the same lock in which case this is a non-issue.
+	 */
+	nidx = PAGE_LLOCK_HASH(PP_PAGEROOT(npp));
+	oidx = PAGE_LLOCK_HASH(PP_PAGEROOT(opp));
+	if (nidx < oidx) {
+		page_struct_lock(npp);
+		page_struct_lock(opp);
+	} else if (oidx < nidx) {
+		page_struct_lock(opp);
+		page_struct_lock(npp);
+	} else {	/* The pages hash to the same lock */
+		page_struct_lock(npp);
+	}
 
 	ASSERT(npp->p_cowcnt == 0);
 	ASSERT(npp->p_lckcnt == 0);
@@ -4017,7 +4033,16 @@ page_pp_useclaim(
 		pages_useclaim--;
 		mutex_exit(&freemem_lock);
 	}
-	page_struct_unlock(opp);
+
+	if (nidx < oidx) {
+		page_struct_unlock(opp);
+		page_struct_unlock(npp);
+	} else if (oidx < nidx) {
+		page_struct_unlock(npp);
+		page_struct_unlock(opp);
+	} else {	/* The pages hash to the same lock */
+		page_struct_unlock(npp);
+	}
 }
 
 /*
@@ -4103,21 +4128,27 @@ page_subclaim(page_t *pp)
 	return (r);
 }
 
+/*
+ * Variant of page_addclaim(), where ppa[] contains the pages of a single large
+ * page.
+ */
 int
 page_addclaim_pages(page_t  **ppa)
 {
-
 	pgcnt_t	lckpgs = 0, pg_idx;
 
 	VM_STAT_ADD(pagecnt.pc_addclaim_pages);
 
-	mutex_enter(&page_llock);
+	/*
+	 * Only need to take the page struct lock on the large page root.
+	 */
+	page_struct_lock(ppa[0]);
 	for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
 
 		ASSERT(PAGE_LOCKED(ppa[pg_idx]));
 		ASSERT(ppa[pg_idx]->p_lckcnt != 0);
 		if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
-			mutex_exit(&page_llock);
+			page_struct_unlock(ppa[0]);
 			return (0);
 		}
 		if (ppa[pg_idx]->p_lckcnt > 1)
@@ -4131,7 +4162,7 @@ page_addclaim_pages(page_t  **ppa)
 			pages_claimed += lckpgs;
 		} else {
 			mutex_exit(&freemem_lock);
-			mutex_exit(&page_llock);
+			page_struct_unlock(ppa[0]);
 			return (0);
 		}
 		mutex_exit(&freemem_lock);
@@ -4141,10 +4172,14 @@ page_addclaim_pages(page_t  **ppa)
 		ppa[pg_idx]->p_lckcnt--;
 		ppa[pg_idx]->p_cowcnt++;
 	}
-	mutex_exit(&page_llock);
+	page_struct_unlock(ppa[0]);
 	return (1);
 }
 
+/*
+ * Variant of page_subclaim(), where ppa[] contains the pages of a single large
+ * page.
+ */
 int
 page_subclaim_pages(page_t  **ppa)
 {
@@ -4152,13 +4187,16 @@ page_subclaim_pages(page_t  **ppa)
 
 	VM_STAT_ADD(pagecnt.pc_subclaim_pages);
 
-	mutex_enter(&page_llock);
+	/*
+	 * Only need to take the page struct lock on the large page root.
+	 */
+	page_struct_lock(ppa[0]);
 	for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
 
 		ASSERT(PAGE_LOCKED(ppa[pg_idx]));
 		ASSERT(ppa[pg_idx]->p_cowcnt != 0);
 		if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
-			mutex_exit(&page_llock);
+			page_struct_unlock(ppa[0]);
 			return (0);
 		}
 		if (ppa[pg_idx]->p_lckcnt != 0)
@@ -4177,7 +4215,7 @@ page_subclaim_pages(page_t  **ppa)
 		ppa[pg_idx]->p_lckcnt++;
 
 	}
-	mutex_exit(&page_llock);
+	page_struct_unlock(ppa[0]);
 	return (1);
 }
 
diff --git a/usr/src/uts/common/vm/vm_pagelist.c b/usr/src/uts/common/vm/vm_pagelist.c
index 7b761da108..eda3552c03 100644
--- a/usr/src/uts/common/vm/vm_pagelist.c
+++ b/usr/src/uts/common/vm/vm_pagelist.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -60,6 +59,7 @@
 #include <sys/mem_cage.h>
 #include <sys/sdt.h>
 #include <sys/dumphdr.h>
+#include <sys/swap.h>
 
 extern uint_t	vac_colors;
 
diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c
index ad719dfcdd..f69b37a9f2 100644
--- a/usr/src/uts/i86pc/os/startup.c
+++ b/usr/src/uts/i86pc/os/startup.c
@@ -289,6 +289,7 @@ int segzio_fromheap = 1;
  * VM data structures
  */
 long page_hashsz;		/* Size of page hash table (power of two) */
+unsigned int page_hashsz_shift;	/* log2(page_hashsz) */
 struct page *pp_base;		/* Base of initial system page struct array */
 struct page **page_hash;	/* Page hash table */
 pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
@@ -1126,12 +1127,15 @@ startup_memlist(void)
 	ADD_TO_ALLOCATIONS(bios_rsvd, rsvdmemlist_sz);
 	PRM_DEBUG(rsvdmemlist_sz);
 
+	/* LINTED */
+	ASSERT(P2SAMEHIGHBIT((1 << PP_SHIFT), sizeof (struct page)));
 	/*
 	 * The page structure hash table size is a power of 2
 	 * such that the average hash chain length is PAGE_HASHAVELEN.
 	 */
 	page_hashsz = npages / PAGE_HASHAVELEN;
-	page_hashsz = 1 << highbit(page_hashsz);
+	page_hashsz_shift = highbit(page_hashsz);
+	page_hashsz = 1 << page_hashsz_shift;
 	pagehash_sz = sizeof (struct page *) * page_hashsz;
 	ADD_TO_ALLOCATIONS(page_hash, pagehash_sz);
 	PRM_DEBUG(pagehash_sz);
diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h
index ed3c5d5fcc..a0fa08db16 100644
--- a/usr/src/uts/i86pc/sys/machparam.h
+++ b/usr/src/uts/i86pc/sys/machparam.h
@@ -60,6 +60,9 @@ extern "C" {
 #define	NCPU_LOG2	5
 #endif
 
+/* NCPU_P2 is NCPU rounded to a power of 2 */
+#define	NCPU_P2	(1 << NCPU_LOG2)
+
 /*
  * The value defined below could grow to 16. hat structure and
  * page_t have room for 16 nodes.
diff --git a/usr/src/uts/i86pc/vm/vm_machdep.c b/usr/src/uts/i86pc/vm/vm_machdep.c
index 79c0ee073e..dfdca87e1c 100644
--- a/usr/src/uts/i86pc/vm/vm_machdep.c
+++ b/usr/src/uts/i86pc/vm/vm_machdep.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 /*
  * Copyright (c) 2010, Intel Corporation.
@@ -58,6 +57,7 @@
 #include <sys/exechdr.h>
 #include <sys/debug.h>
 #include <sys/vmsystm.h>
+#include <sys/swap.h>
 
 #include <vm/hat.h>
 #include <vm/as.h>
diff --git a/usr/src/uts/sfmmu/vm/hat_sfmmu.c b/usr/src/uts/sfmmu/vm/hat_sfmmu.c
index 6156017a5e..baaf08d7e5 100644
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c
+++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -775,15 +774,26 @@ kpm_shlk_t	*kpmp_stable;
 uint_t		kpmp_stable_sz;	/* must be a power of 2 */
 
 /*
- * SPL_HASH was improved to avoid false cache line sharing
+ * SPL_TABLE_SIZE is 2 * NCPU_P2, but no smaller than 128.
+ * SPL_SHIFT is log2(SPL_TABLE_SIZE).
  */
-#define	SPL_TABLE_SIZE	128
+#if ((2*NCPU_P2) > 128)
+#define	SPL_SHIFT	((unsigned)(NCPU_LOG2 + 1))
+#else
+#define	SPL_SHIFT	7U
+#endif
+#define	SPL_TABLE_SIZE	(1U << SPL_SHIFT)
 #define	SPL_MASK	(SPL_TABLE_SIZE - 1)
-#define	SPL_SHIFT	7		/* log2(SPL_TABLE_SIZE) */
 
+/*
+ * We shift by PP_SHIFT to take care of the low-order 0 bits of a page_t
+ * and by multiples of SPL_SHIFT to get as many varied bits as we can.
+ */
 #define	SPL_INDEX(pp) \
-	((((uintptr_t)(pp) >> SPL_SHIFT) ^ \
-	((uintptr_t)(pp) >> (SPL_SHIFT << 1))) & \
+	((((uintptr_t)(pp) >> PP_SHIFT) ^ \
+	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT)) ^ \
+	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 2)) ^ \
+	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 3))) & \
 	(SPL_TABLE_SIZE - 1))
 
 #define	SPL_HASH(pp)    \
diff --git a/usr/src/uts/sun4/os/startup.c b/usr/src/uts/sun4/os/startup.c
index 914451cf4f..fe1dffc468 100644
--- a/usr/src/uts/sun4/os/startup.c
+++ b/usr/src/uts/sun4/os/startup.c
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/machsystm.h>
@@ -62,6 +61,7 @@
 #include <sys/memnode.h>
 #include <sys/mem_cage.h>
 #include <sys/mmu.h>
+#include <sys/swap.h>
 
 extern void setup_trap_table(void);
 extern int cpu_intrq_setup(struct cpu *);
@@ -174,6 +174,7 @@ pgcnt_t obp_pages;		/* Physical pages used by OBP */
  * VM data structures
  */
 long page_hashsz;		/* Size of page hash table (power of two) */
+unsigned int page_hashsz_shift;	/* log2(page_hashsz) */
 struct page *pp_base;		/* Base of system page struct array */
 size_t pp_sz;			/* Size in bytes of page struct array */
 struct page **page_hash;	/* Page hash table */
@@ -748,13 +749,16 @@ calc_kpmpp_sz(pgcnt_t npages)
 size_t
 calc_pagehash_sz(pgcnt_t npages)
 {
-
+	/* LINTED */
+	ASSERT(P2SAMEHIGHBIT((1 << PP_SHIFT), (sizeof (struct page))));
 	/*
 	 * The page structure hash table size is a power of 2
 	 * such that the average hash chain length is PAGE_HASHAVELEN.
 	 */
 	page_hashsz = npages / PAGE_HASHAVELEN;
-	page_hashsz = 1 << highbit(page_hashsz);
+	page_hashsz_shift = MAX((AN_VPSHIFT + VNODE_ALIGN_LOG2 + 1),
+	    highbit(page_hashsz));
+	page_hashsz = 1 << page_hashsz_shift;
 	return (page_hashsz * sizeof (struct page *));
 }
 
diff --git a/usr/src/uts/sun4u/sys/machparam.h b/usr/src/uts/sun4u/sys/machparam.h
index e60d02a2cc..b0130af21f 100644
--- a/usr/src/uts/sun4u/sys/machparam.h
+++ b/usr/src/uts/sun4u/sys/machparam.h
@@ -103,6 +103,9 @@ extern "C" {
 #error	"add test for larger NCPU"
 #endif
 
+/* NCPU_P2 is NCPU rounded to a power of 2 */
+#define	NCPU_P2	(1 << NCPU_LOG2)
+
 /*
  * Maximum number of processors that we support.  With CMP processors, the
  * portid may not be equal to cpuid.  MAX_CPU_CHIPID can be defined in a
diff --git a/usr/src/uts/sun4v/sys/machparam.h b/usr/src/uts/sun4v/sys/machparam.h
index 4fe46b985b..b47b33c370 100644
--- a/usr/src/uts/sun4v/sys/machparam.h
+++ b/usr/src/uts/sun4v/sys/machparam.h
@@ -63,6 +63,9 @@ extern "C" {
 #error "Must define NCPU_LOG2 together with NCPU"
 #endif
 
+/* NCPU_P2 is NCPU rounded to a power of 2 */
+#define	NCPU_P2	(1 << NCPU_LOG2)
+
 /*
  * Maximum number of processors that we support.  With CMP processors, the
  * portid may not be equal to cpuid.  MAX_CPU_CHIPID can be defined in a
author	Peter Rival <Frank.Rival@oracle.com>	2010-04-23 13:26:05 -0400
committer	Peter Rival <Frank.Rival@oracle.com>	2010-04-23 13:26:05 -0400
commit	cb15d5d96b3b2730714c28bfe06cfe7421758b8c (patch)
tree	7fd5c3cf5bb49647be8b2eb022e8d75a7d78eab5
parent	03c76a6ef5c04e818b6badeeb6155961505af45c (diff)
download	illumos-joyent-cb15d5d96b3b2730714c28bfe06cfe7421758b8c.tar.gz