author     aguzovsk <none@none>  2007-06-08 18:58:50 -0700
committer  aguzovsk <none@none>  2007-06-08 18:58:50 -0700
commit     2cb27123907a098a777e39eebc349d73e99a518f (patch)
tree       c95bccd47a25fb4e5c193a71cfa32f4747f03032 /usr/src
parent     01e689ccb14727455003b77ee332005223497875 (diff)
download   illumos-joyent-2cb27123907a098a777e39eebc349d73e99a518f.tar.gz
6544121 Implement text replication
Diffstat (limited to 'usr/src')
 usr/src/cmd/mdb/sun4u/modules/wrsm/v9/Makefile   |    8
 usr/src/cmd/mdb/sun4u/modules/wrsmd/v9/Makefile  |    3
 usr/src/cmd/mdb/sun4v/modules/ldc/v9/Makefile    |    2
 usr/src/cmd/mdb/sun4v/modules/vdsk/v9/Makefile   |    4
 usr/src/uts/common/os/exec.c                     |   10
 usr/src/uts/common/os/fork.c                     |    3
 usr/src/uts/common/os/lgrp.c                     |   61
 usr/src/uts/common/os/lwp.c                      |   14
 usr/src/uts/common/os/schedctl.c                 |    2
 usr/src/uts/common/os/shm.c                      |    2
 usr/src/uts/common/sys/file.h                    |    4
 usr/src/uts/common/sys/lgrp.h                    |   12
 usr/src/uts/common/sys/mman.h                    |    4
 usr/src/uts/common/sys/proc.h                    |    7
 usr/src/uts/common/sys/types.h                   |    2
 usr/src/uts/common/vm/anon.h                     |   18
 usr/src/uts/common/vm/page.h                     |    3
 usr/src/uts/common/vm/seg_spt.c                  |    2
 usr/src/uts/common/vm/seg_vn.c                   | 1054
 usr/src/uts/common/vm/seg_vn.h                   |   59
 usr/src/uts/common/vm/vm_anon.c                  |   41
 usr/src/uts/common/vm/vm_as.c                    |    4
 usr/src/uts/common/vm/vm_page.c                  |   15
 usr/src/uts/common/vm/vm_pagelist.c              |   36
 usr/src/uts/common/vm/vm_usage.c                 |   11
 usr/src/uts/i86pc/os/mlsetup.c                   |    2
 usr/src/uts/i86pc/os/startup.c                   |    8
 usr/src/uts/sun4/os/mlsetup.c                    |    2
 28 files changed, 1245 insertions(+), 148 deletions(-)
diff --git a/usr/src/cmd/mdb/sun4u/modules/wrsm/v9/Makefile b/usr/src/cmd/mdb/sun4u/modules/wrsm/v9/Makefile
index 8399b25ab5..780ca8144c 100644
--- a/usr/src/cmd/mdb/sun4u/modules/wrsm/v9/Makefile
+++ b/usr/src/cmd/mdb/sun4u/modules/wrsm/v9/Makefile
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
# CDDL HEADER END
#
#
-# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#ident "%Z%%M% %I% %E% SMI"
@@ -40,5 +39,6 @@ CPPFLAGS += -DMP -D_MACHDEP
CPPFLAGS += -D_KERNEL
CPPFLAGS += -I../../../../common
CPPFLAGS += -I$(SRC)/uts/sun4u
+CPPFLAGS += -I$(SRC)/uts/sun4
CPPFLAGS += -I$(SRC)/uts/sfmmu
CPPFLAGS += -I$(SRC)/uts/sparc/v9
diff --git a/usr/src/cmd/mdb/sun4u/modules/wrsmd/v9/Makefile b/usr/src/cmd/mdb/sun4u/modules/wrsmd/v9/Makefile
index 0f8a310b25..649aeb25dd 100644
--- a/usr/src/cmd/mdb/sun4u/modules/wrsmd/v9/Makefile
+++ b/usr/src/cmd/mdb/sun4u/modules/wrsmd/v9/Makefile
@@ -20,7 +20,7 @@
#
#
-# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#ident "%Z%%M% %I% %E% SMI"
@@ -40,5 +40,6 @@ CPPFLAGS += -DMP -D_MACHDEP
CPPFLAGS += -D_KERNEL
CPPFLAGS += -I../../../../../../../src/cmd/mdb/common
CPPFLAGS += -I$(SRC)/uts/sun4u
+CPPFLAGS += -I$(SRC)/uts/sun4
CPPFLAGS += -I$(SRC)/uts/sfmmu
CPPFLAGS += -I$(SRC)/uts/sparc/v9
diff --git a/usr/src/cmd/mdb/sun4v/modules/ldc/v9/Makefile b/usr/src/cmd/mdb/sun4v/modules/ldc/v9/Makefile
index 9bfdd1e71c..8f53c91ab1 100644
--- a/usr/src/cmd/mdb/sun4v/modules/ldc/v9/Makefile
+++ b/usr/src/cmd/mdb/sun4v/modules/ldc/v9/Makefile
@@ -42,3 +42,5 @@ MODSRCS_DIR = ../
CPPFLAGS += -DMP -D_MACHDEP
CPPFLAGS += -Dsun4v
CPPFLAGS += -I$(SRC)/uts/sun4v
+CPPFLAGS += -I$(SRC)/uts/sun4
+CPPFLAGS += -I$(SRC)/uts/sparc/v9
diff --git a/usr/src/cmd/mdb/sun4v/modules/vdsk/v9/Makefile b/usr/src/cmd/mdb/sun4v/modules/vdsk/v9/Makefile
index a449a6b174..42ffab660e 100644
--- a/usr/src/cmd/mdb/sun4v/modules/vdsk/v9/Makefile
+++ b/usr/src/cmd/mdb/sun4v/modules/vdsk/v9/Makefile
@@ -20,7 +20,7 @@
#
#
-# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
@@ -42,3 +42,5 @@ MODSRCS_DIR = ../
CPPFLAGS += -DMP -D_MACHDEP
CPPFLAGS += -Dsun4v
CPPFLAGS += -I$(SRC)/uts/sun4v
+CPPFLAGS += -I$(SRC)/uts/sun4
+CPPFLAGS += -I$(SRC)/uts/sparc/v9
diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c
index 3e8f45a8a7..a17678863b 100644
--- a/usr/src/uts/common/os/exec.c
+++ b/usr/src/uts/common/os/exec.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -49,6 +49,7 @@
#include <sys/debug.h>
#include <sys/pathname.h>
#include <sys/vm.h>
+#include <sys/lgrp.h>
#include <sys/vtrace.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
@@ -432,6 +433,13 @@ exec_common(const char *fname, const char **argp, const char **envp,
*/
ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
curthread->t_tid = 1;
+ kpreempt_disable();
+ ASSERT(curthread->t_lpl != NULL);
+ p->p_t1_lgrpid = curthread->t_lpl->lpl_lgrpid;
+ kpreempt_enable();
+ if (p->p_tr_lgrpid != LGRP_NONE && p->p_tr_lgrpid != p->p_t1_lgrpid) {
+ lgrp_update_trthr_migrations(1);
+ }
curthread->t_unpark = 0;
curthread->t_proc_flag |= TP_TWAIT;
curthread->t_proc_flag &= ~TP_DAEMON; /* daemons shouldn't exec */
diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c
index 8e11ec7437..e533c00456 100644
--- a/usr/src/uts/common/os/fork.c
+++ b/usr/src/uts/common/os/fork.c
@@ -946,6 +946,8 @@ getproc(proc_t **cpp, int kernel)
* look at the p_zone field.
*/
cp->p_zone = pp->p_zone;
+ cp->p_t1_lgrpid = LGRP_NONE;
+ cp->p_tr_lgrpid = LGRP_NONE;
if ((newpid = pid_allocate(cp, PID_ALLOC_PROC)) == -1) {
if (nproc == v.v_proc) {
@@ -1283,6 +1285,7 @@ try_again:
kpreempt_enable();
as_free(as);
+ p->p_tr_lgrpid = LGRP_NONE;
}
}
}
diff --git a/usr/src/uts/common/os/lgrp.c b/usr/src/uts/common/os/lgrp.c
index 346a57c82f..b325bd7f9b 100644
--- a/usr/src/uts/common/os/lgrp.c
+++ b/usr/src/uts/common/os/lgrp.c
@@ -410,7 +410,8 @@ lgrp_main_init(void)
* Enforce a valid lgrp_mem_default_policy
*/
if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
- (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
+ (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) ||
+ (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG))
lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
/*
@@ -3183,6 +3184,26 @@ lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
}
/*
+ * lgrp_trthr_moves counts the number of times the main thread (t_tid = 1) of
+ * a process that uses text replication changes its home lgrp. This info is
+ * used by the segvn asynchronous thread to detect whether it needs to recheck
+ * which lgrps should be used for text replication.
+ */
+static uint64_t lgrp_trthr_moves = 0;
+
+uint64_t
+lgrp_get_trthr_migrations(void)
+{
+ return (lgrp_trthr_moves);
+}
+
+void
+lgrp_update_trthr_migrations(uint64_t incr)
+{
+ atomic_add_64(&lgrp_trthr_moves, incr);
+}
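
As the comment above notes, this counter only moves when the T1 thread of a text-replicating process is rehomed. Its consumer is segvn's asynchronous thread (segvn_trupdate_wakeup()/segvn_trupdate(), declared later in this patch), which compares the current value against the last value it observed and rescans replicated segments only when the two differ. A minimal sketch of that consumer pattern, using a hypothetical saved-value variable that is not part of this change:

	static uint64_t saved_trthr_moves;	/* illustrative only */

	if (lgrp_get_trthr_migrations() != saved_trthr_moves) {
		saved_trthr_moves = lgrp_get_trthr_migrations();
		/*
		 * Some process using text replication had its main thread
		 * rehomed; recheck which lgrps its text amps should come
		 * from (segvn_trupdate() in seg_vn.c does this).
		 */
	}
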
+
+/*
* An LWP is expected to be assigned to an lgroup for at least this long
* for its anticipatory load to be justified. NOTE that this value should
* not be set extremely huge (say, larger than 100 years), to avoid problems
@@ -3332,6 +3353,14 @@ lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
* This thread is moving to a new lgroup
*/
t->t_lpl = newlpl;
+ if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
+ p->p_t1_lgrpid = newlpl->lpl_lgrpid;
+ membar_producer();
+ if (p->p_tr_lgrpid != LGRP_NONE &&
+ p->p_tr_lgrpid != p->p_t1_lgrpid) {
+ lgrp_update_trthr_migrations(1);
+ }
+ }
/*
* Reflect move in load average of new lgroup
@@ -3493,7 +3522,7 @@ lgrp_privm_policy_set(lgrp_mem_policy_t policy,
* Set policy
*/
policy_info->mem_policy = policy;
- policy_info->mem_reserved = 0;
+ policy_info->mem_lgrpid = LGRP_NONE;
return (0);
}
@@ -3604,8 +3633,22 @@ lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
policy = LGRP_MEM_POLICY_RANDOM;
} else {
policy_info = lgrp_mem_policy_get(seg, vaddr);
- if (policy_info != NULL)
+ if (policy_info != NULL) {
policy = policy_info->mem_policy;
+ if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
+ lgrp_id_t id = policy_info->mem_lgrpid;
+ ASSERT(id != LGRP_NONE);
+ ASSERT(id < NLGRPS_MAX);
+ lgrp = lgrp_table[id];
+ if (!LGRP_EXISTS(lgrp)) {
+ policy = LGRP_MEM_POLICY_NEXT;
+ } else {
+ lgrp_stat_add(id,
+ LGRP_NUM_NEXT_SEG, 1);
+ return (lgrp);
+ }
+ }
+ }
}
}
lgrpset = 0;
@@ -4167,7 +4210,7 @@ lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
KM_SLEEP);
newseg->shm_policy.mem_policy = policy;
- newseg->shm_policy.mem_reserved = 0;
+ newseg->shm_policy.mem_lgrpid = LGRP_NONE;
newseg->shm_off = off;
avl_insert(tree, newseg, where);
@@ -4229,7 +4272,7 @@ lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
* Set policy and update current length
*/
seg->shm_policy.mem_policy = policy;
- seg->shm_policy.mem_reserved = 0;
+ seg->shm_policy.mem_lgrpid = LGRP_NONE;
len = 0;
/*
@@ -4262,7 +4305,8 @@ lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
*/
if (eoff == oldeoff) {
newseg->shm_policy.mem_policy = policy;
- newseg->shm_policy.mem_reserved = 0;
+ newseg->shm_policy.mem_lgrpid =
+ LGRP_NONE;
(void) lgrp_shm_policy_concat(tree,
newseg, AVL_NEXT(tree, newseg));
break;
@@ -4278,12 +4322,13 @@ lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
(void) lgrp_shm_policy_split(tree,
newseg, eoff);
newseg->shm_policy.mem_policy = policy;
- newseg->shm_policy.mem_reserved = 0;
+ newseg->shm_policy.mem_lgrpid =
+ LGRP_NONE;
} else {
(void) lgrp_shm_policy_split(tree, seg,
eoff);
seg->shm_policy.mem_policy = policy;
- seg->shm_policy.mem_reserved = 0;
+ seg->shm_policy.mem_lgrpid = LGRP_NONE;
}
if (off == seg->shm_off)
diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c
index c1d1a870e0..60eee244dd 100644
--- a/usr/src/uts/common/os/lwp.c
+++ b/usr/src/uts/common/os/lwp.c
@@ -462,6 +462,17 @@ grow:
branded = 1;
}
+ if (t->t_tid == 1) {
+ kpreempt_disable();
+ ASSERT(t->t_lpl != NULL);
+ p->p_t1_lgrpid = t->t_lpl->lpl_lgrpid;
+ kpreempt_enable();
+ if (p->p_tr_lgrpid != LGRP_NONE &&
+ p->p_tr_lgrpid != p->p_t1_lgrpid) {
+ lgrp_update_trthr_migrations(1);
+ }
+ }
+
p->p_lwpcnt++;
t->t_waitfor = -1;
@@ -886,6 +897,9 @@ lwp_cleanup(void)
*/
kpreempt_disable();
lgrp_move_thread(t, NULL, 1);
+ if (t->t_tid == 1) {
+ p->p_t1_lgrpid = LGRP_NONE;
+ }
kpreempt_enable();
lwp_ctmpl_clear(ttolwp(t));
diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c
index 0c964aa399..8a189b3c97 100644
--- a/usr/src/uts/common/os/schedctl.c
+++ b/usr/src/uts/common/os/schedctl.c
@@ -563,7 +563,7 @@ schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr)
* Set up anonymous memory struct. No swap reservation is
* needed since the page will be locked into memory.
*/
- amp = anonmap_alloc(PAGESIZE, 0);
+ amp = anonmap_alloc(PAGESIZE, 0, ANON_SLEEP);
/*
* Allocate the page.
diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c
index a488dbf4e0..cd99615eef 100644
--- a/usr/src/uts/common/os/shm.c
+++ b/usr/src/uts/common/os/shm.c
@@ -881,7 +881,7 @@ top:
* unreserving the above anon, and freeing the below amp.
*/
- sp->shm_amp = anonmap_alloc(rsize, rsize);
+ sp->shm_amp = anonmap_alloc(rsize, rsize, ANON_SLEEP);
sp->shm_amp->a_sp = sp;
/*
* Store the original user's requested size, in bytes,
diff --git a/usr/src/uts/common/sys/file.h b/usr/src/uts/common/sys/file.h
index 20300002c5..e893696564 100644
--- a/usr/src/uts/common/sys/file.h
+++ b/usr/src/uts/common/sys/file.h
@@ -23,7 +23,7 @@
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -182,7 +182,7 @@ extern int closef(file_t *);
extern int closeandsetf(int, file_t *);
extern int ufalloc_file(int, file_t *);
extern int ufalloc(int);
-extern int ufcanalloc(proc_t *, uint_t);
+extern int ufcanalloc(struct proc *, uint_t);
extern int falloc(struct vnode *, int, file_t **, int *);
extern void finit(void);
extern void unfalloc(file_t *);
diff --git a/usr/src/uts/common/sys/lgrp.h b/usr/src/uts/common/sys/lgrp.h
index 8f35095adf..c0ed75d981 100644
--- a/usr/src/uts/common/sys/lgrp.h
+++ b/usr/src/uts/common/sys/lgrp.h
@@ -41,9 +41,6 @@ extern "C" {
#define LGRP_NONE (-1) /* non-existent lgroup ID */
-
-typedef id_t lgrp_id_t; /* lgroup ID */
-
#if (!defined(_KERNEL) && !defined(_KMEMUSER))
typedef struct lgrp_mem_policy_info { int opaque[2]; } lgrp_mem_policy_info_t;
#endif /* !_KERNEL && !_KMEMUSER */
@@ -160,6 +157,7 @@ typedef enum lgrp_stat_types {
LGRP_NUM_RANDOM_PROC, /* # of times random proc policy applied */
LGRP_NUM_RANDOM_PSET, /* # of times random pset policy applied */
LGRP_NUM_ROUNDROBIN, /* # of times round robin policy applied */
+ LGRP_NUM_NEXT_SEG, /* # of times next to seg policy applied */
LGRP_NUM_COUNTER_STATS, /* always last */
LGRP_CTR_STATS_ALLOC = 16 /* cache-align pad - multiple of 8 */
/* always keep >= LGRP_NUM_COUNTER_STATS */
@@ -193,6 +191,7 @@ static char *lgrp_kstat_names[] = { \
"span process policy", \
"span psrset policy", \
"round robin policy", \
+ "next-seg policy", \
\
/* Snapshot stats */ \
"cpus", \
@@ -311,6 +310,7 @@ typedef enum lgrp_mem_policy {
LGRP_MEM_POLICY_RANDOM, /* randomly across all lgroups */
LGRP_MEM_POLICY_ROUNDROBIN, /* round robin across all lgroups */
LGRP_MEM_POLICY_NEXT_CPU, /* Near next CPU to touch memory */
+ LGRP_MEM_POLICY_NEXT_SEG, /* lgrp specified directly by seg */
LGRP_NUM_MEM_POLICIES
} lgrp_mem_policy_t;
@@ -339,8 +339,8 @@ typedef struct lgrp_mnode_cookie {
* Information needed to implement memory allocation policy
*/
typedef struct lgrp_mem_policy_info {
- int mem_policy; /* memory allocation policy */
- int mem_reserved; /* reserved */
+ int mem_policy; /* memory allocation policy */
+ lgrp_id_t mem_lgrpid; /* lgroup id */
} lgrp_mem_policy_info_t;
/*
@@ -561,6 +561,8 @@ lgrp_t *lgrp_home_lgrp(void);
lgrp_id_t lgrp_home_id(kthread_t *);
void lgrp_loadavg(lpl_t *, uint_t, int);
void lgrp_move_thread(kthread_t *, lpl_t *, int);
+uint64_t lgrp_get_trthr_migrations(void);
+void lgrp_update_trthr_migrations(uint64_t);
/*
* lgroup topology
diff --git a/usr/src/uts/common/sys/mman.h b/usr/src/uts/common/sys/mman.h
index ffbf284f3d..5132833ed0 100644
--- a/usr/src/uts/common/sys/mman.h
+++ b/usr/src/uts/common/sys/mman.h
@@ -82,6 +82,10 @@ extern "C" {
#define MAP_TEXT 0x400 /* map code segment */
#define MAP_INITDATA 0x800 /* map data segment */
+#ifdef _KERNEL
+#define _MAP_TEXTREPL 0x1000
+#endif /* _KERNEL */
+
/* these flags not yet implemented */
#define MAP_RENAME 0x20 /* rename private pages to file */
diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h
index a6320da055..93ceeaf604 100644
--- a/usr/src/uts/common/sys/proc.h
+++ b/usr/src/uts/common/sys/proc.h
@@ -216,8 +216,11 @@ typedef struct proc {
uint_t p_tidhash_sz; /* number of p_tidhash[] entries */
uint64_t p_lgrpset; /* unprotected hint of set of lgrps */
/* on which process has threads */
- uintptr_t p_lgrpres1; /* reserved for lgrp migration */
- uintptr_t p_lgrpres2; /* reserved for lgrp migration */
+ volatile lgrp_id_t p_t1_lgrpid; /* main's thread lgroup id */
+ volatile lgrp_id_t p_tr_lgrpid; /* text replica's lgroup id */
+#if defined(_LP64)
+ uintptr_t p_lgrpres2; /* reserved for lgrp migration */
+#endif
/*
* /proc (process filesystem) debugger interface stuff.
*/
diff --git a/usr/src/uts/common/sys/types.h b/usr/src/uts/common/sys/types.h
index 3ff5497cef..1745b91e87 100644
--- a/usr/src/uts/common/sys/types.h
+++ b/usr/src/uts/common/sys/types.h
@@ -300,6 +300,8 @@ typedef int id_t;
typedef long id_t; /* (historical version) */
#endif
+typedef id_t lgrp_id_t; /* lgroup ID */
+
/*
* Type useconds_t is an unsigned integral type capable of storing
* values at least in the range of zero to 1,000,000.
diff --git a/usr/src/uts/common/vm/anon.h b/usr/src/uts/common/vm/anon.h
index ed59ec590b..0bd457fcd3 100644
--- a/usr/src/uts/common/vm/anon.h
+++ b/usr/src/uts/common/vm/anon.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -371,10 +371,10 @@ extern int swap_getconpage(struct vnode *, u_offset_t, size_t,
extern int anon_map_getpages(struct anon_map *, ulong_t,
uint_t, struct seg *, caddr_t, uint_t,
uint_t *, page_t *[], uint_t *,
- struct vpage [], enum seg_rw, int, int, struct cred *);
+ struct vpage [], enum seg_rw, int, int, int, struct cred *);
extern int anon_map_privatepages(struct anon_map *, ulong_t,
uint_t, struct seg *, caddr_t, uint_t,
- page_t *[], struct vpage [], int, struct cred *);
+ page_t *[], struct vpage [], int, int, struct cred *);
extern struct page *anon_private(struct anon **, struct seg *,
caddr_t, uint_t, struct page *,
int, struct cred *);
@@ -388,9 +388,9 @@ extern int anon_map_demotepages(struct anon_map *, ulong_t,
struct seg *, caddr_t, uint_t,
struct vpage [], struct cred *);
extern void anon_shmap_free_pages(struct anon_map *, ulong_t, size_t);
-extern int anon_resvmem(size_t, boolean_t, zone_t *);
+extern int anon_resvmem(size_t, boolean_t, zone_t *, int);
extern void anon_unresvmem(size_t, zone_t *);
-extern struct anon_map *anonmap_alloc(size_t, size_t);
+extern struct anon_map *anonmap_alloc(size_t, size_t, int);
extern void anonmap_free(struct anon_map *);
extern void anon_decref(struct anon *);
extern int non_anon(struct anon_hdr *, ulong_t, u_offset_t *, size_t *);
@@ -424,9 +424,11 @@ extern void anon_array_exit(anon_sync_obj_t *);
*/
#define anon_unresv(size) anon_unresvmem(size, curproc->p_zone)
#define anon_unresv_zone(size, zone) anon_unresvmem(size, zone)
-#define anon_resv(size) anon_resvmem((size), 1, curproc->p_zone)
-#define anon_resv_zone(size, zone) anon_resvmem((size), 1, zone)
-#define anon_checkspace(size, zone) anon_resvmem((size), 0, zone)
+#define anon_resv(size) \
+ anon_resvmem((size), 1, curproc->p_zone, 1)
+#define anon_resv_zone(size, zone) anon_resvmem((size), 1, zone, 1)
+#define anon_checkspace(size, zone) anon_resvmem((size), 0, zone, 1)
+#define anon_try_resv_zone(size, zone) anon_resvmem((size), 1, zone, 0)
/*
* Flags to anon_private
diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h
index 940b32946b..45a3811eaf 100644
--- a/usr/src/uts/common/vm/page.h
+++ b/usr/src/uts/common/vm/page.h
@@ -614,6 +614,7 @@ extern pad_mutex_t ph_mutex[];
/* Page must be PP_ISNORELOC */
#define PG_PANIC 0x0020 /* system will panic if alloc fails */
#define PG_PUSHPAGE 0x0040 /* alloc may use reserve */
+#define PG_LOCAL 0x0080 /* alloc from given lgrp only */
/*
* When p_selock has the SE_EWANTED bit set, threads waiting for SE_EXCL
@@ -669,7 +670,7 @@ int page_exists_forreal(struct vnode *, u_offset_t, uint_t *);
void page_needfree(spgcnt_t);
page_t *page_create(struct vnode *, u_offset_t, size_t, uint_t);
int page_alloc_pages(struct vnode *, struct seg *, caddr_t, page_t **,
- page_t **, uint_t, int);
+ page_t **, uint_t, int, int);
page_t *page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes,
uint_t flags, struct seg *seg, caddr_t vaddr, void *arg);
page_t *page_create_va(struct vnode *, u_offset_t, size_t, uint_t,
diff --git a/usr/src/uts/common/vm/seg_spt.c b/usr/src/uts/common/vm/seg_spt.c
index a68276a1b5..d694d68d7d 100644
--- a/usr/src/uts/common/vm/seg_spt.c
+++ b/usr/src/uts/common/vm/seg_spt.c
@@ -2337,7 +2337,7 @@ spt_anon_getpages(
ppa_szc = (uint_t)-1;
ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
- &ppa_szc, vpage, rw, 0, segvn_anypgsz, kcred);
+ &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred);
anon_array_exit(&cookie);
if (ierr != 0) {
diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c
index 23160a1e22..dfca552662 100644
--- a/usr/src/uts/common/vm/seg_vn.c
+++ b/usr/src/uts/common/vm/seg_vn.c
@@ -58,6 +58,7 @@
#include <sys/sysmacros.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
+#include <sys/callb.h>
#include <sys/vm.h>
#include <sys/dumphdr.h>
#include <sys/lgrp.h>
@@ -235,6 +236,7 @@ segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL);
+ svd->svn_trnext = svd->svn_trprev = NULL;
return (0);
}
@@ -248,6 +250,14 @@ segvn_cache_destructor(void *buf, void *cdrarg)
mutex_destroy(&svd->segp_slock);
}
+/*ARGSUSED*/
+static int
+svntr_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ bzero(buf, sizeof (svntr_t));
+ return (0);
+}
+
/*
* Patching this variable to non-zero allows the system to run with
* stacks marked as "not executable". It's a bit of a kludge, but is
@@ -289,6 +299,78 @@ ulong_t segvn_faultvnmpss_align_err5;
ulong_t segvn_vmpss_pageio_deadlk_err;
/*
+ * Segvn supports a text replication optimization for NUMA platforms. Text
+ * replicas are represented by anon maps (amp). There's one amp per text file
+ * region per lgroup. A process chooses the amp for each of its text mappings
+ * based on the lgroup assignment of its main thread (t_tid = 1). All
+ * processes that want a replica on a particular lgroup for the same text file
+ * mapping share the same amp. amps are looked up in the svntr_hashtab hash
+ * table with vp,off,size,szc used as the key. Text replication segments are
+ * read only MAP_PRIVATE|MAP_TEXT segments that map a vnode. Replication is
+ * achieved by forcing COW faults from vnode to amp and mapping amp pages
+ * instead of vnode pages. The replication amp is assigned to a segment when
+ * it gets its first pagefault. To handle main thread lgroup rehoming,
+ * segvn_trasync_thread periodically rechecks if the process still maps an
+ * amp local to the main thread. If not, the async thread forces the process
+ * to remap to an amp in the main thread's new home lgroup. The current text
+ * replication implementation only benefits workloads that do most of their
+ * work in the main thread of a process or whose threads all run in the same
+ * lgroup. To extend the text replication benefit to other types of
+ * multithreaded workloads, further work would be needed in the hat layer to
+ * allow the same virtual address in the same hat to simultaneously map
+ * different physical addresses (i.e. page table replication would be needed
+ * for x86).
+ *
+ * amp pages are used instead of vnode pages only as long as the segment has
+ * a very simple life cycle: it's created via segvn_create(), handles S_EXEC
+ * (S_READ) pagefaults and is fully unmapped. If anything more complicated
+ * happens, such as a protection change, a real COW fault, a pagesize change,
+ * an MC_LOCK request or a partial unmap, we turn off text replication by
+ * converting the segment back to a vnode only segment (unmap the segment's
+ * address range and set svd->amp to NULL).
+ *
+ * The original file can be changed after an amp is inserted into
+ * svntr_hashtab. Processes that are launched after the file has already
+ * changed can't use the replicas created prior to the file change. To
+ * implement this, hash entries are timestamped. Replicas can only be used if
+ * the current file modification time is the same as the timestamp saved when
+ * the hash entry was created. However, timestamps alone are not sufficient
+ * to detect file modification via mmap(MAP_SHARED) mappings, so we deal with
+ * file changes via MAP_SHARED mappings differently. When writable MAP_SHARED
+ * mappings are created to vnodes marked as executable, we mark all existing
+ * replicas for this vnode as not usable for future text mappings, and we
+ * don't create new replicas for files that currently have potentially
+ * writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE) is true).
+ */
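
Condensed, the binding path described above is a hash lookup keyed by (vp, off, size, szc) followed by selection of the per-lgroup amp for the main thread's home lgroup. The fragment below is only a rough sketch of segvn_textrepl() later in this patch; locking, eligibility and staleness checks, statistics and the amp allocation path are all omitted:

	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	ulong_t hash = SVNTR_HASH_FUNC(svd->vp);
	lgrp_id_t lgrp_id = seg->s_as->a_proc->p_t1_lgrpid;
	svntr_t *svntrp;

	for (svntrp = svntr_hashtab[hash].tr_head; svntrp != NULL;
	    svntrp = svntrp->tr_next) {
		if (svntrp->tr_vp == svd->vp &&
		    svntrp->tr_off == svd->offset &&
		    svntrp->tr_eoff == svd->offset + seg->s_size &&
		    svntrp->tr_szc == seg->s_szc)
			break;		/* reuse an existing replica entry */
	}
	/* (a new svntr_t is allocated and inserted here if nothing matched) */
	svd->amp = svntrp->tr_amp[lgrp_id];	/* one amp per lgroup */
	svd->anon_index = 0;
	svd->tr_policy_info.mem_policy = LGRP_MEM_POLICY_NEXT_SEG;
	svd->tr_policy_info.mem_lgrpid = lgrp_id;
	svd->tr_state = SEGVN_TR_ON;
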
+
+#define SEGVN_TEXTREPL_MAXBYTES_FACTOR (20)
+size_t segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR;
+
+static ulong_t svntr_hashtab_sz = 512;
+static svntr_bucket_t *svntr_hashtab = NULL;
+static struct kmem_cache *svntr_cache;
+static svntr_stats_t *segvn_textrepl_stats;
+static ksema_t segvn_trasync_sem;
+
+int segvn_disable_textrepl = 0;
+size_t textrepl_size_thresh = (size_t)-1;
+size_t segvn_textrepl_bytes = 0;
+size_t segvn_textrepl_max_bytes = 0;
+clock_t segvn_update_textrepl_interval = 0;
+int segvn_update_tr_time = 10;
+int segvn_disable_textrepl_update = 0;
+
+static void segvn_textrepl(struct seg *);
+static void segvn_textunrepl(struct seg *, int);
+static void segvn_inval_trcache(vnode_t *);
+static void segvn_trasync_thread(void);
+static void segvn_trupdate_wakeup(void *);
+static void segvn_trupdate(void);
+static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *,
+ ulong_t);
+
+/*
* Initialize segvn data structures
*/
void
@@ -324,6 +406,28 @@ segvn_init(void)
}
if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
segvn_maxpgszc = maxszc;
+
+ if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 &&
+ !segvn_disable_textrepl) {
+ ulong_t i;
+ size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t);
+
+ svntr_cache = kmem_cache_create("svntr_cache",
+ sizeof (svntr_t), 0, svntr_cache_constructor, NULL,
+ NULL, NULL, NULL, 0);
+ svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP);
+ for (i = 0; i < svntr_hashtab_sz; i++) {
+ mutex_init(&svntr_hashtab[i].tr_lock, NULL,
+ MUTEX_DEFAULT, NULL);
+ }
+ segvn_textrepl_max_bytes = ptob(physmem) /
+ segvn_textrepl_max_bytes_factor;
+ segvn_textrepl_stats = kmem_zalloc(NCPU *
+ sizeof (svntr_stats_t), KM_SLEEP);
+ sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL);
+ (void) thread_create(NULL, 0, segvn_trasync_thread,
+ NULL, 0, &p0, TS_RUN, minclsyspri);
+ }
}
#define SEGVN_PAGEIO ((void *)0x1)
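
For scale, the cap established in segvn_init() above is ptob(physmem) / segvn_textrepl_max_bytes_factor, i.e. one twentieth of physical memory with the default factor of 20: on a hypothetical 32 GB machine that works out to roughly 1.6 GB of replicated text system-wide. Once segvn_textrepl_bytes would exceed this cap, segvn_textrepl() declines to create additional replicas (counted by the normem statistic).
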
@@ -372,6 +476,7 @@ segvn_create(struct seg *seg, void *argsp)
int error = 0;
size_t pgsz;
lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT;
+ int trok = 0;
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
@@ -454,7 +559,7 @@ segvn_create(struct seg *seg, void *argsp)
}
/* Inform the vnode of the new mapping */
- if (a->vp) {
+ if (a->vp != NULL) {
error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
seg->s_as, seg->s_base, seg->s_size, a->prot,
a->maxprot, a->type, cred);
@@ -470,15 +575,22 @@ segvn_create(struct seg *seg, void *argsp)
seg->s_size, HAT_UNLOAD_UNMAP);
return (error);
}
+ trok = ((a->flags & MAP_TEXT) &&
+ (seg->s_size > textrepl_size_thresh ||
+ (a->flags & _MAP_TEXTREPL)) &&
+ lgrp_optimizations() && svntr_hashtab != NULL &&
+ a->type == MAP_PRIVATE && swresv == 0 &&
+ !(a->flags & MAP_NORESERVE) &&
+ seg->s_as != &kas && a->vp->v_type == VREG);
}
/*
- * If more than one segment in the address space, and
- * they're adjacent virtually, try to concatenate them.
- * Don't concatenate if an explicit anon_map structure
- * was supplied (e.g., SystemV shared memory).
+ * If more than one segment in the address space, and they're adjacent
+ * virtually, try to concatenate them. Don't concatenate if an
+ * explicit anon_map structure was supplied (e.g., SystemV shared
+ * memory) or if we'll use text replication for this segment.
*/
- if (a->amp == NULL) {
+ if (a->amp == NULL && !trok) {
struct seg *pseg, *nseg;
struct segvn_data *psvd, *nsvd;
lgrp_mem_policy_t ppolicy, npolicy;
@@ -490,7 +602,7 @@ segvn_create(struct seg *seg, void *argsp)
* extending stack/heap segments.
*/
if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
- !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
+ !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
} else {
/*
@@ -602,6 +714,7 @@ segvn_create(struct seg *seg, void *argsp)
seg->s_data = (void *)svd;
seg->s_szc = a->szc;
+ svd->seg = seg;
svd->vp = a->vp;
/*
* Anonymous mappings have no backing file so the offset is meaningless.
@@ -620,6 +733,11 @@ segvn_create(struct seg *seg, void *argsp)
if (a->szc != 0 && a->vp != NULL) {
segvn_setvnode_mpss(a->vp);
}
+ if (svd->type == MAP_SHARED && svd->vp != NULL &&
+ (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) {
+ ASSERT(vn_is_mapped(svd->vp, V_WRITE));
+ segvn_inval_trcache(svd->vp);
+ }
amp = a->amp;
if ((svd->amp = amp) == NULL) {
@@ -634,7 +752,8 @@ segvn_create(struct seg *seg, void *argsp)
* by remembering the swap reservation there.
*/
if (a->vp == NULL) {
- svd->amp = anonmap_alloc(seg->s_size, swresv);
+ svd->amp = anonmap_alloc(seg->s_size, swresv,
+ ANON_SLEEP);
svd->amp->a_szc = seg->s_szc;
}
} else {
@@ -696,7 +815,7 @@ segvn_create(struct seg *seg, void *argsp)
hat_flag |= HAT_LOAD_TEXT;
}
- svd->amp = anonmap_alloc(seg->s_size, 0);
+ svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
svd->amp->a_szc = seg->s_szc;
svd->anon_index = 0;
svd->swresv = swresv;
@@ -763,6 +882,9 @@ segvn_create(struct seg *seg, void *argsp)
(void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
svd->vp, svd->offset, seg->s_size);
+ ASSERT(!trok || !(svd->prot & PROT_WRITE));
+ svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF;
+
return (0);
}
@@ -808,6 +930,13 @@ segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat)
}
/*
+ * Don't concatenate if either segment uses text replication.
+ */
+ if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) {
+ return (-1);
+ }
+
+ /*
* Fail early if we're not supposed to concatenate
* segments with non NULL amp.
*/
@@ -1010,6 +1139,10 @@ segvn_extend_prev(seg1, seg2, a, swresv)
svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
return (-1);
+ if (svd1->tr_state != SEGVN_TR_OFF) {
+ return (-1);
+ }
+
amp1 = svd1->amp;
if (amp1) {
pgcnt_t newpgs;
@@ -1071,6 +1204,12 @@ segvn_extend_prev(seg1, seg2, a, swresv)
seg_free(seg2);
seg1->s_size += size;
svd1->swresv += swresv;
+ if (svd1->pageprot && (a->prot & PROT_WRITE) &&
+ svd1->type == MAP_SHARED && svd1->vp != NULL &&
+ (svd1->vp->v_flag & VVMEXEC)) {
+ ASSERT(vn_is_mapped(svd1->vp, V_WRITE));
+ segvn_inval_trcache(svd1->vp);
+ }
return (0);
}
@@ -1109,6 +1248,10 @@ segvn_extend_next(
(a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
return (-1);
+ if (svd2->tr_state != SEGVN_TR_OFF) {
+ return (-1);
+ }
+
amp2 = svd2->amp;
if (amp2) {
pgcnt_t newpgs;
@@ -1173,6 +1316,12 @@ segvn_extend_next(
seg2->s_base -= size;
svd2->offset -= size;
svd2->swresv += swresv;
+ if (svd2->pageprot && (a->prot & PROT_WRITE) &&
+ svd2->type == MAP_SHARED && svd2->vp != NULL &&
+ (svd2->vp->v_flag & VVMEXEC)) {
+ ASSERT(vn_is_mapped(svd2->vp, V_WRITE));
+ segvn_inval_trcache(svd2->vp);
+ }
return (0);
}
@@ -1185,6 +1334,7 @@ segvn_dup(struct seg *seg, struct seg *newseg)
int error = 0;
uint_t prot;
size_t len;
+ struct anon_map *amp;
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
@@ -1210,6 +1360,7 @@ segvn_dup(struct seg *seg, struct seg *newseg)
newseg->s_data = (void *)newsvd;
newseg->s_szc = seg->s_szc;
+ newsvd->seg = newseg;
if ((newsvd->vp = svd->vp) != NULL) {
VN_HOLD(svd->vp);
if (svd->type == MAP_SHARED)
@@ -1228,16 +1379,23 @@ segvn_dup(struct seg *seg, struct seg *newseg)
newsvd->flags = svd->flags;
newsvd->softlockcnt = 0;
newsvd->policy_info = svd->policy_info;
- if ((newsvd->amp = svd->amp) == NULL) {
+ if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) {
/*
* Not attaching to a shared anon object.
*/
+ if (svd->tr_state == SEGVN_TR_ON) {
+ ASSERT(newsvd->vp != NULL && amp != NULL);
+ newsvd->tr_state = SEGVN_TR_INIT;
+ } else {
+ newsvd->tr_state = svd->tr_state;
+ }
+ newsvd->amp = NULL;
newsvd->anon_index = 0;
} else {
- struct anon_map *amp;
-
- amp = svd->amp;
+ ASSERT(svd->tr_state == SEGVN_TR_OFF);
+ newsvd->tr_state = SEGVN_TR_OFF;
if (svd->type == MAP_SHARED) {
+ newsvd->amp = amp;
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
amp->refcnt++;
ANON_LOCK_EXIT(&amp->a_rwlock);
@@ -1248,7 +1406,8 @@ segvn_dup(struct seg *seg, struct seg *newseg)
/*
* Allocate and initialize new anon_map structure.
*/
- newsvd->amp = anonmap_alloc(newseg->s_size, 0);
+ newsvd->amp = anonmap_alloc(newseg->s_size, 0,
+ ANON_SLEEP);
newsvd->amp->a_szc = newseg->s_szc;
newsvd->anon_index = 0;
@@ -1438,6 +1597,7 @@ segvn_unmap(struct seg *seg, caddr_t addr, size_t len)
size_t nsize;
size_t oswresv;
int reclaim = 1;
+ int unmap = 1;
/*
* We don't need any segment level locks for "segvn" data
@@ -1451,6 +1611,7 @@ segvn_unmap(struct seg *seg, caddr_t addr, size_t len)
*/
retry:
if (svd->softlockcnt > 0) {
+ ASSERT(svd->tr_state == SEGVN_TR_OFF);
/*
* since we do have the writers lock nobody can fill
* the cache during the purge. The flush either succeeds
@@ -1478,6 +1639,14 @@ retry:
int err;
if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
ASSERT(seg->s_base != addr || seg->s_size != len);
+ if (svd->tr_state == SEGVN_TR_INIT) {
+ svd->tr_state = SEGVN_TR_OFF;
+ } else if (svd->tr_state == SEGVN_TR_ON) {
+ ASSERT(svd->amp != NULL);
+ segvn_textunrepl(seg, 1);
+ ASSERT(svd->amp == NULL);
+ ASSERT(svd->tr_state == SEGVN_TR_OFF);
+ }
VM_STAT_ADD(segvnvmstats.demoterange[0]);
err = segvn_demote_range(seg, addr, len, SDR_END, 0);
if (err == 0) {
@@ -1499,21 +1668,41 @@ retry:
if (error == EAGAIN)
return (error);
}
+
+ if (svd->tr_state == SEGVN_TR_INIT) {
+ svd->tr_state = SEGVN_TR_OFF;
+ } else if (svd->tr_state == SEGVN_TR_ON) {
+ ASSERT(svd->amp != NULL);
+ ASSERT(svd->pageprot == 0 && !(svd->prot & PROT_WRITE));
+ segvn_textunrepl(seg, 1);
+ ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
+ unmap = 0;
+ }
+
/*
* Remove any page locks set through this mapping.
*/
(void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0);
- /*
- * Unload any hardware translations in the range to be taken out.
- * Use a callback to invoke free_vp_pages() effectively.
- */
- if (svd->vp != NULL && free_pages != 0) {
- callback.hcb_data = seg;
- callback.hcb_function = segvn_hat_unload_callback;
- cbp = &callback;
+ if (unmap) {
+ /*
+ * Unload any hardware translations in the range to be taken
+ * out. Use a callback to invoke free_vp_pages() effectively.
+ */
+ if (svd->vp != NULL && free_pages != 0) {
+ callback.hcb_data = seg;
+ callback.hcb_function = segvn_hat_unload_callback;
+ cbp = &callback;
+ }
+ hat_unload_callback(seg->s_as->a_hat, addr, len,
+ HAT_UNLOAD_UNMAP, cbp);
+
+ if (svd->type == MAP_SHARED && svd->vp != NULL &&
+ (svd->vp->v_flag & VVMEXEC) &&
+ ((svd->prot & PROT_WRITE) || svd->pageprot)) {
+ segvn_inval_trcache(svd->vp);
+ }
}
- hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP, cbp);
/*
* Check for entire segment
@@ -1697,6 +1886,7 @@ retry:
nseg->s_data = (void *)nsvd;
nseg->s_szc = seg->s_szc;
*nsvd = *svd;
+ nsvd->seg = nseg;
nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
nsvd->swresv = 0;
nsvd->softlockcnt = 0;
@@ -1784,7 +1974,7 @@ retry:
ASSERT(svd->type == MAP_PRIVATE);
nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
- namp = anonmap_alloc(nseg->s_size, 0);
+ namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
namp->a_szc = seg->s_szc;
(void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp,
0, btop(seg->s_size), ANON_SLEEP);
@@ -1839,6 +2029,7 @@ segvn_free(struct seg *seg)
* since the address space is "write" locked.
*/
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+ ASSERT(svd->tr_state == SEGVN_TR_OFF);
/*
* Be sure to unlock pages. XXX Why do things get free'ed instead
@@ -2496,7 +2687,20 @@ segvn_faultpage(
* that fatal protection checks have already been made.
*/
- cow = brkcow && ((vpprot & PROT_WRITE) == 0);
+ if (brkcow) {
+ ASSERT(svd->tr_state == SEGVN_TR_OFF);
+ cow = !(vpprot & PROT_WRITE);
+ } else if (svd->tr_state == SEGVN_TR_ON) {
+ /*
+ * If we are doing text replication, COW on first touch.
+ */
+ ASSERT(amp != NULL);
+ ASSERT(svd->vp != NULL);
+ ASSERT(rw != S_WRITE);
+ cow = (ap == NULL);
+ } else {
+ cow = 0;
+ }
/*
* If not a copy-on-write case load the translation
@@ -3422,10 +3626,12 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
int physcontig;
int upgrdfail;
int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */
+ int tron = (svd->tr_state == SEGVN_TR_ON);
ASSERT(szc != 0);
ASSERT(vp != NULL);
ASSERT(brkcow == 0 || amp != NULL);
+ ASSERT(tron == 0 || amp != NULL);
ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
ASSERT(!(svd->flags & MAP_NORESERVE));
ASSERT(type != F_SOFTUNLOCK);
@@ -3509,11 +3715,8 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
anon_array_enter(amp, aindx, &an_cookie);
if (anon_get_ptr(amp->ahp, aindx) != NULL) {
SEGVN_VMSTAT_FLTVNPAGES(5);
- if (anon_pages(amp->ahp, aindx,
- maxpages) != maxpages) {
- panic("segvn_fault_vnodepages:"
- " empty anon slots\n");
- }
+ ASSERT(anon_pages(amp->ahp, aindx,
+ maxpages) == maxpages);
anon_array_exit(&an_cookie);
ANON_LOCK_EXIT(&amp->a_rwlock);
err = segvn_fault_anonpages(hat, seg,
@@ -3531,17 +3734,16 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
lpgeaddr = maxlpgeaddr;
}
goto next;
- } else if (anon_pages(amp->ahp, aindx,
- maxpages)) {
- panic("segvn_fault_vnodepages:"
- " non empty anon slots\n");
} else {
+ ASSERT(anon_pages(amp->ahp, aindx,
+ maxpages) == 0);
SEGVN_VMSTAT_FLTVNPAGES(7);
anon_array_exit(&an_cookie);
ANON_LOCK_EXIT(&amp->a_rwlock);
}
}
ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz));
+ ASSERT(!tron || IS_P2ALIGNED(a, maxpgsz));
if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) {
ASSERT(vpage != NULL);
@@ -3570,12 +3772,12 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
pplist = NULL;
physcontig = 0;
ppa[0] = NULL;
- if (!brkcow && szc &&
+ if (!brkcow && !tron && szc &&
!page_exists_physcontig(vp, off, szc,
segtype == MAP_PRIVATE ? ppa : NULL)) {
SEGVN_VMSTAT_FLTVNPAGES(9);
if (page_alloc_pages(vp, seg, a, &pplist, NULL,
- szc, 0) && type != F_SOFTLOCK) {
+ szc, 0, 0) && type != F_SOFTLOCK) {
SEGVN_VMSTAT_FLTVNPAGES(10);
pszc = 0;
ierr = -1;
@@ -3604,7 +3806,7 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
physcontig = 0;
}
}
- } else if (!brkcow && szc && ppa[0] != NULL) {
+ } else if (!brkcow && !tron && szc && ppa[0] != NULL) {
SEGVN_VMSTAT_FLTVNPAGES(13);
ASSERT(segtype == MAP_PRIVATE);
physcontig = 1;
@@ -3668,7 +3870,7 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
err = FC_MAKE_ERR(ierr);
goto out;
}
- if (brkcow || type == F_SOFTLOCK) {
+ if (brkcow || tron || type == F_SOFTLOCK) {
/* can't reduce map area */
SEGVN_VMSTAT_FLTVNPAGES(23);
vop_size_err = 1;
@@ -3690,11 +3892,8 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
ulong_t taindx = P2ALIGN(aindx, maxpages);
SEGVN_VMSTAT_FLTVNPAGES(25);
- if (anon_pages(amp->ahp, taindx, maxpages) !=
- maxpages) {
- panic("segvn_fault_vnodepages:"
- " empty anon slots\n");
- }
+ ASSERT(anon_pages(amp->ahp, taindx,
+ maxpages) == maxpages);
for (i = 0; i < pages; i++) {
page_unlock(ppa[i]);
}
@@ -3717,9 +3916,12 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
* Therefore if we are here for
* SOFTLOCK case it must be a cow
* break but cow break never reduces
- * szc. Thus the assert below.
+ * szc. Text replication (tron) in
+ * this case works as a cow break.
+ * Thus the assert below.
*/
- ASSERT(!brkcow && type != F_SOFTLOCK);
+ ASSERT(!brkcow && !tron &&
+ type != F_SOFTLOCK);
pszc = seg->s_szc;
ierr = -2;
break;
@@ -3734,7 +3936,7 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
}
#endif /* DEBUG */
- if (brkcow) {
+ if (brkcow || tron) {
ASSERT(amp != NULL);
ASSERT(pplist == NULL);
ASSERT(szc == seg->s_szc);
@@ -3743,7 +3945,7 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
SEGVN_VMSTAT_FLTVNPAGES(27);
ierr = anon_map_privatepages(amp, aindx, szc,
seg, a, prot, ppa, vpage, segvn_anypgsz,
- svd->cred);
+ tron ? PG_LOCAL : 0, svd->cred);
if (ierr != 0) {
SEGVN_VMSTAT_FLTVNPAGES(28);
anon_array_exit(&an_cookie);
@@ -4032,7 +4234,7 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
*/
if (pplist == NULL &&
page_alloc_pages(vp, seg, a, &pplist, NULL,
- szc, 0) && type != F_SOFTLOCK) {
+ szc, 0, 0) && type != F_SOFTLOCK) {
SEGVN_VMSTAT_FLTVNPAGES(38);
for (i = 0; i < pages; i++) {
page_unlock(ppa[i]);
@@ -4092,7 +4294,7 @@ segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
break;
ASSERT(a < lpgeaddr);
- ASSERT(!brkcow && type != F_SOFTLOCK);
+ ASSERT(!brkcow && !tron && type != F_SOFTLOCK);
/*
* ierr == -1 means we failed to map with a large page.
@@ -4178,14 +4380,14 @@ out:
SEGVN_VMSTAT_FLTVNPAGES(47);
return (err);
}
- ASSERT(brkcow || type == F_SOFTLOCK);
+ ASSERT(brkcow || tron || type == F_SOFTLOCK);
/*
* Large page end is mapped beyond the end of file and it's a cow
- * fault or softlock so we can't reduce the map area. For now just
- * demote the segment. This should really only happen if the end of
- * the file changed after the mapping was established since when large
- * page segments are created we make sure they don't extend beyond the
- * end of the file.
+ * fault (can be a text replication induced cow) or softlock so we can't
+ * reduce the map area. For now just demote the segment. This should
+ * really only happen if the end of the file changed after the mapping
+ * was established since when large page segments are created we make
+ * sure they don't extend beyond the end of the file.
*/
SEGVN_VMSTAT_FLTVNPAGES(48);
@@ -4239,6 +4441,7 @@ segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
int first = 1;
int adjszc_chk;
int purged = 0;
+ int pgflags = (svd->tr_state == SEGVN_TR_ON) ? PG_LOCAL : 0;
ASSERT(szc != 0);
ASSERT(amp != NULL);
@@ -4246,6 +4449,7 @@ segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
ASSERT(!(svd->flags & MAP_NORESERVE));
ASSERT(type != F_SOFTUNLOCK);
ASSERT(IS_P2ALIGNED(a, maxpgsz));
+ ASSERT(!brkcow || svd->tr_state == SEGVN_TR_OFF);
ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
@@ -4320,7 +4524,7 @@ segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
ppa_szc = (uint_t)-1;
ierr = anon_map_getpages(amp, aindx, szc, seg, a,
prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow,
- segvn_anypgsz, svd->cred);
+ segvn_anypgsz, pgflags, svd->cred);
if (ierr != 0) {
anon_array_exit(&cookie);
VM_STAT_ADD(segvnvmstats.fltanpages[4]);
@@ -4582,6 +4786,34 @@ segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
return (0);
}
+ if (brkcow == 0) {
+ if (svd->tr_state == SEGVN_TR_INIT) {
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
+ if (svd->tr_state == SEGVN_TR_INIT) {
+ ASSERT(svd->vp != NULL && svd->amp == NULL);
+ ASSERT(svd->flags & MAP_TEXT);
+ ASSERT(svd->type == MAP_PRIVATE);
+ segvn_textrepl(seg);
+ ASSERT(svd->tr_state != SEGVN_TR_INIT);
+ ASSERT(svd->tr_state != SEGVN_TR_ON ||
+ svd->amp != NULL);
+ }
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ }
+ } else if (svd->tr_state != SEGVN_TR_OFF) {
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
+ if (svd->tr_state == SEGVN_TR_ON) {
+ ASSERT(svd->vp != NULL && svd->amp != NULL);
+ segvn_textunrepl(seg, 0);
+ ASSERT(svd->amp == NULL &&
+ svd->tr_state == SEGVN_TR_OFF);
+ } else if (svd->tr_state != SEGVN_TR_OFF) {
+ svd->tr_state = SEGVN_TR_OFF;
+ }
+ ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ }
+
top:
SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
@@ -4692,7 +4924,7 @@ top:
SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
if (svd->amp == NULL) {
- svd->amp = anonmap_alloc(seg->s_size, 0);
+ svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
svd->amp->a_szc = seg->s_szc;
}
SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
@@ -4745,7 +4977,8 @@ top:
if (amp != NULL) {
anon_index = svd->anon_index + page;
- if ((type == F_PROT) && (rw == S_READ) &&
+ if (type == F_PROT && rw == S_READ &&
+ svd->tr_state == SEGVN_TR_OFF &&
svd->type == MAP_PRIVATE && svd->pageprot == 0) {
size_t index = anon_index;
struct anon *ap;
@@ -4789,7 +5022,8 @@ slow:
* are faulting on, free behind all pages in the segment and put
* them on the free list.
*/
- if ((page != 0) && fltadvice) { /* not if first page in segment */
+
+ if ((page != 0) && fltadvice && svd->tr_state != SEGVN_TR_ON) {
struct vpage *vpp;
ulong_t fanon_index;
size_t fpage;
@@ -4939,7 +5173,7 @@ slow:
plp[0] = NULL;
plsz = len;
} else if (rw == S_WRITE && svd->type == MAP_PRIVATE ||
- rw == S_OTHER ||
+ svd->tr_state == SEGVN_TR_ON || rw == S_OTHER ||
(((size_t)(addr + PAGESIZE) <
(size_t)(seg->s_base + seg->s_size)) &&
hat_probe(as->a_hat, addr + PAGESIZE))) {
@@ -5101,8 +5335,9 @@ slow:
if (pp == PAGE_HANDLED)
continue;
- if (pp->p_offset >= svd->offset &&
- (pp->p_offset < svd->offset + seg->s_size)) {
+ if (svd->tr_state != SEGVN_TR_ON &&
+ pp->p_offset >= svd->offset &&
+ pp->p_offset < svd->offset + seg->s_size) {
diff = pp->p_offset - svd->offset;
@@ -5249,6 +5484,7 @@ segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
* protections.
*/
if (svd->softlockcnt > 0) {
+ ASSERT(svd->tr_state == SEGVN_TR_OFF);
/*
* Since we do have the segvn writers lock nobody can fill
* the cache with entries belonging to this seg during
@@ -5262,6 +5498,20 @@ segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
}
}
+ if (svd->tr_state == SEGVN_TR_INIT) {
+ svd->tr_state = SEGVN_TR_OFF;
+ } else if (svd->tr_state == SEGVN_TR_ON) {
+ ASSERT(svd->amp != NULL);
+ segvn_textunrepl(seg, 0);
+ ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
+ }
+
+ if ((prot & PROT_WRITE) && svd->type == MAP_SHARED &&
+ svd->vp != NULL && (svd->vp->v_flag & VVMEXEC)) {
+ ASSERT(vn_is_mapped(svd->vp, V_WRITE));
+ segvn_inval_trcache(svd->vp);
+ }
+
if (seg->s_szc != 0) {
int err;
pgsz = page_get_pagesize(seg->s_szc);
@@ -5583,6 +5833,7 @@ segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
* to recheck protections.
*/
if (svd->softlockcnt > 0) {
+ ASSERT(svd->tr_state == SEGVN_TR_OFF);
/*
* Since we do have the segvn writers lock nobody can fill
* the cache with entries belonging to this seg during
@@ -5595,6 +5846,15 @@ segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
}
}
+ if (svd->tr_state == SEGVN_TR_INIT) {
+ svd->tr_state = SEGVN_TR_OFF;
+ } else if (svd->tr_state == SEGVN_TR_ON) {
+ ASSERT(svd->amp != NULL);
+ segvn_textunrepl(seg, 1);
+ ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
+ amp = NULL;
+ }
+
/*
* Operation for sub range of existing segment.
*/
@@ -5766,6 +6026,7 @@ segvn_clrszc(struct seg *seg)
struct anon *ap, *oldap;
uint_t prot = svd->prot, vpprot;
int pageflag = 0;
+ int unmap = 1;
ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
@@ -5775,13 +6036,25 @@ segvn_clrszc(struct seg *seg)
return (0);
}
- /*
- * do HAT_UNLOAD_UNMAP since we are changing the pagesize.
- * unload argument is 0 when we are freeing the segment
- * and unload was already done.
- */
- hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size,
- HAT_UNLOAD_UNMAP);
+ if (svd->tr_state == SEGVN_TR_INIT) {
+ svd->tr_state = SEGVN_TR_OFF;
+ } else if (svd->tr_state == SEGVN_TR_ON) {
+ ASSERT(svd->amp != NULL);
+ segvn_textunrepl(seg, 1);
+ ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
+ amp = NULL;
+ unmap = 0;
+ }
+
+ if (unmap) {
+ /*
+ * do HAT_UNLOAD_UNMAP since we are changing the pagesize.
+ * unload argument is 0 when we are freeing the segment
+ * and unload was already done.
+ */
+ hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size,
+ HAT_UNLOAD_UNMAP);
+ }
if (amp == NULL || svd->type == MAP_SHARED) {
seg->s_szc = 0;
@@ -5944,6 +6217,8 @@ segvn_split_seg(struct seg *seg, caddr_t addr)
struct segvn_data *nsvd;
ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+ ASSERT(svd->tr_state == SEGVN_TR_OFF);
+
ASSERT(addr >= seg->s_base);
ASSERT(addr <= seg->s_base + seg->s_size);
@@ -5959,6 +6234,7 @@ segvn_split_seg(struct seg *seg, caddr_t addr)
nseg->s_data = (void *)nsvd;
nseg->s_szc = seg->s_szc;
*nsvd = *svd;
+ nsvd->seg = nseg;
rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL);
if (nsvd->vp != NULL) {
@@ -6006,7 +6282,7 @@ segvn_split_seg(struct seg *seg, caddr_t addr)
(void) anon_copy_ptr(oamp->ahp, svd->anon_index,
nahp, 0, btop(seg->s_size), ANON_SLEEP);
- namp = anonmap_alloc(nseg->s_size, 0);
+ namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
namp->a_szc = nseg->s_szc;
(void) anon_copy_ptr(oamp->ahp,
svd->anon_index + btop(seg->s_size),
@@ -6085,6 +6361,7 @@ segvn_demote_range(
uint_t tszcvec;
ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
+ ASSERT(svd->tr_state == SEGVN_TR_OFF);
ASSERT(szc != 0);
pgsz = page_get_pagesize(szc);
ASSERT(seg->s_base != addr || seg->s_size != len);
@@ -6990,6 +7267,8 @@ segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
/* Determine if this segment backs a sysV shm */
if (svd->amp != NULL && svd->amp->a_sp != NULL) {
+ ASSERT(svd->type == MAP_SHARED);
+ ASSERT(svd->tr_state == SEGVN_TR_OFF);
sp = svd->amp->a_sp;
proj = sp->shm_perm.ipc_proj;
chargeproc = 0;
@@ -7015,6 +7294,17 @@ segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
}
}
+ if (op == MC_LOCK) {
+ if (svd->tr_state == SEGVN_TR_INIT) {
+ svd->tr_state = SEGVN_TR_OFF;
+ } else if (svd->tr_state == SEGVN_TR_ON) {
+ ASSERT(svd->amp != NULL);
+ segvn_textunrepl(seg, 0);
+ ASSERT(svd->amp == NULL &&
+ svd->tr_state == SEGVN_TR_OFF);
+ }
+ }
+
/*
* If we're locking, then we must create a vpage structure if
* none exists. If we're unlocking, then check to see if there
@@ -7036,7 +7326,7 @@ segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
* by lazily testing for its existence.
*/
if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) {
- svd->amp = anonmap_alloc(seg->s_size, 0);
+ svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
svd->amp->a_szc = seg->s_szc;
}
@@ -7371,10 +7661,15 @@ segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
* In case of MADV_FREE, we won't be modifying any segment private
* data structures; so, we only need to grab READER's lock
*/
- if (behav != MADV_FREE)
+ if (behav != MADV_FREE) {
SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
- else
+ if (svd->tr_state != SEGVN_TR_OFF) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ return (0);
+ }
+ } else {
SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+ }
/*
* Large pages are assumed to be only turned on when accesses to the
@@ -7433,7 +7728,7 @@ segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
* us to do. As MADV_FREE is advisory, we don't
* return error in either case.
*/
- if (vp || amp == NULL) {
+ if (vp != NULL || amp == NULL) {
SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
return (0);
}
@@ -8368,9 +8663,15 @@ segvn_getpolicy(struct seg *seg, caddr_t addr)
/*
* Get policy info for private or shared memory
*/
- if (svn_data->type != MAP_SHARED)
- policy_info = &svn_data->policy_info;
- else {
+ if (svn_data->type != MAP_SHARED) {
+ if (svn_data->tr_state != SEGVN_TR_ON) {
+ policy_info = &svn_data->policy_info;
+ } else {
+ policy_info = &svn_data->tr_policy_info;
+ ASSERT(policy_info->mem_policy ==
+ LGRP_MEM_POLICY_NEXT_SEG);
+ }
+ } else {
amp = svn_data->amp;
anon_index = svn_data->anon_index + seg_page(seg, addr);
vp = svn_data->vp;
@@ -8387,3 +8688,602 @@ segvn_capable(struct seg *seg, segcapability_t capability)
{
return (0);
}
+
+/*
+ * Bind a text vnode segment to an amp. If we bind successfully, mappings will
+ * be established to per-vnode-mapping, per-lgroup amp pages instead of to
+ * vnode pages. There's one amp per vnode text mapping per lgroup. Many
+ * processes may share the same text replication amp. If a suitable amp
+ * doesn't already exist in the svntr hash table, create a new one. We may
+ * fail to bind to an amp if the segment is not eligible for text replication.
+ * The code below first checks for these conditions. If binding is successful,
+ * the segment's tr_state is set to on and svd->amp points to the amp to use.
+ * Otherwise tr_state is set to off and svd->amp remains NULL.
+ */
+static void
+segvn_textrepl(struct seg *seg)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ vnode_t *vp = svd->vp;
+ u_offset_t off = svd->offset;
+ size_t size = seg->s_size;
+ u_offset_t eoff = off + size;
+ uint_t szc = seg->s_szc;
+ ulong_t hash = SVNTR_HASH_FUNC(vp);
+ svntr_t *svntrp;
+ struct vattr va;
+ proc_t *p = seg->s_as->a_proc;
+ lgrp_id_t lgrp_id;
+ lgrp_id_t olid;
+ int first;
+ struct anon_map *amp;
+
+ ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+ ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
+ ASSERT(p != NULL);
+ ASSERT(svd->tr_state == SEGVN_TR_INIT);
+ ASSERT(svd->flags & MAP_TEXT);
+ ASSERT(svd->type == MAP_PRIVATE);
+ ASSERT(vp != NULL && svd->amp == NULL);
+ ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE));
+ ASSERT(!(svd->flags & MAP_NORESERVE) && svd->swresv == 0);
+ ASSERT(seg->s_as != &kas);
+ ASSERT(off < eoff);
+ ASSERT(svntr_hashtab != NULL);
+
+ /*
+ * If numa optimizations are no longer desired bail out.
+ */
+ if (!lgrp_optimizations()) {
+ svd->tr_state = SEGVN_TR_OFF;
+ return;
+ }
+
+ /*
+ * Avoid creating anon maps with size bigger than the file size.
+ * If VOP_GETATTR() call fails bail out.
+ */
+ va.va_mask = AT_SIZE | AT_MTIME;
+ if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) {
+ svd->tr_state = SEGVN_TR_OFF;
+ SEGVN_TR_ADDSTAT(gaerr);
+ return;
+ }
+ if (btopr(va.va_size) < btopr(eoff)) {
+ svd->tr_state = SEGVN_TR_OFF;
+ SEGVN_TR_ADDSTAT(overmap);
+ return;
+ }
+
+ /*
+ * VVMEXEC may not be set yet if exec() prefaults the text segment. Set
+ * this flag now, before vn_is_mapped(V_WRITE), so that a MAP_SHARED
+ * mapping that checks whether the trcache for this vnode needs to be
+ * invalidated can't miss us.
+ */
+ if (!(vp->v_flag & VVMEXEC)) {
+ mutex_enter(&vp->v_lock);
+ vp->v_flag |= VVMEXEC;
+ mutex_exit(&vp->v_lock);
+ }
+ mutex_enter(&svntr_hashtab[hash].tr_lock);
+ /*
+ * Bail out if potentially MAP_SHARED writable mappings exist to this
+ * vnode. We don't want to use old file contents from existing
+ * replicas if this mapping was established after the original file
+ * was changed.
+ */
+ if (vn_is_mapped(vp, V_WRITE)) {
+ mutex_exit(&svntr_hashtab[hash].tr_lock);
+ svd->tr_state = SEGVN_TR_OFF;
+ SEGVN_TR_ADDSTAT(wrcnt);
+ return;
+ }
+ svntrp = svntr_hashtab[hash].tr_head;
+ for (; svntrp != NULL; svntrp = svntrp->tr_next) {
+ ASSERT(svntrp->tr_refcnt != 0);
+ if (svntrp->tr_vp != vp) {
+ continue;
+ }
+ /*
+ * Bail out if file was changed after this replication entry
+ * was created since we need to use the latest file contents.
+ */
+ if (!svntrp->tr_valid ||
+ svntrp->tr_mtime.tv_sec != va.va_mtime.tv_sec ||
+ svntrp->tr_mtime.tv_nsec != va.va_mtime.tv_nsec) {
+ mutex_exit(&svntr_hashtab[hash].tr_lock);
+ svd->tr_state = SEGVN_TR_OFF;
+ SEGVN_TR_ADDSTAT(stale);
+ return;
+ }
+ /*
+ * If off, eoff and szc match the current segment, we found an
+ * existing entry we can use.
+ */
+ if (svntrp->tr_off == off && svntrp->tr_eoff == eoff &&
+ svntrp->tr_szc == szc) {
+ break;
+ }
+ /*
+ * Don't create entries that are different but overlap in file
+ * offsets, to avoid replicating the same file pages more than
+ * once per lgroup.
+ */
+ if ((off >= svntrp->tr_off && off < svntrp->tr_eoff) ||
+ (eoff > svntrp->tr_off && eoff <= svntrp->tr_eoff)) {
+ mutex_exit(&svntr_hashtab[hash].tr_lock);
+ svd->tr_state = SEGVN_TR_OFF;
+ SEGVN_TR_ADDSTAT(overlap);
+ return;
+ }
+ }
+ /*
+ * If we didn't find existing entry create a new one.
+ */
+ if (svntrp == NULL) {
+ svntrp = kmem_cache_alloc(svntr_cache, KM_NOSLEEP);
+ if (svntrp == NULL) {
+ mutex_exit(&svntr_hashtab[hash].tr_lock);
+ svd->tr_state = SEGVN_TR_OFF;
+ SEGVN_TR_ADDSTAT(nokmem);
+ return;
+ }
+#ifdef DEBUG
+ {
+ lgrp_id_t i;
+ for (i = 0; i < NLGRPS_MAX; i++) {
+ ASSERT(svntrp->tr_amp[i] == NULL);
+ }
+ }
+#endif /* DEBUG */
+ svntrp->tr_vp = vp;
+ svntrp->tr_off = off;
+ svntrp->tr_eoff = eoff;
+ svntrp->tr_szc = szc;
+ svntrp->tr_valid = 1;
+ svntrp->tr_mtime = va.va_mtime;
+ svntrp->tr_refcnt = 0;
+ svntrp->tr_next = svntr_hashtab[hash].tr_head;
+ svntr_hashtab[hash].tr_head = svntrp;
+ }
+ first = 1;
+again:
+ /*
+ * We want to pick a replica with pages on the main thread's (t_tid = 1,
+ * aka T1) lgrp. Currently text replication is only optimized for
+ * workloads that either have all threads of a process on the same
+ * lgrp or execute their large text primarily on the main thread.
+ */
+ lgrp_id = p->p_t1_lgrpid;
+ if (lgrp_id == LGRP_NONE) {
+ /*
+		 * If exec() prefaults text on a non-main thread, use the
+		 * current thread's lgrpid. It will become the main thread
+		 * soon anyway.
+ */
+ lgrp_id = lgrp_home_id(curthread);
+ }
+ /*
+	 * Set p_tr_lgrpid to lgrp_id if it hasn't been set yet. Otherwise
+	 * just set it to NLGRPS_MAX if it's different from the current
+	 * process T1 home lgrp. p_tr_lgrpid is used to detect if the process
+	 * uses text replication and T1's new home is different from the lgrp
+	 * used for text replication. When this happens the asynchronous segvn
+	 * thread rechecks whether segments should change the lgrps used for
+	 * text replication. If we fail to set p_tr_lgrpid with cas32 then set
+	 * it to NLGRPS_MAX without cas if it's not already NLGRPS_MAX and not
+	 * equal to the lgrp_id we want to use. We don't need cas in this case
+	 * because another thread that races in between our non-atomic check
+	 * and set may only change p_tr_lgrpid to NLGRPS_MAX at this point.
+ */
+ ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
+ olid = p->p_tr_lgrpid;
+ if (lgrp_id != olid && olid != NLGRPS_MAX) {
+ lgrp_id_t nlid = (olid == LGRP_NONE) ? lgrp_id : NLGRPS_MAX;
+ if (cas32((uint32_t *)&p->p_tr_lgrpid, olid, nlid) != olid) {
+ olid = p->p_tr_lgrpid;
+ ASSERT(olid != LGRP_NONE);
+ if (olid != lgrp_id && olid != NLGRPS_MAX) {
+ p->p_tr_lgrpid = NLGRPS_MAX;
+ }
+ }
+ ASSERT(p->p_tr_lgrpid != LGRP_NONE);
+ membar_producer();
+ /*
+		 * lgrp_move_thread() won't schedule an async recheck after a
+		 * p->p_t1_lgrpid update unless p->p_tr_lgrpid is set (i.e.
+		 * not LGRP_NONE). Recheck p_t1_lgrpid once now that
+		 * p->p_tr_lgrpid is no longer LGRP_NONE.
+ */
+ if (first && p->p_t1_lgrpid != LGRP_NONE &&
+ p->p_t1_lgrpid != lgrp_id) {
+ first = 0;
+ goto again;
+ }
+ }
+ /*
+	 * If no amp has been created yet for lgrp_id, create a new one as
+	 * long as we can afford the memory for it.
+ */
+ if ((amp = svntrp->tr_amp[lgrp_id]) == NULL) {
+ size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
+ if (trmem > segvn_textrepl_max_bytes) {
+ SEGVN_TR_ADDSTAT(normem);
+ goto fail;
+ }
+ if (anon_try_resv_zone(size, NULL) == 0) {
+ SEGVN_TR_ADDSTAT(noanon);
+ goto fail;
+ }
+ amp = anonmap_alloc(size, size, ANON_NOSLEEP);
+ if (amp == NULL) {
+ anon_unresv_zone(size, NULL);
+ SEGVN_TR_ADDSTAT(nokmem);
+ goto fail;
+ }
+ ASSERT(amp->refcnt == 1);
+ amp->a_szc = szc;
+ svntrp->tr_amp[lgrp_id] = amp;
+ SEGVN_TR_ADDSTAT(newamp);
+ }
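+	/*
+	 * Take a reference on the replication entry and link this segment
+	 * onto its list of sharing segments so that the async segvn thread
+	 * can find it in segvn_trupdate().
+	 */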
+ svntrp->tr_refcnt++;
+ ASSERT(svd->svn_trnext == NULL);
+ ASSERT(svd->svn_trprev == NULL);
+ svd->svn_trnext = svntrp->tr_svnhead;
+ svd->svn_trprev = NULL;
+ if (svntrp->tr_svnhead != NULL) {
+ svntrp->tr_svnhead->svn_trprev = svd;
+ }
+ svntrp->tr_svnhead = svd;
+ ASSERT(amp->a_szc == szc && amp->size == size && amp->swresv == size);
+ ASSERT(amp->refcnt >= 1);
+ svd->amp = amp;
+ svd->anon_index = 0;
+ svd->tr_policy_info.mem_policy = LGRP_MEM_POLICY_NEXT_SEG;
+ svd->tr_policy_info.mem_lgrpid = lgrp_id;
+ svd->tr_state = SEGVN_TR_ON;
+ mutex_exit(&svntr_hashtab[hash].tr_lock);
+ SEGVN_TR_ADDSTAT(repl);
+ return;
+fail:
+ ASSERT(segvn_textrepl_bytes >= size);
+ atomic_add_long(&segvn_textrepl_bytes, -size);
+ ASSERT(svntrp != NULL);
+ ASSERT(svntrp->tr_amp[lgrp_id] == NULL);
+ if (svntrp->tr_refcnt == 0) {
+ ASSERT(svntrp == svntr_hashtab[hash].tr_head);
+ svntr_hashtab[hash].tr_head = svntrp->tr_next;
+ mutex_exit(&svntr_hashtab[hash].tr_lock);
+ kmem_cache_free(svntr_cache, svntrp);
+ } else {
+ mutex_exit(&svntr_hashtab[hash].tr_lock);
+ }
+ svd->tr_state = SEGVN_TR_OFF;
+}
+
+/*
+ * Convert the seg back to a regular vnode mapping seg by unbinding it from
+ * its text replication amp. This routine is most typically called when the
+ * segment is unmapped, but can also be called when the segment no longer
+ * qualifies for text replication (e.g. due to protection changes). If
+ * unload_unmap is set, use the HAT_UNLOAD_UNMAP flag in
+ * hat_unload_callback(). If we are the last user of this svntr entry, free
+ * all its anon maps and remove it from the hash table.
+ */
+static void
+segvn_textunrepl(struct seg *seg, int unload_unmap)
+{
+ struct segvn_data *svd = (struct segvn_data *)seg->s_data;
+ vnode_t *vp = svd->vp;
+ u_offset_t off = svd->offset;
+ size_t size = seg->s_size;
+ u_offset_t eoff = off + size;
+ uint_t szc = seg->s_szc;
+ ulong_t hash = SVNTR_HASH_FUNC(vp);
+ svntr_t *svntrp;
+ svntr_t **prv_svntrp;
+ lgrp_id_t lgrp_id = svd->tr_policy_info.mem_lgrpid;
+ lgrp_id_t i;
+
+ ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
+ ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
+ SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
+ ASSERT(svd->tr_state == SEGVN_TR_ON);
+ ASSERT(svd->amp != NULL);
+ ASSERT(svd->amp->refcnt >= 1);
+ ASSERT(svd->anon_index == 0);
+ ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
+ ASSERT(svntr_hashtab != NULL);
+
+ mutex_enter(&svntr_hashtab[hash].tr_lock);
+ prv_svntrp = &svntr_hashtab[hash].tr_head;
+ for (; (svntrp = *prv_svntrp) != NULL; prv_svntrp = &svntrp->tr_next) {
+ ASSERT(svntrp->tr_refcnt != 0);
+ if (svntrp->tr_vp == vp && svntrp->tr_off == off &&
+ svntrp->tr_eoff == eoff && svntrp->tr_szc == szc) {
+ break;
+ }
+ }
+ if (svntrp == NULL) {
+ panic("segvn_textunrepl: svntr record not found");
+ }
+ if (svntrp->tr_amp[lgrp_id] != svd->amp) {
+ panic("segvn_textunrepl: amp mismatch");
+ }
+ svd->tr_state = SEGVN_TR_OFF;
+ svd->amp = NULL;
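+	/*
+	 * Unlink this segment from the replication entry's list of sharing
+	 * segments.
+	 */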
+ if (svd->svn_trprev == NULL) {
+ ASSERT(svntrp->tr_svnhead == svd);
+ svntrp->tr_svnhead = svd->svn_trnext;
+ if (svntrp->tr_svnhead != NULL) {
+ svntrp->tr_svnhead->svn_trprev = NULL;
+ }
+ svd->svn_trnext = NULL;
+ } else {
+ svd->svn_trprev->svn_trnext = svd->svn_trnext;
+ if (svd->svn_trnext != NULL) {
+ svd->svn_trnext->svn_trprev = svd->svn_trprev;
+ svd->svn_trnext = NULL;
+ }
+ svd->svn_trprev = NULL;
+ }
+ if (--svntrp->tr_refcnt) {
+ mutex_exit(&svntr_hashtab[hash].tr_lock);
+ goto done;
+ }
+ *prv_svntrp = svntrp->tr_next;
+ mutex_exit(&svntr_hashtab[hash].tr_lock);
+ for (i = 0; i < NLGRPS_MAX; i++) {
+ struct anon_map *amp = svntrp->tr_amp[i];
+ if (amp == NULL) {
+ continue;
+ }
+ ASSERT(amp->refcnt == 1);
+ ASSERT(amp->swresv == size);
+ ASSERT(amp->size == size);
+ ASSERT(amp->a_szc == szc);
+ if (amp->a_szc != 0) {
+ anon_free_pages(amp->ahp, 0, size, szc);
+ } else {
+ anon_free(amp->ahp, 0, size);
+ }
+ svntrp->tr_amp[i] = NULL;
+ ASSERT(segvn_textrepl_bytes >= size);
+ atomic_add_long(&segvn_textrepl_bytes, -size);
+ anon_unresv_zone(amp->swresv, NULL);
+ amp->refcnt = 0;
+ anonmap_free(amp);
+ }
+ kmem_cache_free(svntr_cache, svntrp);
+done:
+ hat_unload_callback(seg->s_as->a_hat, seg->s_base, size,
+ unload_unmap ? HAT_UNLOAD_UNMAP : 0, NULL);
+}
+
+/*
+ * This is called when a MAP_SHARED writable mapping is created to a vnode
+ * that is currently used for execution (the VVMEXEC flag is set). In this
+ * case we need to prevent further use of existing replicas.
+ */
+static void
+segvn_inval_trcache(vnode_t *vp)
+{
+ ulong_t hash = SVNTR_HASH_FUNC(vp);
+ svntr_t *svntrp;
+
+ ASSERT(vp->v_flag & VVMEXEC);
+
+ if (svntr_hashtab == NULL) {
+ return;
+ }
+
+ mutex_enter(&svntr_hashtab[hash].tr_lock);
+ svntrp = svntr_hashtab[hash].tr_head;
+ for (; svntrp != NULL; svntrp = svntrp->tr_next) {
+ ASSERT(svntrp->tr_refcnt != 0);
+ if (svntrp->tr_vp == vp && svntrp->tr_valid) {
+ svntrp->tr_valid = 0;
+ }
+ }
+ mutex_exit(&svntr_hashtab[hash].tr_lock);
+}
+
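+/*
+ * Async text replication thread. It arms a periodic segvn_trupdate_wakeup()
+ * timeout, then loops waiting on segvn_trasync_sem; each time the semaphore
+ * is posted it calls segvn_trupdate() to recheck the lgroups used for text
+ * replication.
+ */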
+static void
+segvn_trasync_thread(void)
+{
+ callb_cpr_t cpr_info;
+ kmutex_t cpr_lock; /* just for CPR stuff */
+
+ mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ CALLB_CPR_INIT(&cpr_info, &cpr_lock,
+ callb_generic_cpr, "segvn_async");
+
+ if (segvn_update_textrepl_interval == 0) {
+ segvn_update_textrepl_interval = segvn_update_tr_time * hz;
+ } else {
+ segvn_update_textrepl_interval *= hz;
+ }
+ (void) timeout(segvn_trupdate_wakeup, NULL,
+ segvn_update_textrepl_interval);
+
+ for (;;) {
+ mutex_enter(&cpr_lock);
+ CALLB_CPR_SAFE_BEGIN(&cpr_info);
+ mutex_exit(&cpr_lock);
+ sema_p(&segvn_trasync_sem);
+ mutex_enter(&cpr_lock);
+ CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
+ mutex_exit(&cpr_lock);
+ segvn_trupdate();
+ }
+}
+
+static uint64_t segvn_lgrp_trthr_migrs_snpsht = 0;
+
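+/*
+ * Periodic timeout callback. If threads migrated between lgroups since the
+ * last snapshot (segvn_lgrp_trthr_migrs_snpsht), post segvn_trasync_sem to
+ * wake the async thread, then rearm the timeout unless updates are disabled.
+ */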
+static void
+segvn_trupdate_wakeup(void *dummy)
+{
+ uint64_t cur_lgrp_trthr_migrs = lgrp_get_trthr_migrations();
+
+ if (cur_lgrp_trthr_migrs != segvn_lgrp_trthr_migrs_snpsht) {
+ segvn_lgrp_trthr_migrs_snpsht = cur_lgrp_trthr_migrs;
+ sema_v(&segvn_trasync_sem);
+ }
+
+ if (!segvn_disable_textrepl_update &&
+ segvn_update_textrepl_interval != 0) {
+ (void) timeout(segvn_trupdate_wakeup, dummy,
+ segvn_update_textrepl_interval);
+ }
+}
+
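+/*
+ * Walk all svntr hash buckets and, for every segment attached to a
+ * replication entry, recheck whether its replica should move to a different
+ * lgroup.
+ */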
+static void
+segvn_trupdate(void)
+{
+ ulong_t hash;
+ svntr_t *svntrp;
+ segvn_data_t *svd;
+
+ ASSERT(svntr_hashtab != NULL);
+
+ for (hash = 0; hash < svntr_hashtab_sz; hash++) {
+ mutex_enter(&svntr_hashtab[hash].tr_lock);
+ svntrp = svntr_hashtab[hash].tr_head;
+ for (; svntrp != NULL; svntrp = svntrp->tr_next) {
+ ASSERT(svntrp->tr_refcnt != 0);
+ svd = svntrp->tr_svnhead;
+ for (; svd != NULL; svd = svd->svn_trnext) {
+ segvn_trupdate_seg(svd->seg, svd, svntrp,
+ hash);
+ }
+ }
+ mutex_exit(&svntr_hashtab[hash].tr_lock);
+ }
+}
+
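+/*
+ * Recheck a single segment that uses text replication. If the process T1
+ * home lgroup has changed, switch the segment to the replica amp for the new
+ * lgroup, allocating a new per lgroup amp if one doesn't exist yet.
+ */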
+static void
+segvn_trupdate_seg(struct seg *seg,
+ segvn_data_t *svd,
+ svntr_t *svntrp,
+ ulong_t hash)
+{
+ proc_t *p;
+ lgrp_id_t lgrp_id;
+ struct as *as;
+ size_t size;
+ struct anon_map *amp;
+
+ ASSERT(svd->vp != NULL);
+ ASSERT(svd->vp == svntrp->tr_vp);
+ ASSERT(svd->offset == svntrp->tr_off);
+ ASSERT(svd->offset + seg->s_size == svntrp->tr_eoff);
+ ASSERT(seg != NULL);
+ ASSERT(svd->seg == seg);
+ ASSERT(seg->s_data == (void *)svd);
+ ASSERT(seg->s_szc == svntrp->tr_szc);
+ ASSERT(svd->tr_state == SEGVN_TR_ON);
+ ASSERT(svd->amp != NULL);
+ ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
+ ASSERT(svd->tr_policy_info.mem_lgrpid != LGRP_NONE);
+ ASSERT(svd->tr_policy_info.mem_lgrpid < NLGRPS_MAX);
+ ASSERT(svntrp->tr_amp[svd->tr_policy_info.mem_lgrpid] == svd->amp);
+ ASSERT(svntrp->tr_refcnt != 0);
+ ASSERT(mutex_owned(&svntr_hashtab[hash].tr_lock));
+
+ as = seg->s_as;
+ ASSERT(as != NULL && as != &kas);
+ p = as->a_proc;
+ ASSERT(p != NULL);
+ ASSERT(p->p_tr_lgrpid != LGRP_NONE);
+ lgrp_id = p->p_t1_lgrpid;
+ if (lgrp_id == LGRP_NONE) {
+ return;
+ }
+ ASSERT(lgrp_id < NLGRPS_MAX);
+ if (svd->tr_policy_info.mem_lgrpid == lgrp_id) {
+ return;
+ }
+
+ /*
+	 * Use tryenter locking since we acquire the as/seg locks and the
+	 * svntr hash lock in the reverse order from the synchronous thread.
+ */
+ if (!AS_LOCK_TRYENTER(as, &as->a_lock, RW_READER)) {
+ SEGVN_TR_ADDSTAT(nolock);
+ if (segvn_lgrp_trthr_migrs_snpsht) {
+ segvn_lgrp_trthr_migrs_snpsht = 0;
+ }
+ return;
+ }
+ if (!SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock, RW_WRITER)) {
+ AS_LOCK_EXIT(as, &as->a_lock);
+ SEGVN_TR_ADDSTAT(nolock);
+ if (segvn_lgrp_trthr_migrs_snpsht) {
+ segvn_lgrp_trthr_migrs_snpsht = 0;
+ }
+ return;
+ }
+ size = seg->s_size;
+ if (svntrp->tr_amp[lgrp_id] == NULL) {
+ size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
+ if (trmem > segvn_textrepl_max_bytes) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ atomic_add_long(&segvn_textrepl_bytes, -size);
+ SEGVN_TR_ADDSTAT(normem);
+ return;
+ }
+ if (anon_try_resv_zone(size, NULL) == 0) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ atomic_add_long(&segvn_textrepl_bytes, -size);
+ SEGVN_TR_ADDSTAT(noanon);
+ return;
+ }
+ amp = anonmap_alloc(size, size, KM_NOSLEEP);
+ if (amp == NULL) {
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ atomic_add_long(&segvn_textrepl_bytes, -size);
+ anon_unresv_zone(size, NULL);
+ SEGVN_TR_ADDSTAT(nokmem);
+ return;
+ }
+ ASSERT(amp->refcnt == 1);
+ amp->a_szc = seg->s_szc;
+ svntrp->tr_amp[lgrp_id] = amp;
+ }
+ /*
+	 * We don't strictly need to drop the bucket lock here, but doing so
+	 * gives other threads a chance. svntr and svd can't be unlinked as
+	 * long as the segment lock is held as a writer and the AS lock is
+	 * held as well. After we retake the bucket lock we'll continue from
+	 * where we left off. We'll still be able to reach the end of either
+	 * list since new entries are always added to the beginning of the
+	 * lists.
+ */
+ mutex_exit(&svntr_hashtab[hash].tr_lock);
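+	/*
+	 * Unload the existing translations so that future faults are
+	 * satisfied from the new lgroup's replica amp.
+	 */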
+ hat_unload_callback(as->a_hat, seg->s_base, size, 0, NULL);
+ mutex_enter(&svntr_hashtab[hash].tr_lock);
+
+ ASSERT(svd->tr_state == SEGVN_TR_ON);
+ ASSERT(svd->amp != NULL);
+ ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
+ ASSERT(svd->tr_policy_info.mem_lgrpid != lgrp_id);
+ ASSERT(svd->amp != svntrp->tr_amp[lgrp_id]);
+
+ svd->tr_policy_info.mem_lgrpid = lgrp_id;
+ svd->amp = svntrp->tr_amp[lgrp_id];
+ p->p_tr_lgrpid = NLGRPS_MAX;
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
+ AS_LOCK_EXIT(as, &as->a_lock);
+
+ ASSERT(svntrp->tr_refcnt != 0);
+ ASSERT(svd->vp == svntrp->tr_vp);
+ ASSERT(svd->tr_policy_info.mem_lgrpid == lgrp_id);
+ ASSERT(svd->amp != NULL && svd->amp == svntrp->tr_amp[lgrp_id]);
+ ASSERT(svd->seg == seg);
+ ASSERT(svd->tr_state == SEGVN_TR_ON);
+
+ SEGVN_TR_ADDSTAT(asyncrepl);
+}
diff --git a/usr/src/uts/common/vm/seg_vn.h b/usr/src/uts/common/vm/seg_vn.h
index 1ef18ee142..d8c8be8ff4 100644
--- a/usr/src/uts/common/vm/seg_vn.h
+++ b/usr/src/uts/common/vm/seg_vn.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -103,16 +103,29 @@ typedef struct segvn_data {
ushort_t flags; /* flags - from sys/mman.h */
ssize_t softlockcnt; /* # of pages SOFTLOCKED in seg */
lgrp_mem_policy_info_t policy_info; /* memory allocation policy */
+ lgrp_mem_policy_info_t tr_policy_info; /* memory allocation for TR */
+ struct seg *seg; /* pointer back to seg */
+ struct segvn_data *svn_trnext; /* textrepl list next link */
+ struct segvn_data *svn_trprev; /* textrepl list prev link */
+ int tr_state; /* TR (text replication) state */
} segvn_data_t;
#ifdef _KERNEL
/*
+ * segment text replication states.
+ */
+#define SEGVN_TR_INIT (0) /* Check if text replication can be enabled */
+#define SEGVN_TR_ON (1) /* Text replication is enabled */
+#define SEGVN_TR_OFF (2) /* Text replication is disabled */
+
+/*
* Macros for segvn segment driver locking.
*/
#define SEGVN_LOCK_ENTER(as, lock, type) rw_enter((lock), (type))
#define SEGVN_LOCK_EXIT(as, lock) rw_exit((lock))
#define SEGVN_LOCK_DOWNGRADE(as, lock) rw_downgrade((lock))
+#define SEGVN_LOCK_TRYENTER(as, lock, type) rw_tryenter((lock), (type))
/*
* Macros to test lock states.
@@ -151,6 +164,50 @@ typedef struct segvn_data {
((struct segvn_crargs *)(argsp))->szc == AS_MAP_STACK) && \
((struct segvn_crargs *)(argsp))->vp == NULL)
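+/*
+ * Hash a text file vnode pointer into an svntr_hashtab bucket.
+ */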
+#define SVNTR_HASH_FUNC(vp) (((((uintptr_t)(vp)) >> 4) ^ \
+ (((uintptr_t)(vp)) >> 11)) & \
+ (svntr_hashtab_sz - 1))
+
+#define SEGVN_TR_ADDSTAT(stat) \
+ segvn_textrepl_stats[CPU->cpu_id].tr_stat_##stat++
+
+/*
+ * A hash table entry, looked up by vnode, off/eoff and szc, used to find the
+ * anon map to use for text replication based on the main thread's
+ * (t_tid = 1) lgrp.
+ */
+typedef struct svntr {
+ struct vnode *tr_vp; /* text file vnode */
+ u_offset_t tr_off; /* tr_vp mapping start offset */
+ size_t tr_eoff; /* tr_vp mapping end offset */
+ uint_t tr_szc; /* tr_vp mapping pagesize */
+ int tr_valid; /* entry validity state */
+ struct svntr *tr_next; /* next svntr in this hash bucket */
+ timestruc_t tr_mtime; /* tr_vp modification time */
+ ulong_t tr_refcnt; /* number of segs sharing this entry */
+ segvn_data_t *tr_svnhead; /* list of segs sharing this entry */
+ struct anon_map *tr_amp[NLGRPS_MAX]; /* per lgrp anon maps */
+} svntr_t;
+
+typedef struct svntr_bucket {
+ svntr_t *tr_head; /* first svntr in this hash bucket */
+ kmutex_t tr_lock; /* per bucket lock */
+} svntr_bucket_t;
+
+typedef struct svntr_stats {
+ ulong_t tr_stat_gaerr; /* VOP_GETATTR() failures */
+	ulong_t tr_stat_overmap;	/* no TR due to mappings beyond EOF */
+	ulong_t tr_stat_wrcnt;	/* no TR due to writable mappings */
+ ulong_t tr_stat_stale; /* TR entry is stale */
+ ulong_t tr_stat_overlap; /* overlap with other mappings */
+ ulong_t tr_stat_nokmem; /* no TR due to kmem alloc failures */
+ ulong_t tr_stat_noanon; /* no TR due to no swap space */
+ ulong_t tr_stat_normem; /* no TR due to no repl memory */
+ ulong_t tr_stat_nolock; /* async TR failure due to locks */
+ ulong_t tr_stat_asyncrepl; /* number of async TRs */
+ ulong_t tr_stat_repl; /* number of sync TRs */
+ ulong_t tr_stat_newamp; /* number of new amp allocs for TR */
+} svntr_stats_t;
+
extern void segvn_init(void);
extern int segvn_create(struct seg *, void *);
diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c
index 2d90f568ad..bac665d20a 100644
--- a/usr/src/uts/common/vm/vm_anon.c
+++ b/usr/src/uts/common/vm/vm_anon.c
@@ -730,7 +730,7 @@ set_anoninfo(void)
* Return non-zero on success.
*/
int
-anon_resvmem(size_t size, boolean_t takemem, zone_t *zone)
+anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard)
{
pgcnt_t npages = btopr(size);
pgcnt_t mswap_pages = 0;
@@ -803,10 +803,12 @@ anon_resvmem(size_t size, boolean_t takemem, zone_t *zone)
* swapfs_reserve is minimum of 4Mb or 1/16 of physmem.
*
*/
- mutex_exit(&anoninfo_lock);
- (void) page_reclaim_mem(mswap_pages,
- swapfs_minfree + swapfs_reserve, 0);
- mutex_enter(&anoninfo_lock);
+ if (tryhard) {
+ mutex_exit(&anoninfo_lock);
+ (void) page_reclaim_mem(mswap_pages,
+ swapfs_minfree + swapfs_reserve, 0);
+ mutex_enter(&anoninfo_lock);
+ }
mutex_enter(&freemem_lock);
if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) ||
@@ -1813,6 +1815,7 @@ anon_map_getpages(
enum seg_rw rw,
int brkcow,
int anypgsz,
+ int pgflags,
struct cred *cred)
{
pgcnt_t pgcnt;
@@ -1906,7 +1909,7 @@ top:
if (prealloc) {
ASSERT(conpp == NULL);
if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa,
- szc, 0) != 0) {
+ szc, 0, pgflags) != 0) {
VM_STAT_ADD(anonvmstats.getpages[7]);
if (brkcow == 0 ||
!anon_share(amp->ahp, start_idx, pgcnt)) {
@@ -1962,7 +1965,7 @@ top:
VM_STAT_ADD(anonvmstats.getpages[9]);
*protp = PROT_ALL;
return (anon_map_privatepages(amp, start_idx, szc, seg,
- addr, prot, ppa, vpage, anypgsz, cred));
+ addr, prot, ppa, vpage, anypgsz, pgflags, cred));
}
}
@@ -2144,7 +2147,7 @@ top:
*protp = PROT_ALL;
return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
- ppa, vpage, anypgsz, cred));
+ ppa, vpage, anypgsz, pgflags, cred));
io_err:
/*
* We got an IO error somewhere in our large page.
@@ -2376,6 +2379,7 @@ anon_map_privatepages(
page_t *ppa[],
struct vpage vpage[],
int anypgsz,
+ int pgflags,
struct cred *cred)
{
pgcnt_t pgcnt;
@@ -2420,7 +2424,7 @@ anon_map_privatepages(
VM_STAT_ADD(anonvmstats.privatepages[2]);
prealloc = 0;
} else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
- anypgsz) != 0) {
+ anypgsz, pgflags) != 0) {
VM_STAT_ADD(anonvmstats.privatepages[3]);
prealloc = 0;
}
@@ -3076,7 +3080,7 @@ top:
}
err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa,
- vpage, -1, cred);
+ vpage, -1, 0, cred);
if (err > 0) {
VM_STAT_ADD(anonvmstats.demotepages[5]);
kmem_free(ppa, ppasize);
@@ -3180,16 +3184,25 @@ anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len)
* associating the given swap reservation with the new anon_map.
*/
struct anon_map *
-anonmap_alloc(size_t size, size_t swresv)
+anonmap_alloc(size_t size, size_t swresv, int flags)
{
struct anon_map *amp;
+ int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
- amp = kmem_cache_alloc(anonmap_cache, KM_SLEEP);
+ amp = kmem_cache_alloc(anonmap_cache, kmflags);
+ if (amp == NULL) {
+ ASSERT(kmflags == KM_NOSLEEP);
+ return (NULL);
+ }
+ amp->ahp = anon_create(btopr(size), flags);
+ if (amp->ahp == NULL) {
+ ASSERT(flags == ANON_NOSLEEP);
+ kmem_cache_free(anonmap_cache, amp);
+ return (NULL);
+ }
amp->refcnt = 1;
amp->size = size;
-
- amp->ahp = anon_create(btopr(size), ANON_SLEEP);
amp->swresv = swresv;
amp->locality = 0;
amp->a_szc = 0;
diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c
index 9a43937729..e28e2aaa4c 100644
--- a/usr/src/uts/common/vm/vm_as.c
+++ b/usr/src/uts/common/vm/vm_as.c
@@ -1575,6 +1575,7 @@ as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
struct vattr va;
u_offset_t eoff;
size_t save_size = 0;
+ extern size_t textrepl_size_thresh;
ASSERT(AS_WRITE_HELD(as, &as->a_lock));
ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
@@ -1621,6 +1622,9 @@ again:
}
}
+ if (size > textrepl_size_thresh) {
+ vn_a->flags |= _MAP_TEXTREPL;
+ }
error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
segcreated);
if (error != 0) {
diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c
index 489f091acf..ab7581fb36 100644
--- a/usr/src/uts/common/vm/vm_page.c
+++ b/usr/src/uts/common/vm/vm_page.c
@@ -1972,7 +1972,7 @@ uint32_t pg_alloc_pgs_mtbf = 0;
*/
int
page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr,
- page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz)
+ page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz, int pgflags)
{
pgcnt_t npgs, curnpgs, totpgs;
size_t pgsz;
@@ -1981,6 +1981,7 @@ page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr,
lgrp_t *lgrp;
ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
+ ASSERT(pgflags == 0 || pgflags == PG_LOCAL);
VM_STAT_ADD(alloc_pages[0]);
@@ -2005,7 +2006,17 @@ page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr,
while (npgs && szc) {
lgrp = lgrp_mem_choose(seg, addr, pgsz);
- pp = page_get_freelist(vp, 0, seg, addr, pgsz, 0, lgrp);
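+		/*
+		 * With PG_LOCAL first try to allocate from the local lgroup
+		 * freelists only; if that fails retry without the flag so
+		 * that remote memory can be used.
+		 */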
+ if (pgflags == PG_LOCAL) {
+ pp = page_get_freelist(vp, 0, seg, addr, pgsz,
+ pgflags, lgrp);
+ if (pp == NULL) {
+ pp = page_get_freelist(vp, 0, seg, addr, pgsz,
+ 0, lgrp);
+ }
+ } else {
+ pp = page_get_freelist(vp, 0, seg, addr, pgsz,
+ 0, lgrp);
+ }
if (pp != NULL) {
VM_STAT_ADD(alloc_pages[1]);
page_list_concat(&pplist, &pp);
diff --git a/usr/src/uts/common/vm/vm_pagelist.c b/usr/src/uts/common/vm/vm_pagelist.c
index 2eeb9485da..cef95452bf 100644
--- a/usr/src/uts/common/vm/vm_pagelist.c
+++ b/usr/src/uts/common/vm/vm_pagelist.c
@@ -3555,25 +3555,26 @@ pgretry:
lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
- /*
- * Try to get a non-local freelist page.
- */
- LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
- while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
- pp = page_get_func(mnode, bin, mtype, szc, flags);
- if (pp != NULL) {
- DTRACE_PROBE4(page__get,
- lgrp_t *, lgrp,
- int, mnode,
- ulong_t, bin,
- uint_t, flags);
- VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
- return (pp);
+ if (!(flags & PG_LOCAL)) {
+ /*
+ * Try to get a non-local freelist page.
+ */
+ LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
+ while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
+ pp = page_get_func(mnode, bin, mtype, szc, flags);
+ if (pp != NULL) {
+ DTRACE_PROBE4(page__get,
+ lgrp_t *, lgrp,
+ int, mnode,
+ ulong_t, bin,
+ uint_t, flags);
+ VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
+ return (pp);
+ }
}
+ ASSERT(pp == NULL);
}
- ASSERT(pp == NULL);
-
/*
* when the cage is off chances are page_get_contig_pages() will fail
* to lock a large page chunk therefore when the cage is off it's not
@@ -3591,7 +3592,8 @@ pgretry:
goto pgretry;
}
- if (pgcplimitsearch && page_get_func == page_get_contig_pages)
+ if (!(flags & PG_LOCAL) && pgcplimitsearch &&
+ page_get_func == page_get_contig_pages)
SETPGCPFAILCNT(szc);
VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
index f25cf4c9ae..e10b0aede9 100644
--- a/usr/src/uts/common/vm/vm_usage.c
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -1110,11 +1110,20 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
if (svd->amp->swresv == 0)
incore = B_TRUE;
}
- if (svd->amp != NULL && svd->type == MAP_PRIVATE) {
+ SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
+ /*
+		 * Text replication anon maps can be shared across all zones.
+		 * Space used for text replication is typically capped at a
+		 * small % of memory. To keep it simple, for now we don't
+		 * account for the swap and memory space used for text
+		 * replication.
+ */
+ if (svd->tr_state == SEGVN_TR_OFF && svd->amp != NULL &&
+ svd->type == MAP_PRIVATE) {
private_amp = svd->amp;
p_start = svd->anon_index;
p_end = svd->anon_index + btop(seg->s_size) - 1;
}
+ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
} else if (seg->s_ops == &segspt_shmops) {
shared = B_TRUE;
shmd = (struct shm_data *)seg->s_data;
diff --git a/usr/src/uts/i86pc/os/mlsetup.c b/usr/src/uts/i86pc/os/mlsetup.c
index f946ed2f5a..5232cc4b93 100644
--- a/usr/src/uts/i86pc/os/mlsetup.c
+++ b/usr/src/uts/i86pc/os/mlsetup.c
@@ -215,6 +215,8 @@ mlsetup(struct regs *rp)
p0.p_as = &kas;
p0.p_lockp = &p0lock;
p0.p_brkpageszc = 0;
+ p0.p_t1_lgrpid = LGRP_NONE;
+ p0.p_tr_lgrpid = LGRP_NONE;
sigorset(&p0.p_ignore, &ignoredefault);
CPU->cpu_thread = &t0;
diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c
index 1e0dae7183..9cdaa19dfb 100644
--- a/usr/src/uts/i86pc/os/startup.c
+++ b/usr/src/uts/i86pc/os/startup.c
@@ -508,6 +508,8 @@ int l2cache_sz = 0x80000;
int l2cache_linesz = 0x40;
int l2cache_assoc = 1;
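+/*
+ * Minimum amount of physical memory (in GB) required before the text
+ * replication size threshold is adjusted in startup_memlist().
+ */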
+static size_t textrepl_min_gb = 10;
+
/*
* on 64 bit we use a predifined VA range for mapping devices in the kernel
* on 32 bit the mappings are intermixed in the heap, so we use a bit map
@@ -832,6 +834,7 @@ startup_memlist(void)
caddr_t page_ctrs_mem;
size_t page_ctrs_size;
struct memlist *current;
+ extern size_t textrepl_size_thresh;
extern void startup_build_mem_nodes(struct memlist *);
/* XX64 fix these - they should be in include files */
@@ -1076,6 +1079,11 @@ startup_memlist(void)
PRM_POINT("startup_memlist() done");
PRM_DEBUG(valloc_sz);
+
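+	/*
+	 * On machines with at least textrepl_min_gb of memory and L2 caches
+	 * no larger than 2MB, set the text replication size threshold so
+	 * that text mappings larger than ~16MB are replicated (see
+	 * as_map_vnsegs()).
+	 */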
+ if ((availrmem >> (30 - MMU_PAGESHIFT)) >= textrepl_min_gb &&
+ l2cache_sz <= 2 << 20) {
+ textrepl_size_thresh = (16 << 20) - 1;
+ }
}
/*
diff --git a/usr/src/uts/sun4/os/mlsetup.c b/usr/src/uts/sun4/os/mlsetup.c
index 227127092c..af8ab0ac17 100644
--- a/usr/src/uts/sun4/os/mlsetup.c
+++ b/usr/src/uts/sun4/os/mlsetup.c
@@ -178,6 +178,8 @@ mlsetup(struct regs *rp, void *cif, kfpu_t *fp)
p0.p_lockp = &p0lock;
p0.p_utraps = NULL;
p0.p_brkpageszc = 0;
+ p0.p_t1_lgrpid = LGRP_NONE;
+ p0.p_tr_lgrpid = LGRP_NONE;
sigorset(&p0.p_ignore, &ignoredefault);
CPU->cpu_thread = &t0;