Diffstat (limited to 'usr/src/uts/common')
-rw-r--r--  usr/src/uts/common/Makefile.files | 9
-rw-r--r--  usr/src/uts/common/Makefile.rules | 22
-rw-r--r--  usr/src/uts/common/conf/param.c | 12
-rw-r--r--  usr/src/uts/common/crypto/api/kcf_random.c | 4
-rw-r--r--  usr/src/uts/common/crypto/core/kcf_sched.c | 6
-rw-r--r--  usr/src/uts/common/ctf/mapfile | 1
-rw-r--r--  usr/src/uts/common/disp/cpucaps.c | 229
-rw-r--r--  usr/src/uts/common/disp/disp.c | 24
-rw-r--r--  usr/src/uts/common/disp/fss.c | 285
-rw-r--r--  usr/src/uts/common/disp/thread.c | 16
-rw-r--r--  usr/src/uts/common/dtrace/dtrace.c | 46
-rw-r--r--  usr/src/uts/common/fs/dev/sdev_vnops.c | 18
-rw-r--r--  usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c | 640
-rw-r--r--  usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c | 154
-rw-r--r--  usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c | 625
-rw-r--r--  usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c | 1375
-rw-r--r--  usr/src/uts/common/fs/lxproc/lxpr_subr.c | 515
-rw-r--r--  usr/src/uts/common/fs/lxproc/lxpr_vfsops.c | 367
-rw-r--r--  usr/src/uts/common/fs/lxproc/lxpr_vnops.c | 3077
-rw-r--r--  usr/src/uts/common/fs/lxproc/lxproc.h | 275
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs_server.c | 4
-rw-r--r--  usr/src/uts/common/fs/proc/prcontrol.c | 6
-rw-r--r--  usr/src/uts/common/fs/proc/prdata.h | 5
-rw-r--r--  usr/src/uts/common/fs/proc/prvnops.c | 13
-rw-r--r--  usr/src/uts/common/fs/swapfs/swap_subr.c | 6
-rw-r--r--  usr/src/uts/common/fs/tmpfs/tmp_vfsops.c | 20
-rw-r--r--  usr/src/uts/common/fs/vfs.c | 2
-rw-r--r--  usr/src/uts/common/fs/vnode.c | 110
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c | 23
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c | 3
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c | 4
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_tx.c | 4
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dataset.c | 14
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dir.c | 4
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_pool.c | 7
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_disk.h | 13
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_raidz.h | 49
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_zone.h | 62
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio.h | 6
-rw-r--r--  usr/src/uts/common/fs/zfs/txg.c | 3
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_disk.c | 22
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_queue.c | 10
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_raidz.c | 131
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c | 58
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c | 31
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_zone.c | 1179
-rw-r--r--  usr/src/uts/common/fs/zfs/zio.c | 11
-rw-r--r--  usr/src/uts/common/fs/zfs/zio_checksum.c | 3
-rw-r--r--  usr/src/uts/common/fs/zfs/zvol.c | 62
-rw-r--r--  usr/src/uts/common/inet/ip.h | 12
-rw-r--r--  usr/src/uts/common/inet/ip/ip_attr.c | 112
-rw-r--r--  usr/src/uts/common/inet/ip/ip_dce.c | 123
-rw-r--r--  usr/src/uts/common/inet/ip/ip_tunables.c | 6
-rw-r--r--  usr/src/uts/common/inet/ip/ipsecesp.c | 3
-rw-r--r--  usr/src/uts/common/inet/ip_stack.h | 1
-rw-r--r--  usr/src/uts/common/inet/ipd/ipd.c | 1226
-rw-r--r--  usr/src/uts/common/inet/ipd/ipd.conf | 27
-rw-r--r--  usr/src/uts/common/inet/ipf/ip_fil_solaris.c | 8
-rw-r--r--  usr/src/uts/common/inet/squeue.c | 24
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp.c | 6
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_stats.c | 20
-rw-r--r--  usr/src/uts/common/inet/tcp_stack.h | 3
-rw-r--r--  usr/src/uts/common/io/dld/dld_drv.c | 6
-rw-r--r--  usr/src/uts/common/io/dls/dls.c | 20
-rw-r--r--  usr/src/uts/common/io/dls/dls_mgmt.c | 156
-rw-r--r--  usr/src/uts/common/io/dls/dls_stat.c | 172
-rw-r--r--  usr/src/uts/common/io/fibre-channel/fca/fcoei/fcoei_eth.c | 2
-rw-r--r--  usr/src/uts/common/io/fibre-channel/fca/fcoei/fcoei_lv.c | 4
-rw-r--r--  usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c | 3
-rw-r--r--  usr/src/uts/common/io/ixgbe/ixgbe_82599.c | 6
-rw-r--r--  usr/src/uts/common/io/ixgbe/ixgbe_api.c | 6
-rw-r--r--  usr/src/uts/common/io/ixgbe/ixgbe_type.h | 46
-rw-r--r--  usr/src/uts/common/io/mac/mac_client.c | 7
-rw-r--r--  usr/src/uts/common/io/mac/mac_sched.c | 102
-rw-r--r--  usr/src/uts/common/io/scsi/targets/sd.c | 10
-rw-r--r--  usr/src/uts/common/os/bio.c | 10
-rw-r--r--  usr/src/uts/common/os/clock.c | 6
-rw-r--r--  usr/src/uts/common/os/clock_highres.c | 36
-rw-r--r--  usr/src/uts/common/os/contract.c | 50
-rw-r--r--  usr/src/uts/common/os/core.c | 8
-rw-r--r--  usr/src/uts/common/os/cred.c | 8
-rw-r--r--  usr/src/uts/common/os/cyclic.c | 6
-rw-r--r--  usr/src/uts/common/os/exit.c | 15
-rw-r--r--  usr/src/uts/common/os/kstat_fr.c | 18
-rw-r--r--  usr/src/uts/common/os/logsubr.c | 3
-rw-r--r--  usr/src/uts/common/os/msacct.c | 36
-rw-r--r--  usr/src/uts/common/os/policy.c | 10
-rw-r--r--  usr/src/uts/common/os/priv_defs | 4
-rw-r--r--  usr/src/uts/common/os/vmem.c | 2
-rw-r--r--  usr/src/uts/common/os/zone.c | 789
-rw-r--r--  usr/src/uts/common/sys/Makefile | 4
-rw-r--r--  usr/src/uts/common/sys/buf.h | 8
-rw-r--r--  usr/src/uts/common/sys/cpucaps.h | 5
-rw-r--r--  usr/src/uts/common/sys/cpucaps_impl.h | 6
-rw-r--r--  usr/src/uts/common/sys/cred.h | 1
-rw-r--r--  usr/src/uts/common/sys/ctf_api.h | 1
-rw-r--r--  usr/src/uts/common/sys/dktp/dadk.h | 2
-rw-r--r--  usr/src/uts/common/sys/dld.h | 3
-rw-r--r--  usr/src/uts/common/sys/dls.h | 5
-rw-r--r--  usr/src/uts/common/sys/dls_impl.h | 4
-rw-r--r--  usr/src/uts/common/sys/dls_mgmt.h | 2
-rw-r--r--  usr/src/uts/common/sys/dtrace.h | 3
-rw-r--r--  usr/src/uts/common/sys/fs/hyprlofs.h | 91
-rw-r--r--  usr/src/uts/common/sys/fs/hyprlofs_info.h | 189
-rw-r--r--  usr/src/uts/common/sys/fss.h | 4
-rw-r--r--  usr/src/uts/common/sys/ipd.h | 82
-rw-r--r--  usr/src/uts/common/sys/ipmi.h | 176
-rw-r--r--  usr/src/uts/common/sys/mman.h | 2
-rw-r--r--  usr/src/uts/common/sys/mntent.h | 2
-rw-r--r--  usr/src/uts/common/sys/policy.h | 2
-rw-r--r--  usr/src/uts/common/sys/procfs.h | 4
-rw-r--r--  usr/src/uts/common/sys/thread.h | 2
-rw-r--r--  usr/src/uts/common/sys/uadmin.h | 3
-rw-r--r--  usr/src/uts/common/sys/vm_usage.h | 4
-rw-r--r--  usr/src/uts/common/sys/zone.h | 180
-rw-r--r--  usr/src/uts/common/syscall/getloadavg.c | 14
-rw-r--r--  usr/src/uts/common/syscall/memcntl.c | 9
-rw-r--r--  usr/src/uts/common/syscall/sysconfig.c | 24
-rw-r--r--  usr/src/uts/common/syscall/uadmin.c | 7
-rw-r--r--  usr/src/uts/common/vm/hat.h | 2
-rw-r--r--  usr/src/uts/common/vm/seg_vn.c | 12
-rw-r--r--  usr/src/uts/common/vm/vm_anon.c | 2
-rw-r--r--  usr/src/uts/common/vm/vm_as.c | 20
-rw-r--r--  usr/src/uts/common/vm/vm_pvn.c | 28
-rw-r--r--  usr/src/uts/common/vm/vm_usage.c | 198
126 files changed, 13256 insertions, 523 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 13770698ee..a76edeca13 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -25,6 +25,7 @@
#
# Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+# Copyright (c) 2012 Joyent, Inc. All rights reserved.
#
#
@@ -1146,8 +1147,13 @@ PIPE_OBJS += pipe.o
HSFS_OBJS += hsfs_node.o hsfs_subr.o hsfs_vfsops.o hsfs_vnops.o \
hsfs_susp.o hsfs_rrip.o hsfs_susp_subr.o
+HYPRLOFS_OBJS += hyprlofs_dir.o hyprlofs_subr.o \
+ hyprlofs_vnops.o hyprlofs_vfsops.o
+
LOFS_OBJS += lofs_subr.o lofs_vfsops.o lofs_vnops.o
+LXPROC_OBJS += lxpr_subr.o lxpr_vfsops.o lxpr_vnops.o
+
NAMEFS_OBJS += namevfs.o namevno.o
NFS_OBJS += nfs_client.o nfs_common.o nfs_dump.o \
@@ -1384,6 +1390,7 @@ ZFS_COMMON_OBJS += \
zfs_fuid.o \
zfs_sa.o \
zfs_znode.o \
+ zfs_zone.o \
zil.o \
zio.o \
zio_checksum.o \
@@ -1737,6 +1744,8 @@ IPF_OBJS += ip_fil_solaris.o fil.o solaris.o ip_state.o ip_frag.o ip_nat.o \
ip_proxy.o ip_auth.o ip_pool.o ip_htable.o ip_lookup.o \
ip_log.o misc.o ip_compat.o ip_nat6.o drand48.o
+IPD_OBJS += ipd.o
+
IBD_OBJS += ibd.o ibd_cm.o
EIBNX_OBJS += enx_main.o enx_hdlrs.o enx_ibt.o enx_log.o enx_fip.o \
diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules
index 2b379caa7b..efc0988007 100644
--- a/usr/src/uts/common/Makefile.rules
+++ b/usr/src/uts/common/Makefile.rules
@@ -25,6 +25,7 @@
#
# Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+# Copyright (c) 2012 Joyent, Inc. All rights reserved.
#
#
@@ -245,10 +246,18 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hsfs/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hyprlofs/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lofs/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lxproc/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/mntfs/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -508,6 +517,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ipf/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ipd/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(COMMONBASE)/net/patricia/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -1739,9 +1752,15 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/fifofs/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/hsfs/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/hyprlofs/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/lofs/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/lxproc/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/mntfs/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
@@ -1886,6 +1905,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipnet/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/iptun/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipd/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipf/%.c
@($(LHEAD) $(LINT.c) $(IPFFLAGS) $< $(LTAIL))
diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c
index d72cfb0b8f..06e7810f07 100644
--- a/usr/src/uts/common/conf/param.c
+++ b/usr/src/uts/common/conf/param.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2012 Milan Jurik. All rights reserved.
*/
@@ -565,8 +566,8 @@ char *isa_list = architecture;
static pgcnt_t original_physmem = 0;
#define MIN_DEFAULT_MAXUSERS 8u
-#define MAX_DEFAULT_MAXUSERS 2048u
-#define MAX_MAXUSERS 4096u
+#define MAX_DEFAULT_MAXUSERS 10000u
+#define MAX_MAXUSERS 20000u
void
param_preset(void)
@@ -578,7 +579,7 @@ void
param_calc(int platform_max_nprocs)
{
/*
- * Default to about one "user" per megabyte, taking into
+ * Default to about one "user" per 8MB, taking into
* account both physical and virtual constraints.
* Note: 2^20 is a meg; shifting right by (20 - PAGESHIFT)
* converts pages to megs without integer overflow.
@@ -592,8 +593,9 @@ param_calc(int platform_max_nprocs)
if (maxusers == 0) {
pgcnt_t physmegs = physmem >> (20 - PAGESHIFT);
pgcnt_t virtmegs = vmem_size(heap_arena, VMEM_FREE) >> 20;
- maxusers = MIN(MAX(MIN(physmegs, virtmegs),
- MIN_DEFAULT_MAXUSERS), MAX_DEFAULT_MAXUSERS);
+ maxusers = MIN(physmegs, virtmegs) >> 3; /* divide by 8 */
+ maxusers = MAX(maxusers, MIN_DEFAULT_MAXUSERS);
+ maxusers = MIN(maxusers, MAX_DEFAULT_MAXUSERS);
}
if (maxusers > MAX_MAXUSERS) {
maxusers = MAX_MAXUSERS;
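
A worked example of the new default (illustrative numbers only): on a machine with 32 GB of physical memory where kernel heap is not the constraint, physmegs is 32768, so maxusers now defaults to 32768 / 8 = 4096, where the old formula would have clamped it at MAX_DEFAULT_MAXUSERS (2048). An explicitly tuned maxusers is still honored, up to the new MAX_MAXUSERS of 20000.
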
diff --git a/usr/src/uts/common/crypto/api/kcf_random.c b/usr/src/uts/common/crypto/api/kcf_random.c
index efaf5c37d1..a11098326b 100644
--- a/usr/src/uts/common/crypto/api/kcf_random.c
+++ b/usr/src/uts/common/crypto/api/kcf_random.c
@@ -71,6 +71,7 @@
#include <sys/cpuvar.h>
#include <sys/taskq.h>
#include <rng/fips_random.h>
+#include <sys/strlog.h>
#define RNDPOOLSIZE 1024 /* Pool size in bytes */
#define MINEXTRACTBYTES 20
@@ -900,7 +901,8 @@ rnd_handler(void *arg)
int len = 0;
if (!rng_prov_found && rng_ok_to_log) {
- cmn_err(CE_WARN, "No randomness provider enabled for "
+ (void) strlog(0, 0, 0, SL_NOTE,
+ "No randomness provider enabled for "
"/dev/random. Use cryptoadm(1M) to enable a provider.");
rng_ok_to_log = B_FALSE;
}
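
A note on the design choice here: cmn_err(CE_WARN) prints to the console as well as the system message log, while strlog(9F) with SL_NOTE sends the message only to the logging facility, so a missing randomness provider no longer spams the console.
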
diff --git a/usr/src/uts/common/crypto/core/kcf_sched.c b/usr/src/uts/common/crypto/core/kcf_sched.c
index f461fe048c..8b2760b237 100644
--- a/usr/src/uts/common/crypto/core/kcf_sched.c
+++ b/usr/src/uts/common/crypto/core/kcf_sched.c
@@ -1027,9 +1027,9 @@ kcfpool_svc(void *arg)
case 0:
case -1:
/*
- * Woke up with no work to do. Check
- * if this thread should exit. We keep
- * at least kcf_minthreads.
+ * Woke up with no work to do. Check if we
+ * should lwp_exit() (which won't return). We
+ * keep at least kcf_minthreads.
*/
if (kcfpool->kp_threads > kcf_minthreads) {
KCF_ATOMIC_DECR(kcfpool->kp_threads);
diff --git a/usr/src/uts/common/ctf/mapfile b/usr/src/uts/common/ctf/mapfile
index ca83165803..40f300686e 100644
--- a/usr/src/uts/common/ctf/mapfile
+++ b/usr/src/uts/common/ctf/mapfile
@@ -82,6 +82,7 @@ SYMBOL_SCOPE {
ctf_open;
ctf_parent_file;
ctf_parent_name;
+ ctf_parent_label;
ctf_setmodel;
ctf_setspecific;
ctf_set_array;
diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c
index 46f53faab6..26067235a9 100644
--- a/usr/src/uts/common/disp/cpucaps.c
+++ b/usr/src/uts/common/disp/cpucaps.c
@@ -22,6 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011, 2012 Joyent, Inc. All rights reserved.
*/
#include <sys/disp.h>
@@ -74,6 +75,32 @@
* Putting threads on wait queues in random places while running in the
* kernel might lead to all kinds of locking problems.
*
+ * Bursting
+ * ========
+ *
+ * CPU bursting occurs when the CPU usage is over the baseline but under the
+ * cap. The baseline CPU (zone.cpu-baseline) is set in a multi-tenant
+ * environment so that we know how much CPU is allocated for a tenant under
+ * normal utilization. We can then track how much time a zone is spending
+ * over the "normal" CPU utilization expected for that zone using the
+ * "above_base_sec" kstat. This kstat is cumulative.
+ *
+ * If the zone has a burst limit (zone.cpu-burst-time) then the zone can
+ * burst for that period of time (in seconds) before the effective cap is
+ * lowered to the baseline. Once the effective cap is lowered, the zone
+ * will run at the baseline for the burst limit before the effective cap is
+ * raised again to the full value. This will allow the zone to burst again.
+ * We can watch this behavior using the kstats. The "effective" kstat shows
+ * which cap is being used, the baseline value or the burst value. The
+ * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the
+ * "bursting_sec" kstat shows how many seconds the zone has currently been
+ * bursting. When the CPU load is continuously greater than the baseline,
+ * bursting_sec will increase, up to the burst_limit_sec value, then the
+ * effective kstat will drop to the baseline and the bursting_sec value will
+ * decrease until it hits 0, at which time the effective kstat will return to
+ * the full burst value and the bursting_sec value will begin to increase
+ * again.
+ *
* Accounting
* ==========
*
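
To make the bursting state machine described above concrete, the following is a minimal standalone C sketch of the per-tick accounting (illustrative only: cap_sketch_t and burst_tick() are invented here, with fields paralleling cpucap_t; this is not the kernel code):

#include <stdint.h>

typedef struct {
	uint64_t value;		/* configured cap (zone.cpu-cap) */
	uint64_t base;		/* baseline (zone.cpu-baseline); 0 = unset */
	uint64_t chk_value;	/* effective cap actually enforced */
	uint64_t burst_limit;	/* burst budget in ticks; 0 = unlimited */
	uint64_t bursting;	/* ticks of burst budget consumed */
	uint64_t usage;		/* decayed recent CPU usage */
} cap_sketch_t;

static void
burst_tick(cap_sketch_t *c)
{
	if (c->base == 0)
		return;			/* no baseline, nothing to track */

	if (c->usage > c->base && c->chk_value == c->value) {
		/* Over the baseline at the full cap: consuming burst budget. */
		if (c->burst_limit != 0 && ++c->bursting >= c->burst_limit)
			c->chk_value = c->base;	/* budget spent: clamp */
	} else if (c->bursting > 0) {
		/* Under the baseline (or already clamped): pay budget back. */
		if (--c->bursting == 0)
			c->chk_value = c->value; /* budget restored */
	}
}

Note that chk_value only ever holds one of two values, the full cap or the baseline, which is exactly what the "effective" kstat reports.
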
@@ -203,18 +230,28 @@ static void caps_update();
*/
struct cap_kstat {
kstat_named_t cap_value;
+ kstat_named_t cap_baseline;
+ kstat_named_t cap_effective;
+ kstat_named_t cap_burst_limit;
+ kstat_named_t cap_bursting;
kstat_named_t cap_usage;
kstat_named_t cap_nwait;
kstat_named_t cap_below;
kstat_named_t cap_above;
+ kstat_named_t cap_above_base;
kstat_named_t cap_maxusage;
kstat_named_t cap_zonename;
} cap_kstat = {
{ "value", KSTAT_DATA_UINT64 },
+ { "baseline", KSTAT_DATA_UINT64 },
+ { "effective", KSTAT_DATA_UINT64 },
+ { "burst_limit_sec", KSTAT_DATA_UINT64 },
+ { "bursting_sec", KSTAT_DATA_UINT64 },
{ "usage", KSTAT_DATA_UINT64 },
{ "nwait", KSTAT_DATA_UINT64 },
{ "below_sec", KSTAT_DATA_UINT64 },
{ "above_sec", KSTAT_DATA_UINT64 },
+ { "above_base_sec", KSTAT_DATA_UINT64 },
{ "maxusage", KSTAT_DATA_UINT64 },
{ "zonename", KSTAT_DATA_STRING },
};
@@ -311,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
cap->cap_below = cap->cap_above = 0;
cap->cap_maxusage = 0;
cap->cap_usage = 0;
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
waitq_unblock(&cap->cap_waitq);
if (CPUCAPS_OFF()) {
cpucaps_enabled = B_TRUE;
@@ -345,7 +382,7 @@ cap_disable(list_t *l, cpucap_t *cap)
cpucaps_enabled = B_FALSE;
cpucaps_clock_callout = NULL;
}
- cap->cap_value = 0;
+ cap->cap_value = cap->cap_chk_value = 0;
cap->cap_project = NULL;
cap->cap_zone = NULL;
if (cap->cap_kstat != NULL) {
@@ -487,6 +524,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
* The waitq_isempty check is performed without the waitq lock. If a new thread
* is placed on the waitq right after the check, it will be picked up during the
* next invocation of cap_poke_waitq().
+ *
+ * Called once per tick for zones.
*/
/* ARGSUSED */
static void
@@ -494,7 +533,45 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen)
{
ASSERT(MUTEX_HELD(&caps_lock));
- if (cap->cap_usage >= cap->cap_value) {
+ if (cap->cap_base != 0) {
+ /*
+ * Because of the way usage is calculated and decayed, it's
+ * possible for the zone to be slightly over its cap, but we
+ * don't want to count that after we have reduced the effective
+ * cap to the baseline. That way the zone will be able to
+ * burst again after the burst_limit has expired.
+ */
+ if (cap->cap_usage > cap->cap_base &&
+ cap->cap_chk_value == cap->cap_value) {
+ cap->cap_above_base++;
+
+ /*
+ * If bursting is limited and we've been bursting
+ * longer than we're supposed to, then set the
+ * effective cap to the baseline.
+ */
+ if (cap->cap_burst_limit != 0) {
+ cap->cap_bursting++;
+ if (cap->cap_bursting >= cap->cap_burst_limit)
+ cap->cap_chk_value = cap->cap_base;
+ }
+ } else if (cap->cap_bursting > 0) {
+ /*
+ * We're not bursting now, but we were; decay the
+ * bursting timer.
+ */
+ cap->cap_bursting--;
+ /*
+ * Reset the effective cap once we decay to 0 so we
+ * can burst again.
+ */
+ if (cap->cap_bursting == 0 &&
+ cap->cap_chk_value != cap->cap_value)
+ cap->cap_chk_value = cap->cap_value;
+ }
+ }
+
+ if (cap->cap_usage >= cap->cap_chk_value) {
cap->cap_above++;
} else {
waitq_t *wq = &cap->cap_waitq;
@@ -629,14 +706,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
* Remove all projects in this zone without caps
* from the capped_projects list.
*/
- if (project_cap->cap_value == MAX_USAGE) {
+ if (project_cap->cap_chk_value == MAX_USAGE) {
cap_project_disable(kpj);
}
} else if (CAP_DISABLED(project_cap)) {
/*
* Add the project to capped_projects list.
*/
- ASSERT(project_cap->cap_value == 0);
+ ASSERT(project_cap->cap_chk_value == 0);
cap_project_enable(kpj, MAX_USAGE);
}
mutex_exit(&caps_lock);
@@ -746,7 +823,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
/*
* No state transitions, just change the value
*/
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
}
ASSERT(MUTEX_HELD(&caps_lock));
@@ -757,6 +834,108 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
}
/*
+ * Set zone's base cpu value to base_val
+ */
+int
+cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val)
+{
+ cpucap_t *cap = NULL;
+ hrtime_t value;
+
+ ASSERT(base_val <= MAXCAP);
+ if (base_val > MAXCAP)
+ base_val = MAXCAP;
+
+ if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone))
+ return (0);
+
+ if (zone->zone_cpucap == NULL)
+ cap = cap_alloc();
+
+ mutex_enter(&caps_lock);
+
+ if (cpucaps_busy) {
+ mutex_exit(&caps_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+ * held. If it is still NULL, assign a newly allocated cpucap to it.
+ */
+ if (zone->zone_cpucap == NULL) {
+ zone->zone_cpucap = cap;
+ } else if (cap != NULL) {
+ cap_free(cap);
+ }
+
+ cap = zone->zone_cpucap;
+
+ value = base_val * cap_tick_cost;
+ if (value < 0 || value > cap->cap_value)
+ value = 0;
+
+ cap->cap_base = value;
+
+ mutex_exit(&caps_lock);
+
+ return (0);
+}
+
+/*
+ * Set zone's maximum burst time in seconds. A burst time of 0 means that
+ * the zone can run over its baseline indefinitely.
+ */
+int
+cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val)
+{
+ cpucap_t *cap = NULL;
+ hrtime_t value;
+
+ ASSERT(base_val <= INT_MAX);
+ /* Treat the default as 0 - no limit */
+ if (base_val == INT_MAX)
+ base_val = 0;
+ if (base_val > INT_MAX)
+ base_val = INT_MAX;
+
+ if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone))
+ return (0);
+
+ if (zone->zone_cpucap == NULL)
+ cap = cap_alloc();
+
+ mutex_enter(&caps_lock);
+
+ if (cpucaps_busy) {
+ mutex_exit(&caps_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+ * held. If it is still NULL, assign a newly allocated cpucap to it.
+ */
+ if (zone->zone_cpucap == NULL) {
+ zone->zone_cpucap = cap;
+ } else if (cap != NULL) {
+ cap_free(cap);
+ }
+
+ cap = zone->zone_cpucap;
+
+ value = SEC_TO_TICK(base_val);
+ if (value < 0)
+ value = 0;
+
+ cap->cap_burst_limit = value;
+
+ mutex_exit(&caps_lock);
+
+ return (0);
+}
+
+/*
* The project is going away so disable its cap.
*/
void
@@ -902,7 +1081,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
if (CAP_DISABLED(cap))
cap_project_enable(kpj, value);
else
- cap->cap_value = value;
+ cap->cap_value = cap->cap_chk_value = value;
} else if (CAP_ENABLED(cap)) {
/*
* User requested to drop a cap on the project. If it is part of
@@ -910,7 +1089,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
* otherwise disable the cap.
*/
if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
- cap->cap_value = MAX_USAGE;
+ cap->cap_value = cap->cap_chk_value = MAX_USAGE;
} else {
cap_project_disable(kpj);
}
@@ -948,6 +1127,26 @@ cpucaps_zone_get(zone_t *zone)
}
/*
+ * Get current zone baseline.
+ */
+rctl_qty_t
+cpucaps_zone_get_base(zone_t *zone)
+{
+ return (zone->zone_cpucap != NULL ?
+ (rctl_qty_t)(zone->zone_cpucap->cap_base / cap_tick_cost) : 0);
+}
+
+/*
+ * Get current zone maximum burst time.
+ */
+rctl_qty_t
+cpucaps_zone_get_burst_time(zone_t *zone)
+{
+ return (zone->zone_cpucap != NULL ?
+ (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0);
+}
+
+/*
* Charge project of thread t the time thread t spent on CPU since previously
* adjusted.
*
@@ -1045,7 +1244,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
project_cap = kpj->kpj_cpucap;
- if (project_cap->cap_usage >= project_cap->cap_value) {
+ if (project_cap->cap_usage >= project_cap->cap_chk_value) {
t->t_schedflag |= TS_PROJWAITQ;
rc = B_TRUE;
} else if (t->t_schedflag & TS_PROJWAITQ) {
@@ -1059,7 +1258,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
} else {
cpucap_t *zone_cap = zone->zone_cpucap;
- if (zone_cap->cap_usage >= zone_cap->cap_value) {
+ if (zone_cap->cap_usage >= zone_cap->cap_chk_value) {
t->t_schedflag |= TS_ZONEWAITQ;
rc = B_TRUE;
} else if (t->t_schedflag & TS_ZONEWAITQ) {
@@ -1133,6 +1332,12 @@ cap_kstat_update(kstat_t *ksp, int rw)
capsp->cap_value.value.ui64 =
ROUND_SCALE(cap->cap_value, cap_tick_cost);
+ capsp->cap_baseline.value.ui64 =
+ ROUND_SCALE(cap->cap_base, cap_tick_cost);
+ capsp->cap_effective.value.ui64 =
+ ROUND_SCALE(cap->cap_chk_value, cap_tick_cost);
+ capsp->cap_burst_limit.value.ui64 =
+ ROUND_SCALE(cap->cap_burst_limit, tick_sec);
capsp->cap_usage.value.ui64 =
ROUND_SCALE(cap->cap_usage, cap_tick_cost);
capsp->cap_maxusage.value.ui64 =
@@ -1140,6 +1345,10 @@ cap_kstat_update(kstat_t *ksp, int rw)
capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
+ capsp->cap_above_base.value.ui64 =
+ ROUND_SCALE(cap->cap_above_base, tick_sec);
+ capsp->cap_bursting.value.ui64 =
+ ROUND_SCALE(cap->cap_bursting, tick_sec);
kstat_named_setstr(&capsp->cap_zonename, zonename);
return (0);
diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c
index be92ba108b..9afcd81239 100644
--- a/usr/src/uts/common/disp/disp.c
+++ b/usr/src/uts/common/disp/disp.c
@@ -23,6 +23,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
@@ -105,7 +109,7 @@ static void cpu_resched(cpu_t *cp, pri_t tpri);
/*
* If this is set, only interrupt threads will cause kernel preemptions.
* This is done by changing the value of kpreemptpri. kpreemptpri
- * will either be the max sysclass pri + 1 or the min interrupt pri.
+ * will either be the max sysclass pri or the min interrupt pri.
*/
int only_intr_kpreempt;
@@ -252,7 +256,23 @@ dispinit(void)
maxglobpri = cl_maxglobpri;
}
}
- kpreemptpri = (pri_t)v.v_maxsyspri + 1;
+
+ /*
+ * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is
+ * to say, maxclsyspri + 1. However, over time, the system has used
+ * more and more asynchronous kernel threads, with an increasing number
+ * of these doing work on direct behalf of higher-level software (e.g.,
+ * network processing). This has led to potential priority inversions:
+ * threads doing low-priority lengthy kernel work can effectively
+ * delay kernel-level processing of higher-priority data. To minimize
+ * such inversions, we set kpreemptpri to be v_maxsyspri; anything in
+ * the kernel that runs at maxclsyspri will therefore induce kernel
+ * preemption, and this priority should be used if/when an asynchronous
+ * thread (or, as is often the case, task queue) is performing a task
+ * on behalf of higher-level software (or any task that is otherwise
+ * latency-sensitive).
+ */
+ kpreemptpri = (pri_t)v.v_maxsyspri;
if (kpqpri == KPQPRI)
kpqpri = kpreemptpri;
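
As a hedged illustration of the guidance in the comment above (the taskq name and sizing below are invented, not part of this change), an asynchronous kernel subsystem doing latency-sensitive work on behalf of higher-level software would create its task queue at maxclsyspri, so that its threads run at the new kpreemptpri and induce kernel preemption:

/* Sketch only: a maxclsyspri taskq per the dispinit() comment above. */
taskq_t *tq = taskq_create("example_latency_tq", 1, maxclsyspri,
    1, INT_MAX, TASKQ_PREPOPULATE);
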
diff --git a/usr/src/uts/common/disp/fss.c b/usr/src/uts/common/disp/fss.c
index 62301d65d8..1f9cdecb5c 100644
--- a/usr/src/uts/common/disp/fss.c
+++ b/usr/src/uts/common/disp/fss.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -54,6 +55,152 @@
#include <sys/cpucaps.h>
/*
+ * The fair share scheduling class ensures that collections of processes
+ * (zones and projects) each get their configured share of CPU. This is in
+ * contrast to the TS class which considers individual processes.
+ *
+ * The FSS cpu-share is set on zones using the zone.cpu-shares rctl and on
+ * projects using the project.cpu-shares rctl. By default the value is 1
+ * and it can range from 0 - 64k. A value of 0 means that processes in the
+ * collection will only get CPU resources when there are no other processes
+ * that need CPU. The cpu-share is used as one of the inputs to calculate a
+ * thread's "user-mode" priority (umdpri) for the scheduler. The umdpri falls
+ * in the range 0-59. FSS calculates other, internal, priorities which are not
+ * visible outside of the FSS class.
+ *
+ * The FSS class should approximate TS behavior when there are excess CPU
+ * resources. When there is a backlog of runnable processes, then the share
+ * is used as input into the runnable process's priority calculation, where
+ * the final umdpri is used by the scheduler to determine when the process runs.
+ *
+ * Projects in a zone compete with each other for CPU time, receiving CPU
+ * allocation within a zone proportional to the project's share; at a higher
+ * level zones compete with each other, receiving allocation in a pset
+ * proportional to the zone's share.
+ *
+ * The FSS priority calculation consists of several parts.
+ *
+ * 1) Once per second the fss_update function runs. The first thing it does
+ * is call fss_decay_usage. This function updates the priorities of all
+ * projects with runnable threads, based on their shares and their usage.
+ * The priority is based on the project's normalized usage (shusage) value
+ * which is calculated this way:
+ *
+ * pset_shares^2 zone_int_shares^2
+ * usage * ------------- * ------------------
+ * kpj_shares^2 zone_ext_shares^2
+ *
+ * - usage - see below for more details
+ * - pset_shares is the total of all *active* shares in the pset (by default
+ * there is only one pset)
+ * - kpj_shares is the individual project's share (project.cpu-shares rctl)
+ * - zone_int_shares is the sum of shares of all active projects within the
+ * zone
+ * - zone_ext_shares is the share value for the zone (zone.cpu-shares rctl)
+ *
+ * The usage value (thought of as the share-usage, or shusage) is the recent
+ * CPU usage for all of the threads in the project and is calculated this
+ * way:
+ *
+ * (usage * FSS_DECAY_USG)
+ * usage = ------------------------- + ticks;
+ * FSS_DECAY_BASE
+ *
+ * - FSS_DECAY_BASE is 128 - used instead of 100 so we can shift vs divide
+ * - FSS_DECAY_USG is 96 - approximates 75% (96/128)
+ * - ticks is incremented whenever a process in this project is running
+ * when the scheduler's tick processing fires and is reset in
+ * fss_decay_usage every second.
+ *
+ * fss_decay_usage then decays the maxfsspri value for the pset. This
+ * value is used in the per-process priority calculation described in the
+ * next section. The maxfsspri is decayed using the following formula:
+ *
+ * maxfsspri * fss_nice_decay[NZERO]
+ * maxfsspri = ------------------------------------
+ * FSS_DECAY_BASE
+ *
+ *
+ * - NZERO is the default process priority (i.e. 20)
+ *
+ * The fss_nice_decay array is a fixed set of values used to adjust the
+ * decay rate of processes based on their nice value. Entries in this
+ * array are initialized in fss_init using the following formula:
+ *
+ * (FSS_DECAY_MAX - FSS_DECAY_MIN) * i
+ * FSS_DECAY_MIN + -------------------------------------
+ * FSS_NICE_RANGE - 1
+ *
+ * - FSS_DECAY_MIN is 82 = approximates 65% (82/128)
+ * - FSS_DECAY_MAX is 108 = approximates 85% (108/128)
+ * - FSS_NICE_RANGE is 40 (range is 0 - 39)
+ *
+ * 2) The fss_update function uses the project's shusage (calculated above) as
+ * input to update the user-mode priority (umdpri) of the runnable threads.
+ * This can cause the threads to change their position in the run queue.
+ *
+ * First the process's priority is decayed using the following formula:
+ *
+ * fsspri * fss_nice_decay[nice_value]
+ * fsspri = ------------------------------------
+ * FSS_DECAY_BASE
+ *
+ * Then the process's new fsspri is calculated in the fss_newpri function,
+ * using the following formula. All runnable threads in the project will use
+ * the same shusage and nrunnable values in their calculation.
+ *
+ * fsspri = fsspri + shusage * nrunnable * ticks
+ *
+ * - shusage is the project's share usage, calculated above
+ * - nrunnable is the number of runnable threads in the project
+ * - ticks is the number of ticks this thread ran since the last fss_newpri
+ * invocation.
+ *
+ * Finally the process's new umdpri is calculated using the following
+ * formula:
+ *
+ * (fsspri * umdprirange)
+ * umdpri = maxumdpri - ------------------------
+ * maxfsspri
+ *
+ * - maxumdpri is MINCLSYSPRI - 1 (i.e. 59)
+ * - umdprirange is maxumdpri - 1 (i.e. 58)
+ * - maxfsspri is the largest fsspri seen so far, as we're iterating all
+ * runnable processes
+ *
+ * This code has various checks to ensure the resulting umdpri is in the
+ * range 1-59. See fss_newpri for more details.
+ *
+ * To reiterate, the above processing is performed once per second to recompute
+ * the runnable thread priorities.
+ *
+ * 3) The final major component in the priority calculation is the tick
+ * processing which occurs on a process that is running when the scheduler
+ * calls fss_tick.
+ *
+ * A thread can run continuously in user-land (compute-bound) for the
+ * fss_quantum (see "dispadmin -c FSS -g" for the configurable properties).
+ * Once the quantum has been consumed, the thread will call fss_newpri to
+ * recompute its umdpri priority, as described above. To ensure that
+ * runnable threads within a project see the expected round-robin behavior,
+ * there is a special case in fss_newpri for a thread that has run for its
+ * quanta within the one second update interval. See the handling for the
+ * quanta_up parameter within fss_newpri.
+ *
+ * Also of interest, the fss_tick code increments the project's tick counter
+ * using the fss_nice_tick array value for the thread's nice value. The idea
+ * behind the fss_nice_tick array is that the cost of a tick is lower at
+ * positive nice values (so that it doesn't increase the project's shusage
+ * as much as normal) with a 50% drop at the maximum level and a 50%
+ * increase at the minimum level. The fss_nice_tick array is initialized in
+ * fss_init using the following formula:
+ *
+ * FSS_TICK_COST * (((3 * FSS_NICE_RANGE) / 2) - i)
+ * --------------------------------------------------
+ * FSS_NICE_RANGE
+ *
+ * - FSS_TICK_COST is 1000, the tick cost for threads with nice level 0
+ *
* FSS Data Structures:
*
* fsszone
@@ -72,7 +219,6 @@
* ----- ----- -----
* fssproj
*
- *
* That is, fsspsets contain a list of fsszone's that are currently active in
* the pset, and a list of fssproj's, corresponding to projects with runnable
* threads on the pset. fssproj's in turn point to the fsszone which they
@@ -81,12 +227,6 @@
* An fssproj_t is removed when there are no threads in it.
*
* An fsszone_t is removed when there are no projects with threads in it.
- *
- * Projects in a zone compete with each other for cpu time, receiving cpu
- * allocation within a zone proportional to fssproj->fssp_shares
- * (project.cpu-shares); at a higher level zones compete with each other,
- * receiving allocation in a pset proportional to fsszone->fssz_shares
- * (zone.cpu-shares). See fss_decay_usage() for the precise formula.
*/
static pri_t fss_init(id_t, int, classfuncs_t **);
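
To make the formulas in the block comment above concrete, here is a small standalone (userland) C sketch that pushes invented share and usage numbers through the shusage and umdpri calculations; none of the values come from this change, and the per-second decay steps are omitted:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t usage = 500;		/* decayed project ticks */
	uint64_t pset_shares = 40;	/* all active shares in the pset */
	uint64_t kpj_shares = 10;	/* project.cpu-shares */
	uint64_t zone_int_shares = 20;	/* active project shares in zone */
	uint64_t zone_ext_shares = 30;	/* zone.cpu-shares */

	/* shusage = usage * (pset/kpj)^2 * (zone_int/zone_ext)^2 */
	uint64_t shusage = usage *
	    (pset_shares * pset_shares) / (kpj_shares * kpj_shares) *
	    (zone_int_shares * zone_int_shares) /
	    (zone_ext_shares * zone_ext_shares);

	/* fsspri += shusage * nrunnable * ticks */
	uint64_t fsspri = shusage * 3 * 5;	/* 3 runnable threads, 5 ticks */
	uint64_t maxfsspri = 200000;		/* pretend pset-wide maximum */

	/*
	 * umdpri = maxumdpri - fsspri * umdprirange / maxfsspri
	 * (the real code additionally clamps the result to [1, maxumdpri])
	 */
	int maxumdpri = 59, umdprirange = 58;
	int umdpri = maxumdpri - (int)(fsspri * umdprirange / maxfsspri);

	printf("shusage=%llu fsspri=%llu umdpri=%d\n",
	    (unsigned long long)shusage, (unsigned long long)fsspri, umdpri);
	return (0);
}

With these inputs shusage works out to 3555 and umdpri to 44; doubling kpj_shares would cut shusage roughly by a factor of four and raise the thread's umdpri, which is the proportional-share effect the comment describes.
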
@@ -186,7 +326,7 @@ static time_t fss_minrun = 2; /* t_pri becomes 59 within 2 secs */
static time_t fss_minslp = 2; /* min time on sleep queue for hardswap */
static int fss_quantum = 11;
-static void fss_newpri(fssproc_t *);
+static void fss_newpri(fssproc_t *, boolean_t);
static void fss_update(void *);
static int fss_update_list(int);
static void fss_change_priority(kthread_t *, fssproc_t *);
@@ -720,15 +860,53 @@ fss_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
/*
* Calculate the new cpupri based on the usage, the number of shares and
* the number of active threads. Reset the tick counter for this thread.
+ *
+ * When calculating the new priority using the standard formula we can hit
+ * a scenario where we don't have good round-robin behavior. This would be
+ * most commonly seen when there is a zone with lots of runnable threads.
+ * In the bad scenario we will see the following behavior when using the
+ * standard formula and these conditions:
+ *
+ * - there are multiple runnable threads in the zone (project)
+ * - the fssps_maxfsspri is a very large value
+ * - (we also know all of these threads will use the project's
+ * fssp_shusage)
+ *
+ * Under these conditions, a thread with a low fss_fsspri value is chosen
+ * to run and the thread gets a high fss_umdpri. This thread can run for
+ * its full quanta (fss_timeleft) at which time fss_newpri is called to
+ * calculate the thread's new priority.
+ *
+ * In this case, because the newly calculated fsspri value is much smaller
+ * (orders of magnitude) than the fssps_maxfsspri value, if we used the
+ * standard formula the thread will still get a high fss_umdpri value and
+ * will run again for another quanta, even though there are other runnable
+ * threads in the project.
+ *
+ * For a thread that is runnable for a long time, the thread can continue
+ * to run for many quanta (totaling many seconds) before the thread's fsspri
+ * exceeds the fssps_maxfsspri and the thread's fss_umdpri is reset back
+ * down to 1. This behavior also keeps the fssps_maxfsspri at a high value,
+ * so that the next runnable thread might repeat this cycle.
+ *
+ * This leads to the case where we don't have round-robin behavior at quanta
+ * granularity, but instead, runnable threads within the project only run
+ * at several second intervals.
+ *
+ * To prevent this scenario from occurring, when a thread has consumed its
+ * quanta and there are multiple runnable threads in the project, we
+ * immediately cause the thread to hit fssps_maxfsspri so that it gets
+ * reset back to 1 and another runnable thread in the project can run.
*/
static void
-fss_newpri(fssproc_t *fssproc)
+fss_newpri(fssproc_t *fssproc, boolean_t quanta_up)
{
kthread_t *tp;
fssproj_t *fssproj;
fsspset_t *fsspset;
fsszone_t *fsszone;
fsspri_t fsspri, maxfsspri;
+ uint32_t n_runnable;
pri_t invpri;
uint32_t ticks;
@@ -761,13 +939,21 @@ fss_newpri(fssproc_t *fssproc)
return;
}
- /*
- * fsspri += shusage * nrunnable * ticks
- */
ticks = fssproc->fss_ticks;
fssproc->fss_ticks = 0;
- fsspri = fssproc->fss_fsspri;
- fsspri += fssproj->fssp_shusage * fssproj->fssp_runnable * ticks;
+ maxfsspri = fsspset->fssps_maxfsspri;
+ n_runnable = fssproj->fssp_runnable;
+
+ if (quanta_up && n_runnable > 1) {
+ fsspri = maxfsspri;
+ } else {
+ /*
+ * fsspri += shusage * nrunnable * ticks
+ */
+ fsspri = fssproc->fss_fsspri;
+ fsspri += fssproj->fssp_shusage * n_runnable * ticks;
+ }
+
fssproc->fss_fsspri = fsspri;
if (fsspri < fss_maxumdpri)
@@ -788,7 +974,6 @@ fss_newpri(fssproc_t *fssproc)
* values; if it is changed, additional checks may need to be
* added.
*/
- maxfsspri = fsspset->fssps_maxfsspri;
if (fsspri >= maxfsspri) {
fsspset->fssps_maxfsspri = fsspri;
disp_lock_exit_high(&fsspset->fssps_displock);
@@ -814,6 +999,7 @@ fss_decay_usage()
fsszone_t *fsszone;
fsspri_t maxfsspri;
int psetid;
+ struct zone *zp;
mutex_enter(&fsspsets_lock);
/*
@@ -824,6 +1010,8 @@ fss_decay_usage()
fsspset = &fsspsets[psetid];
mutex_enter(&fsspset->fssps_lock);
+ fsspset->fssps_gen++;
+
if (fsspset->fssps_cpupart == NULL ||
(fssproj = fsspset->fssps_list) == NULL) {
mutex_exit(&fsspset->fssps_lock);
@@ -843,6 +1031,21 @@ fss_decay_usage()
fsspset->fssps_maxfsspri = maxfsspri;
do {
+ fsszone = fssproj->fssp_fsszone;
+ zp = fsszone->fssz_zone;
+
+ /*
+ * Reset zone's FSS kstats if they are from a
+ * previous cycle.
+ */
+ if (fsspset->fssps_gen != zp->zone_fss_gen) {
+ zp->zone_fss_gen = fsspset->fssps_gen;
+ zp->zone_fss_pri_hi = 0;
+ zp->zone_runq_cntr = 0;
+ zp->zone_fss_shr_pct = 0;
+ zp->zone_proc_cnt = 0;
+ }
+
/*
* Decay usage for each project running on
* this cpu partition.
@@ -850,9 +1053,18 @@ fss_decay_usage()
fssproj->fssp_usage =
(fssproj->fssp_usage * FSS_DECAY_USG) /
FSS_DECAY_BASE + fssproj->fssp_ticks;
+
fssproj->fssp_ticks = 0;
- fsszone = fssproj->fssp_fsszone;
+ zp->zone_run_ticks += fssproj->fssp_zone_ticks;
+ /*
+ * This is the count for this one second cycle only,
+ * and not cumulative.
+ */
+ zp->zone_runq_cntr += fssproj->fssp_runnable;
+
+ fssproj->fssp_zone_ticks = 0;
+
/*
* Readjust the project's number of shares if it has
* changed since we checked it last time.
@@ -871,7 +1083,7 @@ fss_decay_usage()
* Readjust the zone's number of shares if it
* has changed since we checked it last time.
*/
- zone_ext_shares = fsszone->fssz_zone->zone_shares;
+ zone_ext_shares = zp->zone_shares;
if (fsszone->fssz_rshares != zone_ext_shares) {
if (fsszone->fssz_runnable != 0) {
fsspset->fssps_shares -=
@@ -883,6 +1095,12 @@ fss_decay_usage()
}
zone_int_shares = fsszone->fssz_shares;
pset_shares = fsspset->fssps_shares;
+
+ if (zp->zone_runq_cntr > 0 && pset_shares > 0)
+ /* in tenths of a pct */
+ zp->zone_fss_shr_pct =
+ (zone_ext_shares * 1000) / pset_shares;
+
/*
* Calculate fssp_shusage value to be used
* for fsspri increments for the next second.
@@ -1050,6 +1268,8 @@ fss_update_list(int i)
fssproc_t *fssproc;
fssproj_t *fssproj;
fsspri_t fsspri;
+ struct zone *zp;
+ pri_t fss_umdpri;
kthread_t *t;
int updated = 0;
@@ -1073,6 +1293,7 @@ fss_update_list(int i)
fssproj = FSSPROC2FSSPROJ(fssproc);
if (fssproj == NULL)
goto next;
+
if (fssproj->fssp_shares != 0) {
/*
* Decay fsspri value.
@@ -1093,14 +1314,31 @@ fss_update_list(int i)
aston(t);
goto next;
}
- fss_newpri(fssproc);
+ fss_newpri(fssproc, B_FALSE);
updated = 1;
+ fss_umdpri = fssproc->fss_umdpri;
+
+ /*
+ * Summarize a zone's process priorities for runnable
+ * procs.
+ */
+ zp = fssproj->fssp_fsszone->fssz_zone;
+
+ if (fss_umdpri > zp->zone_fss_pri_hi)
+ zp->zone_fss_pri_hi = fss_umdpri;
+
+ if (zp->zone_proc_cnt++ == 0)
+ zp->zone_fss_pri_avg = fss_umdpri;
+ else
+ zp->zone_fss_pri_avg =
+ (zp->zone_fss_pri_avg + fss_umdpri) / 2;
+
/*
* Only dequeue the thread if it needs to be moved; otherwise
* it should just round-robin here.
*/
- if (t->t_pri != fssproc->fss_umdpri)
+ if (t->t_pri != fss_umdpri)
fss_change_priority(t, fssproc);
next:
thread_unlock(t);
@@ -1624,7 +1862,7 @@ fss_forkret(kthread_t *t, kthread_t *ct)
thread_lock(t);
fssproc = FSSPROC(t);
- fss_newpri(fssproc);
+ fss_newpri(fssproc, B_FALSE);
fssproc->fss_timeleft = fss_quantum;
t->t_pri = fssproc->fss_umdpri;
ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri);
@@ -1725,7 +1963,7 @@ fss_parmsset(kthread_t *t, void *parmsp, id_t reqpcid, cred_t *reqpcredp)
fssproc->fss_uprilim = reqfssuprilim;
fssproc->fss_upri = reqfssupri;
fssproc->fss_nice = nice;
- fss_newpri(fssproc);
+ fss_newpri(fssproc, B_FALSE);
if ((fssproc->fss_flags & FSSKPRI) != 0) {
thread_unlock(t);
@@ -2180,6 +2418,7 @@ fss_tick(kthread_t *t)
fsspset_t *fsspset = FSSPROJ2FSSPSET(fssproj);
disp_lock_enter_high(&fsspset->fssps_displock);
fssproj->fssp_ticks += fss_nice_tick[fssproc->fss_nice];
+ fssproj->fssp_zone_ticks++;
fssproc->fss_ticks++;
disp_lock_exit_high(&fsspset->fssps_displock);
}
@@ -2223,7 +2462,7 @@ fss_tick(kthread_t *t)
}
fssproc->fss_flags &= ~FSSRESTORE;
- fss_newpri(fssproc);
+ fss_newpri(fssproc, B_TRUE);
new_pri = fssproc->fss_umdpri;
ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri);
@@ -2262,7 +2501,7 @@ fss_tick(kthread_t *t)
* queue so that it gets charged for the CPU time from its
* quantum even before that quantum expires.
*/
- fss_newpri(fssproc);
+ fss_newpri(fssproc, B_FALSE);
if (t->t_pri != fssproc->fss_umdpri)
fss_change_priority(t, fssproc);
diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c
index 5ed9110251..63a08483f8 100644
--- a/usr/src/uts/common/disp/thread.c
+++ b/usr/src/uts/common/disp/thread.c
@@ -1049,6 +1049,8 @@ installctx(
ctx->free_op = free;
ctx->arg = arg;
ctx->next = t->t_ctx;
+ ctx->save_ts = 0;
+ ctx->restore_ts = 0;
t->t_ctx = ctx;
}
@@ -1120,9 +1122,12 @@ savectx(kthread_t *t)
struct ctxop *ctx;
ASSERT(t == curthread);
- for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
- if (ctx->save_op != NULL)
+ for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) {
+ if (ctx->save_op != NULL) {
+ ctx->save_ts = gethrtime_unscaled();
(ctx->save_op)(ctx->arg);
+ }
+ }
}
void
@@ -1131,9 +1136,12 @@ restorectx(kthread_t *t)
struct ctxop *ctx;
ASSERT(t == curthread);
- for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
- if (ctx->restore_op != NULL)
+ for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) {
+ if (ctx->restore_op != NULL) {
+ ctx->restore_ts = gethrtime_unscaled();
(ctx->restore_op)(ctx->arg);
+ }
+ }
}
void
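
The two new ctxop fields hold raw, unscaled timestamps. A hedged consumer sketch (the surrounding code is hypothetical; only save_ts and scalehrtime() are taken from the kernel):

/* Sketch: convert the recorded unscaled timestamp to nanoseconds. */
hrtime_t last_save = ctx->save_ts;	/* raw gethrtime_unscaled() value */
scalehrtime(&last_save);		/* last_save is now in nanoseconds */
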
diff --git a/usr/src/uts/common/dtrace/dtrace.c b/usr/src/uts/common/dtrace/dtrace.c
index 0c5e4b3a01..a5dd75f944 100644
--- a/usr/src/uts/common/dtrace/dtrace.c
+++ b/usr/src/uts/common/dtrace/dtrace.c
@@ -745,7 +745,7 @@ static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
dtrace_vstate_t *vstate)
{
- size_t sz;
+ size_t sz, strsize;
ASSERT(type->dtdt_flags & DIF_TF_BYREF);
/*
@@ -755,11 +755,24 @@ dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
return (1);
- if (type->dtdt_kind == DIF_TYPE_STRING)
- sz = dtrace_strlen(src,
- vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
- else
+ if (type->dtdt_kind == DIF_TYPE_STRING) {
+ dtrace_state_t *state = vstate->dtvs_state;
+
+ if (state != NULL) {
+ strsize = state->dts_options[DTRACEOPT_STRSIZE];
+ } else {
+ /*
+ * In helper context, we have a NULL state; fall back
+ * to using the system-wide default for the string size
+ * in this case.
+ */
+ strsize = dtrace_strsize_default;
+ }
+
+ sz = dtrace_strlen(src, strsize) + 1;
+ } else {
sz = type->dtdt_size;
+ }
return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
}
@@ -6679,7 +6692,7 @@ dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
priv = DTRACE_PRIV_ALL;
} else {
*uidp = crgetuid(cr);
- *zoneidp = crgetzoneid(cr);
+ *zoneidp = crgetzonedid(cr);
priv = 0;
if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
@@ -7175,7 +7188,7 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
provider->dtpv_priv.dtpp_flags = priv;
if (cr != NULL) {
provider->dtpv_priv.dtpp_uid = crgetuid(cr);
- provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
+ provider->dtpv_priv.dtpp_zoneid = crgetzonedid(cr);
}
provider->dtpv_pops = *pops;
@@ -7786,6 +7799,7 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
uint32_t priv;
uid_t uid;
zoneid_t zoneid;
+ dtrace_state_t *state = enab->dten_vstate->dtvs_state;
ASSERT(MUTEX_HELD(&dtrace_lock));
dtrace_ecb_create_cache = NULL;
@@ -7800,8 +7814,22 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
}
dtrace_probekey(desc, &pkey);
- dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
- &priv, &uid, &zoneid);
+ dtrace_cred2priv(state->dts_cred.dcr_cred, &priv, &uid, &zoneid);
+
+ if ((priv & DTRACE_PRIV_ZONEOWNER) &&
+ state->dts_options[DTRACEOPT_ZONE] != DTRACEOPT_UNSET) {
+ /*
+ * If we have the privilege of instrumenting all zones but we
+ * have been told to instrument but one, we will spoof this up by
+ * depriving ourselves of DTRACE_PRIV_ZONEOWNER for purposes
+ * of dtrace_match(). (Note that DTRACEOPT_ZONE is not for
+ * security but rather for performance: it allows the global
+ * zone to instrument USDT probes in a local zone without
+ * requiring all zones to be instrumented.)
+ */
+ priv &= ~DTRACE_PRIV_ZONEOWNER;
+ zoneid = state->dts_options[DTRACEOPT_ZONE];
+ }
return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
enab));
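
In consumer terms (assuming the libdtrace tunable backing DTRACEOPT_ZONE is the "zone" option, e.g. dtrace -x zone=<zonename>; the option name is an assumption here), this lets a global-zone consumer that could instrument every zone restrict USDT instrumentation to a single zone without paying the cost of instrumenting all of them.
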
diff --git a/usr/src/uts/common/fs/dev/sdev_vnops.c b/usr/src/uts/common/fs/dev/sdev_vnops.c
index fb1d93d06b..89c5decbf0 100644
--- a/usr/src/uts/common/fs/dev/sdev_vnops.c
+++ b/usr/src/uts/common/fs/dev/sdev_vnops.c
@@ -1142,9 +1142,21 @@ sdev_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp,
struct sdev_node *parent = VTOSDEV(dvp);
int error;
- /* execute access is required to search the directory */
- if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
- return (error);
+ /*
+ * We must check that we have execute access to search the directory --
+ * but because our sdev_contents lock is already held as a reader (the
+ * caller must have done a VOP_RWLOCK()), we call directly into the
+ * underlying access routine if sdev_attr is non-NULL.
+ */
+ if (parent->sdev_attr != NULL) {
+ VERIFY(RW_READ_HELD(&parent->sdev_contents));
+
+ if (sdev_unlocked_access(parent, VEXEC, cred) != 0)
+ return (EACCES);
+ } else {
+ if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
+ return (error);
+ }
ASSERT(parent);
if (!SDEV_IS_GLOBAL(parent))
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c
new file mode 100644
index 0000000000..f7f2944a2e
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c
@@ -0,0 +1,640 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/stat.h>
+#include <sys/policy.h>
+#include <sys/fs/hyprlofs_info.h>
+
+static int hldir_make_hlnode(hlnode_t *, hlfsmount_t *, vattr_t *, enum de_op,
+ vnode_t *, hlnode_t **, cred_t *);
+static int hldiraddentry(hlnode_t *, hlnode_t *, char *);
+
+
+#define HL_HASH_SIZE 8192 /* must be power of 2 */
+#define HL_MUTEX_SIZE 64
+
+static hldirent_t *hl_hashtable[HL_HASH_SIZE];
+static kmutex_t hl_hashmutex[HL_MUTEX_SIZE];
+
+#define HL_HASH_INDEX(a) ((a) & (HL_HASH_SIZE-1))
+#define HL_MUTEX_INDEX(a) ((a) & (HL_MUTEX_SIZE-1))
+
+#define HYPRLOFS_HASH(tp, name, hash) \
+ { \
+ char Xc, *Xcp; \
+ hash = (uint_t)(uintptr_t)(tp) >> 8; \
+ for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \
+ hash = (hash << 4) + hash + (uint_t)Xc; \
+ }
+
+void
+hyprlofs_hash_init(void)
+{
+ int ix;
+
+ for (ix = 0; ix < HL_MUTEX_SIZE; ix++)
+ mutex_init(&hl_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL);
+}
+
+static void
+hyprlofs_hash_in(hldirent_t *h)
+{
+ uint_t hash;
+ hldirent_t **prevpp;
+ kmutex_t *hmtx;
+
+ HYPRLOFS_HASH(h->hld_parent, h->hld_name, hash);
+ h->hld_hash = hash;
+ prevpp = &hl_hashtable[HL_HASH_INDEX(hash)];
+ hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)];
+ mutex_enter(hmtx);
+ h->hld_link = *prevpp;
+ *prevpp = h;
+ mutex_exit(hmtx);
+}
+
+/* Remove hldirent *h from the hash list. */
+static void
+hyprlofs_hash_out(hldirent_t *h)
+{
+ uint_t hash;
+ hldirent_t **prevpp;
+ kmutex_t *hmtx;
+
+ hash = h->hld_hash;
+ prevpp = &hl_hashtable[HL_HASH_INDEX(hash)];
+ hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)];
+ mutex_enter(hmtx);
+ while (*prevpp != h)
+ prevpp = &(*prevpp)->hld_link;
+ *prevpp = h->hld_link;
+ mutex_exit(hmtx);
+}
+
+static hldirent_t *
+hyprlofs_hash_lookup(char *name, hlnode_t *parent, uint_t hold,
+ hlnode_t **found)
+{
+ hldirent_t *l;
+ uint_t hash;
+ kmutex_t *hmtx;
+ hlnode_t *hnp;
+
+ HYPRLOFS_HASH(parent, name, hash);
+ hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)];
+ mutex_enter(hmtx);
+ l = hl_hashtable[HL_HASH_INDEX(hash)];
+ while (l) {
+ if (l->hld_hash == hash && l->hld_parent == parent &&
+ strcmp(l->hld_name, name) == 0) {
+ /*
+ * Ensure that the hlnode that we put a hold on is the
+ * same one that we pass back. Thus the temp. var
+ * hnp is necessary.
+ */
+ hnp = l->hld_hlnode;
+ if (hold) {
+ ASSERT(hnp);
+ hlnode_hold(hnp);
+ }
+ if (found)
+ *found = hnp;
+ mutex_exit(hmtx);
+ return (l);
+ } else {
+ l = l->hld_link;
+ }
+ }
+ mutex_exit(hmtx);
+ return (NULL);
+}
+
+/*
+ * Search directory 'parent' for entry 'name'.
+ *
+ * The calling thread can't hold the write version of the rwlock for the
+ * directory being searched.
+ *
+ * On success *foundtp points to the found hlnode with its vnode held.
+ */
+int
+hyprlofs_dirlookup(hlnode_t *parent, char *name, hlnode_t **foundtp, cred_t *cr)
+{
+ int error;
+
+ *foundtp = NULL;
+ if (parent->hln_type != VDIR)
+ return (ENOTDIR);
+
+ if ((error = hyprlofs_taccess(parent, VEXEC, cr)))
+ return (error);
+
+ if (*name == '\0') {
+ hlnode_hold(parent);
+ *foundtp = parent;
+ return (0);
+ }
+
+ /*
+ * Search the directory for the matching name. We need the lock
+ * protecting the hln_dir list so that it doesn't change out from
+ * underneath us. hyprlofs_hash_lookup() will pass back the hlnode
+ * with a hold on it.
+ */
+ if (hyprlofs_hash_lookup(name, parent, 1, foundtp) != NULL) {
+ ASSERT(*foundtp);
+ return (0);
+ }
+
+ return (ENOENT);
+}
+
+/*
+ * Enter a directory entry (either a file or subdir, depending on op) for
+ * 'name' and 'hp' into directory 'dir'
+ */
+int
+hyprlofs_direnter(
+ hlfsmount_t *hm,
+ hlnode_t *dir, /* target directory to make entry in */
+ char *name, /* name of entry */
+ enum de_op op, /* entry operation */
+ vnode_t *realvp, /* real vnode */
+ vattr_t *va,
+ hlnode_t **hpp, /* return hlnode */
+ cred_t *cr)
+{
+ hldirent_t *hdp;
+ hlnode_t *found = NULL;
+ hlnode_t *hp;
+ int error = 0;
+ char *s;
+
+ /* hln_rwlock is held to serialize direnter and dirdeletes */
+ ASSERT(RW_WRITE_HELD(&dir->hln_rwlock));
+ ASSERT(dir->hln_type == VDIR);
+
+ /* Don't allow '/' characters in pathname component */
+ for (s = name; *s; s++)
+ if (*s == '/')
+ return (EACCES);
+
+ if (name[0] == '\0')
+ panic("hyprlofs_direnter: NULL name");
+
+ /*
+ * This might be a "dangling detached directory". It could have been
+ * removed, but a reference to it kept in u_cwd. Don't bother searching
+ * it, and with any luck the user will get tired of dealing with us and
+ * cd to some absolute pathway. This is in ufs, too.
+ */
+ if (dir->hln_nlink == 0) {
+ return (ENOENT);
+ }
+
+ /* Search for the entry. Return "found" if it exists. */
+ hdp = hyprlofs_hash_lookup(name, dir, 1, &found);
+
+ if (hdp) {
+ ASSERT(found);
+ switch (op) {
+ case DE_CREATE:
+ case DE_MKDIR:
+ if (hpp) {
+ *hpp = found;
+ error = EEXIST;
+ } else {
+ hlnode_rele(found);
+ }
+ break;
+ }
+ } else {
+
+ /*
+ * The entry does not exist. Check write perms in dir to see if
+ * entry can be created.
+ */
+ if ((error = hyprlofs_taccess(dir, VWRITE, cr)))
+ return (error);
+
+ /* Make new hlnode and directory entry as required. */
+ if ((error = hldir_make_hlnode(dir, hm, va, op, realvp, &hp,
+ cr)))
+ return (error);
+
+ if ((error = hldiraddentry(dir, hp, name))) {
+ /* Unmake the inode we just made. */
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+ if ((hp->hln_type) == VDIR) {
+ ASSERT(hdp == NULL);
+ /* cleanup allocs made by hyprlofs_dirinit() */
+ hyprlofs_dirtrunc(hp);
+ }
+ mutex_enter(&hp->hln_tlock);
+ hp->hln_nlink = 0;
+ mutex_exit(&hp->hln_tlock);
+ gethrestime(&hp->hln_ctime);
+ rw_exit(&hp->hln_rwlock);
+ hlnode_rele(hp);
+ hp = NULL;
+ } else if (hpp) {
+ *hpp = hp;
+ } else {
+ hlnode_rele(hp);
+ }
+ }
+
+ return (error);
+}
+
+/*
+ * Delete entry hp of name "nm" from dir. Free dir entry space and decrement
+ * link count on hlnode(s).
+ */
+int
+hyprlofs_dirdelete(hlnode_t *dir, hlnode_t *hp, char *nm, enum dr_op op,
+ cred_t *cr)
+{
+ hldirent_t *hpdp;
+ int error;
+ size_t namelen;
+ hlnode_t *hnp;
+ timestruc_t now;
+
+ ASSERT(RW_WRITE_HELD(&dir->hln_rwlock));
+ ASSERT(RW_WRITE_HELD(&hp->hln_rwlock));
+ ASSERT(dir->hln_type == VDIR);
+
+ if (nm[0] == '\0')
+ panic("hyprlofs_dirdelete: NULL name for %p", (void *)hp);
+
+ /* return error if removing . or .. */
+ if (nm[0] == '.') {
+ if (nm[1] == '\0')
+ return (EINVAL);
+ if (nm[1] == '.' && nm[2] == '\0')
+ return (EEXIST); /* thus in ufs */
+ }
+
+ if (error = hyprlofs_taccess(dir, VEXEC|VWRITE, cr))
+ return (error);
+
+ if (dir->hln_dir == NULL)
+ return (ENOENT);
+
+ hpdp = hyprlofs_hash_lookup(nm, dir, 0, &hnp);
+ if (hpdp == NULL) {
+ /*
+ * If it is gone, some other thread got here first!
+ * Return error ENOENT.
+ */
+ return (ENOENT);
+ }
+
+ /*
+ * If the hlnode in the hldirent changed (shouldn't happen since we
+ * don't support rename) then the original is gone, so return that status
+ * (same as UFS).
+ */
+ if (hp != hnp)
+ return (ENOENT);
+
+ hyprlofs_hash_out(hpdp);
+
+ /* Take hpdp out of the directory list. */
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+ if (hpdp->hld_prev) {
+ hpdp->hld_prev->hld_next = hpdp->hld_next;
+ }
+ if (hpdp->hld_next) {
+ hpdp->hld_next->hld_prev = hpdp->hld_prev;
+ }
+
+ /*
+ * If the roving slot pointer happens to match hpdp, point it at the
+ * previous dirent.
+ */
+ if (dir->hln_dir->hld_prev == hpdp) {
+ dir->hln_dir->hld_prev = hpdp->hld_prev;
+ }
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+
+ /* hpdp points to the correct directory entry */
+ namelen = strlen(hpdp->hld_name) + 1;
+
+ hyprlofs_memfree(hpdp, sizeof (hldirent_t) + namelen);
+ dir->hln_size -= (sizeof (hldirent_t) + namelen);
+ dir->hln_dirents--;
+
+ gethrestime(&now);
+ dir->hln_mtime = now;
+ dir->hln_ctime = now;
+ hp->hln_ctime = now;
+
+ ASSERT(hp->hln_nlink > 0);
+ DECR_COUNT(&hp->hln_nlink, &hp->hln_tlock);
+ if (op == DR_RMDIR && hp->hln_type == VDIR) {
+ hyprlofs_dirtrunc(hp);
+ ASSERT(hp->hln_nlink == 0);
+ }
+ return (0);
+}
+
+/*
+ * hyprlofs_dirinit initializes a dir with '.' and '..' entries without
+ * checking perms and locking
+ */
+void
+hyprlofs_dirinit(
+ hlnode_t *parent, /* parent of directory to initialize */
+ hlnode_t *dir) /* the new directory */
+{
+ hldirent_t *dot, *dotdot;
+ timestruc_t now;
+
+ ASSERT(RW_WRITE_HELD(&parent->hln_rwlock));
+ ASSERT(dir->hln_type == VDIR);
+
+ dot = hyprlofs_memalloc(sizeof (hldirent_t) + 2, HL_MUSTHAVE);
+ dotdot = hyprlofs_memalloc(sizeof (hldirent_t) + 3, HL_MUSTHAVE);
+
+ /* Initialize the entries */
+ dot->hld_hlnode = dir;
+ dot->hld_offset = 0;
+ dot->hld_name = (char *)dot + sizeof (hldirent_t);
+ dot->hld_name[0] = '.';
+ dot->hld_parent = dir;
+ hyprlofs_hash_in(dot);
+
+ dotdot->hld_hlnode = parent;
+ dotdot->hld_offset = 1;
+ dotdot->hld_name = (char *)dotdot + sizeof (hldirent_t);
+ dotdot->hld_name[0] = '.';
+ dotdot->hld_name[1] = '.';
+ dotdot->hld_parent = dir;
+ hyprlofs_hash_in(dotdot);
+
+ /* Initialize directory entry list. */
+ dot->hld_next = dotdot;
+ dot->hld_prev = dotdot;
+ dotdot->hld_next = NULL;
+ dotdot->hld_prev = dot;
+
+ gethrestime(&now);
+ dir->hln_mtime = now;
+ dir->hln_ctime = now;
+
+ /*
+ * Since hyprlofs_dirinit is called with both dir and parent being the
+ * same for the root vnode, we need to increment this before we set
+ * hln_nlink = 2 below.
+ */
+ INCR_COUNT(&parent->hln_nlink, &parent->hln_tlock);
+ parent->hln_ctime = now;
+
+ dir->hln_dir = dot;
+ dir->hln_size = 2 * sizeof (hldirent_t) + 5; /* dot and dotdot */
+ dir->hln_dirents = 2;
+ dir->hln_nlink = 2;
+}
+
+
+/*
+ * hyprlofs_dirtrunc removes all dir entries under this dir.
+ */
+void
+hyprlofs_dirtrunc(hlnode_t *dir)
+{
+ hldirent_t *hdp;
+ hlnode_t *tp;
+ size_t namelen;
+ timestruc_t now;
+
+ ASSERT(RW_WRITE_HELD(&dir->hln_rwlock));
+ ASSERT(dir->hln_type == VDIR);
+
+ if (dir->hln_looped)
+ return;
+
+ for (hdp = dir->hln_dir; hdp; hdp = dir->hln_dir) {
+ ASSERT(hdp->hld_next != hdp);
+ ASSERT(hdp->hld_prev != hdp);
+ ASSERT(hdp->hld_hlnode);
+
+ dir->hln_dir = hdp->hld_next;
+ namelen = strlen(hdp->hld_name) + 1;
+
+ /*
+ * Adjust the link counts to account for this dir entry removal.
+ */
+ tp = hdp->hld_hlnode;
+
+ ASSERT(tp->hln_nlink > 0);
+ DECR_COUNT(&tp->hln_nlink, &tp->hln_tlock);
+
+ hyprlofs_hash_out(hdp);
+
+ hyprlofs_memfree(hdp, sizeof (hldirent_t) + namelen);
+ dir->hln_size -= (sizeof (hldirent_t) + namelen);
+ dir->hln_dirents--;
+ }
+
+ gethrestime(&now);
+ dir->hln_mtime = now;
+ dir->hln_ctime = now;
+
+ ASSERT(dir->hln_dir == NULL);
+ ASSERT(dir->hln_size == 0);
+ ASSERT(dir->hln_dirents == 0);
+}
+
+static int
+hldiraddentry(
+ hlnode_t *dir, /* target directory to make entry in */
+ hlnode_t *hp, /* new hlnode */
+ char *name)
+{
+ hldirent_t *hdp, *hpdp;
+ size_t namelen, alloc_size;
+ timestruc_t now;
+
+ /*
+ * Make sure the parent dir wasn't removed from underneath the caller.
+ */
+ if (dir->hln_dir == NULL)
+ return (ENOENT);
+
+ /* Check that everything is on the same FS. */
+ if (hp->hln_vnode->v_vfsp != dir->hln_vnode->v_vfsp)
+ return (EXDEV);
+
+ /* Alloc and init dir entry */
+ namelen = strlen(name) + 1;
+ alloc_size = namelen + sizeof (hldirent_t);
+ hdp = hyprlofs_memalloc(alloc_size, 0);
+ if (hdp == NULL)
+ return (ENOSPC);
+
+ dir->hln_size += alloc_size;
+ dir->hln_dirents++;
+ hdp->hld_hlnode = hp;
+ hdp->hld_parent = dir;
+
+ /* The dir entry and its name were allocated sequentially. */
+ hdp->hld_name = (char *)hdp + sizeof (hldirent_t);
+ (void) strcpy(hdp->hld_name, name);
+
+ hyprlofs_hash_in(hdp);
+
+	/*
+	 * Some utilities expect the size of a directory to remain fairly
+	 * static. Consider, for example, a routine which unlinks files
+	 * between calls to readdir(): the size of the dir changes underneath
+	 * it, so the real dir offset in bytes becomes invalid. To circumvent
+	 * this problem, we initialize each dir entry with a phony offset, and
+	 * use that offset to determine end of file in hyprlofs_readdir.
+	 */
+ hpdp = dir->hln_dir->hld_prev;
+ /*
+ * Install at first empty "slot" in directory list.
+ */
+ while (hpdp->hld_next != NULL && (hpdp->hld_next->hld_offset -
+ hpdp->hld_offset) <= 1) {
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+ ASSERT(hpdp->hld_next->hld_offset > hpdp->hld_offset);
+ hpdp = hpdp->hld_next;
+ }
+ hdp->hld_offset = hpdp->hld_offset + 1;
+
+ /*
+ * If we're at the end of the dirent list and the offset (which is
+ * necessarily the largest offset in this dir) is more than twice the
+ * number of dirents, that means the dir is 50% holes. At this point
+ * we reset the slot pointer back to the beginning of the dir so we
+ * start using the holes. The idea is that if there are N dirents,
+ * there must also be N holes, so we can satisfy the next N creates by
+ * walking at most 2N entries; thus the average cost of a create is
+ * constant. Note that we use the first dirent's hld_prev as the roving
+ * slot pointer. This saves a word in every dirent.
+ */
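+	/*
+	 * Illustrative sketch: suppose a dir has entries at offsets 0 (.),
+	 * 1 (..), 2 and 7, with the roving slot pointer at the front. The
+	 * walk above stops at the entry with offset 2 (since 7 - 2 > 1) and
+	 * the new entry gets offset 3. If instead the walk runs off the end
+	 * at, say, offset 9 with only 4 dirents (9 > 2 * 4), the slot
+	 * pointer is reset to the front so future creates reuse the holes.
+	 */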
+ if (hpdp->hld_next == NULL && hpdp->hld_offset > 2 * dir->hln_dirents)
+ dir->hln_dir->hld_prev = dir->hln_dir->hld_next;
+ else
+ dir->hln_dir->hld_prev = hdp;
+
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+
+ hdp->hld_next = hpdp->hld_next;
+ if (hdp->hld_next) {
+ hdp->hld_next->hld_prev = hdp;
+ }
+ hdp->hld_prev = hpdp;
+ hpdp->hld_next = hdp;
+
+ ASSERT(hdp->hld_next != hdp);
+ ASSERT(hdp->hld_prev != hdp);
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+
+ gethrestime(&now);
+ dir->hln_mtime = now;
+ dir->hln_ctime = now;
+
+ return (0);
+}
+
+static int
+hldir_make_hlnode(hlnode_t *dir, hlfsmount_t *hm, vattr_t *va, enum de_op op,
+ vnode_t *realvp, hlnode_t **newnode, cred_t *cr)
+{
+ hlnode_t *hp;
+ enum vtype type;
+
+ ASSERT(va != NULL);
+ ASSERT(op == DE_CREATE || op == DE_MKDIR);
+ if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
+ ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
+ return (EOVERFLOW);
+ type = va->va_type;
+ hp = hyprlofs_memalloc(sizeof (hlnode_t), HL_MUSTHAVE);
+ hyprlofs_node_init(hm, hp, va, cr);
+
+ hp->hln_vnode->v_rdev = hp->hln_rdev = NODEV;
+ hp->hln_vnode->v_type = type;
+ hp->hln_uid = crgetuid(cr);
+
+ /*
+ * To determine the gid of the created file:
+ * If the directory's set-gid bit is set, set the gid to the gid
+ * of the parent dir, otherwise, use the process's gid.
+ */
+ if (dir->hln_mode & VSGID)
+ hp->hln_gid = dir->hln_gid;
+ else
+ hp->hln_gid = crgetgid(cr);
+
+ /*
+ * If we're creating a dir and the parent dir has the set-GID bit set,
+ * set it on the new dir. Otherwise, if the user is neither privileged
+ * nor a member of the file's new group, clear the file's set-GID bit.
+ */
+ if (dir->hln_mode & VSGID && type == VDIR)
+ hp->hln_mode |= VSGID;
+ else {
+ if ((hp->hln_mode & VSGID) &&
+ secpolicy_vnode_setids_setgids(cr, hp->hln_gid) != 0)
+ hp->hln_mode &= ~VSGID;
+ }
+
+ if (va->va_mask & AT_ATIME)
+ hp->hln_atime = va->va_atime;
+ if (va->va_mask & AT_MTIME)
+ hp->hln_mtime = va->va_mtime;
+
+ if (op == DE_MKDIR) {
+ hyprlofs_dirinit(dir, hp);
+ hp->hln_looped = 0;
+ } else {
+ hp->hln_realvp = realvp;
+ hp->hln_size = va->va_size;
+ hp->hln_looped = 1;
+ }
+
+ *newnode = hp;
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c
new file mode 100644
index 0000000000..bf71b2bfcb
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c
@@ -0,0 +1,154 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/param.h>
+#include <sys/t_lock.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+#include <sys/time.h>
+#include <sys/cmn_err.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/vfs.h>
+#include <sys/cred.h>
+#include <sys/kmem.h>
+#include <sys/atomic.h>
+#include <sys/policy.h>
+#include <sys/fs/hyprlofs_info.h>
+
+#define MODESHIFT 3
+
+/* Initialize a hlnode and add it to file list under mount point. */
+void
+hyprlofs_node_init(hlfsmount_t *hm, hlnode_t *h, vattr_t *vap, cred_t *cr)
+{
+ vnode_t *vp;
+ timestruc_t now;
+
+ ASSERT(vap != NULL);
+
+ rw_init(&h->hln_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&h->hln_tlock, NULL, MUTEX_DEFAULT, NULL);
+ h->hln_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ h->hln_mask = 0;
+ h->hln_type = vap->va_type;
+ h->hln_nodeid = (ino64_t)(uint32_t)((uintptr_t)h >> 3);
+ h->hln_nlink = 1;
+ h->hln_size = 0;
+
+ if (cr == NULL) {
+ h->hln_uid = vap->va_uid;
+ h->hln_gid = vap->va_gid;
+ } else {
+ h->hln_uid = crgetuid(cr);
+ h->hln_gid = crgetgid(cr);
+ }
+
+ h->hln_fsid = hm->hlm_dev;
+ h->hln_rdev = vap->va_rdev;
+ h->hln_blksize = PAGESIZE;
+ h->hln_nblocks = 0;
+ gethrestime(&now);
+ h->hln_atime = now;
+ h->hln_mtime = now;
+ h->hln_ctime = now;
+ h->hln_seq = 0;
+ h->hln_dir = NULL;
+
+ h->hln_vnode = vn_alloc(KM_SLEEP);
+ vp = HLNTOV(h);
+ vn_setops(vp, hyprlofs_vnodeops);
+ vp->v_vfsp = hm->hlm_vfsp;
+ vp->v_type = vap->va_type;
+ vp->v_rdev = vap->va_rdev;
+ vp->v_data = (caddr_t)h;
+ mutex_enter(&hm->hlm_contents);
+ /*
+ * Increment the pseudo generation number for this hlnode. Since
+ * hlnodes are allocated and freed, there really is no particular
+ * generation number for a new hlnode. Just fake it by using a
+ * counter in each file system.
+ */
+ h->hln_gen = hm->hlm_gen++;
+
+ /*
+	 * Add the new hlnode to the end of the linked list of hlnodes for
+	 * this hyprlofs. The root dir is handled specially in hyprlofs_mount.
+ */
+ if (hm->hlm_rootnode != (hlnode_t *)NULL) {
+ h->hln_forw = NULL;
+ h->hln_back = hm->hlm_rootnode->hln_back;
+ h->hln_back->hln_forw = hm->hlm_rootnode->hln_back = h;
+ }
+ mutex_exit(&hm->hlm_contents);
+ vn_exists(vp);
+}
+
+int
+hyprlofs_taccess(void *vtp, int mode, cred_t *cr)
+{
+ hlnode_t *hp = vtp;
+ int shift = 0;
+
+ /* Check access based on owner, group and public perms in hlnode. */
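+	/*
+	 * Illustrative example: with hln_mode 0750, the owner is checked
+	 * with no shift (rwx); a non-owner in hln_gid shifts by 3, moving
+	 * the group bits (5, r-x) into the owner position; anyone else
+	 * shifts by 6, moving the "other" bits (0) there instead.
+	 */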
+ if (crgetuid(cr) != hp->hln_uid) {
+ shift += MODESHIFT;
+ if (groupmember(hp->hln_gid, cr) == 0)
+ shift += MODESHIFT;
+ }
+
+ return (secpolicy_vnode_access2(cr, HLNTOV(hp), hp->hln_uid,
+ hp->hln_mode << shift, mode));
+}
+
+/*
+ * Allocate zeroed memory if hyprlofs_maxkmem has not been exceeded or the
+ * 'musthave' flag is set. 'musthave' allocations should always be subordinate
+ * to normal allocations so that hyprlofs_maxkmem can't be exceeded by more
+ * than a few KB. E.g. when creating a new dir, the hlnode is a normal
+ * allocation; if that succeeds, the dirents for "." and ".." are 'musthave'
+ * allocations.
+ */
+void *
+hyprlofs_memalloc(size_t size, int musthave)
+{
+ if (atomic_add_long_nv(&hyprlofs_kmemspace, size) < hyprlofs_maxkmem ||
+ musthave)
+ return (kmem_zalloc(size, KM_SLEEP));
+
+ atomic_add_long(&hyprlofs_kmemspace, -size);
+ cmn_err(CE_WARN, "hyprlofs over memory limit");
+ return (NULL);
+}
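+
+/*
+ * Note (illustrative): two racing allocations near the limit may both bump
+ * hyprlofs_kmemspace before either compares against hyprlofs_maxkmem, but
+ * each failing caller subtracts its own size back out, so the counter only
+ * transiently overshoots by the size of the in-flight requests (plus any
+ * HL_MUSTHAVE allocations, as described above).
+ */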
+
+void
+hyprlofs_memfree(void *cp, size_t size)
+{
+ kmem_free(cp, size);
+ atomic_add_long(&hyprlofs_kmemspace, -size);
+}
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c
new file mode 100644
index 0000000000..afe76d0629
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c
@@ -0,0 +1,625 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Hyprlofs is a hybrid file system combining features of the tmpfs(7FS) and
+ * lofs(7FS) file systems. It is modeled on code from both of these file
+ * systems.
+ *
+ * The purpose is to create a high performance name space for files on which
+ * applications will compute. Given a large number of data files with various
+ * owners, we want to construct a view onto those files such that only a subset
+ * is visible to the applications and such that the view can be changed very
+ * quickly as compute progresses. Entries in the name space are not mounts and
+ * thus do not appear in the mnttab. Entries in the name space are allowed to
+ * refer to files on different backing file systems. Intermediate directories
+ * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes
+ * in the name space except for entries that refer to backing files ala lofs.
+ *
+ * The name space is managed via ioctls issued on the mounted file system and
+ * is mostly read-only for the compute applications. That is, applications
+ * cannot create new files in the name space. If a file is unlinked by an
+ * application, that only removes the file from the name space, the backing
+ * file remains in place. It is possible for applications to write-through to
+ * the backing files if the file system is mounted read-write.
+ *
+ * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES,
+ * and HYPRLOFS_RM_ALL ioctls on the top-level mount.
+ *
+ * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and
+ * the name(s) for the file(s) in the name space. The name(s) may be path(s)
+ * which will be relative to the root of the mount and thus cannot begin with
+ * a /. If the name is a path, it does not have to correspond to any backing
+ * path. The intermediate directories will only exist in the name space. The
+ * entry(ies) will be added to the name space.
+ *
+ * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the
+ * name space which should be removed. The name(s) may be path(s) which will
+ * be relative to the root of the mount and thus cannot begin with a /. The
+ * named entry(ies) will be removed.
+ *
+ * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space.
+ */
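+
+/*
+ * As an untested, illustrative sketch (the mount point and backing path
+ * below are made up), a userland consumer might add a single entry, naming
+ * it with a relative path:
+ *
+ *	hyprlofs_entry_t e;
+ *	hyprlofs_entries_t eb;
+ *	int fd = open("/my/hyprlofs/mount", O_RDONLY);
+ *
+ *	e.hle_path = "/backing/fs/file1";
+ *	e.hle_plen = strlen(e.hle_path);
+ *	e.hle_name = "subdir/file1";
+ *	e.hle_nlen = strlen(e.hle_name);
+ *	eb.hle_entries = &e;
+ *	eb.hle_len = 1;
+ *	if (ioctl(fd, HYPRLOFS_ADD_ENTRIES, &eb) != 0)
+ *		err(1, "HYPRLOFS_ADD_ENTRIES");
+ */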
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/time.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/stat.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/statvfs.h>
+#include <sys/mount.h>
+#include <sys/debug.h>
+#include <sys/systm.h>
+#include <sys/mntent.h>
+#include <fs/fs_subr.h>
+#include <vm/page.h>
+#include <vm/anon.h>
+#include <sys/model.h>
+#include <sys/policy.h>
+
+#include <sys/fs/swapnode.h>
+#include <sys/fs/hyprlofs_info.h>
+
+static int hyprlofsfstype;
+
+/*
+ * hyprlofs vfs operations.
+ */
+static int hyprlofsinit(int, char *);
+static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
+static int hyprlofs_unmount(vfs_t *, int, cred_t *);
+static int hyprlofs_root(vfs_t *, vnode_t **);
+static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *);
+static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *);
+
+/*
+ * Loadable module wrapper
+ */
+#include <sys/modctl.h>
+
+static mntopts_t hyprlofs_mntopts;
+
+static vfsdef_t vfw = {
+ VFSDEF_VERSION,
+ "hyprlofs",
+ hyprlofsinit,
+ VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT,
+ &hyprlofs_mntopts
+};
+
+static mntopts_t hyprlofs_mntopts = {
+ 0, NULL
+};
+
+/*
+ * Module linkage information
+ */
+static struct modlfs modlfs = {
+ &mod_fsops, "filesystem for hyprlofs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, &modlfs, NULL
+};
+
+int
+_init()
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_fini()
+{
+ int error;
+
+ error = mod_remove(&modlinkage);
+ if (error)
+ return (error);
+ /*
+ * Tear down the operations vectors
+ */
+ (void) vfs_freevfsops_by_type(hyprlofsfstype);
+ vn_freevnodeops(hyprlofs_vnodeops);
+ return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * The following are patchable variables limiting the amount of system
+ * resources hyprlofs can use.
+ *
+ * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can
+ * use for its data structures (e.g. hlnodes, directory entries). It is set
+ * to a fraction of the available kernel memory, determined when hyprlofs is
+ * first used in the system.
+ *
+ * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for
+ * the rest of the system. If the amount of free swap space in the system
+ * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon
+ * allocations will fail.
+ */
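+/*
+ * For example (the values below are arbitrary), either limit can be tuned
+ * in /etc/system before the module loads:
+ *
+ *	set hyprlofs:hyprlofs_maxkmem = 0x2000000
+ *	set hyprlofs:hyprlofs_minfree = 0x1000
+ */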
+size_t hyprlofs_maxkmem = 0;
+size_t hyprlofs_minfree = 0;
+size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */
+
+static major_t hyprlofs_major;
+static minor_t hyprlofs_minor;
+static kmutex_t hyprlofs_minor_lock;
+
+/*
+ * initialize global hyprlofs locks and hashes when loading hyprlofs module
+ */
+static int
+hyprlofsinit(int fstype, char *name)
+{
+ static const fs_operation_def_t hl_vfsops_template[] = {
+ VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount },
+ VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount },
+ VFSNAME_ROOT, { .vfs_root = hyprlofs_root },
+ VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs },
+ VFSNAME_VGET, { .vfs_vget = hyprlofs_vget },
+ NULL, NULL
+ };
+ int error;
+ extern void hyprlofs_hash_init();
+
+ hyprlofs_hash_init();
+ hyprlofsfstype = fstype;
+ ASSERT(hyprlofsfstype != 0);
+
+ error = vfs_setfsops(fstype, hl_vfsops_template, NULL);
+ if (error != 0) {
+ cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template");
+ return (error);
+ }
+
+ error = vn_make_ops(name, hyprlofs_vnodeops_template,
+ &hyprlofs_vnodeops);
+ if (error != 0) {
+ (void) vfs_freevfsops_by_type(fstype);
+ cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template");
+ return (error);
+ }
+
+ /*
+	 * hyprlofs_minfree is the absolute minimum amount of swap space that
+	 * must remain free for other processes to execute. Set it only if it
+	 * has not been patched.
+ */
+ if (hyprlofs_minfree == 0)
+ hyprlofs_minfree = btopr(HYPRLOFSMINFREE);
+
+ /*
+ * The maximum amount of space hyprlofs can allocate is
+	 * 1/HYPRLOFSMAXFRACKMEM of the available kernel memory.
+ */
+ if (hyprlofs_maxkmem == 0)
+ hyprlofs_maxkmem =
+ MAX(PAGESIZE, kmem_maxavail() / HYPRLOFSMAXFRACKMEM);
+
+ if ((hyprlofs_major = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN,
+ "hyprlofsinit: Can't get unique device number.");
+ hyprlofs_major = 0;
+ }
+ mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+static int
+hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+ hlfsmount_t *hm = NULL;
+ hlnode_t *hp;
+ struct pathname dpn;
+ int error;
+ vattr_t rattr;
+ int got_attrs;
+
+ if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+ return (error);
+ if (secpolicy_hyprlofs_control(cr) != 0)
+ return (EPERM);
+
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if (uap->flags & MS_REMOUNT)
+ return (EBUSY);
+
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ return (EBUSY);
+ }
+ mutex_exit(&mvp->v_lock);
+
+ /* Having the resource be anything but "swap" doesn't make sense. */
+ vfs_setresource(vfsp, "swap", 0);
+
+ if (error = pn_get(uap->dir,
+ (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn))
+ goto out;
+
+ if ((hm = hyprlofs_memalloc(sizeof (hlfsmount_t), 0)) == NULL) {
+ pn_free(&dpn);
+ error = ENOMEM;
+ goto out;
+ }
+
+ /* Get an available minor device number for this mount */
+ mutex_enter(&hyprlofs_minor_lock);
+ do {
+ hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32;
+ hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor);
+ } while (vfs_devismounted(hm->hlm_dev));
+ mutex_exit(&hyprlofs_minor_lock);
+
+ /*
+ * Set but don't bother entering the mutex since hlfsmount is not on
+ * the mount list yet.
+ */
+ mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL);
+
+ hm->hlm_vfsp = vfsp;
+
+ vfsp->vfs_data = (caddr_t)hm;
+ vfsp->vfs_fstype = hyprlofsfstype;
+ vfsp->vfs_dev = hm->hlm_dev;
+ vfsp->vfs_bsize = PAGESIZE;
+ vfsp->vfs_flag |= VFS_NOTRUNC;
+ vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype);
+ hm->hlm_mntpath = hyprlofs_memalloc(dpn.pn_pathlen + 1, HL_MUSTHAVE);
+ (void) strcpy(hm->hlm_mntpath, dpn.pn_path);
+
+ /* allocate and initialize root hlnode structure */
+ bzero(&rattr, sizeof (vattr_t));
+ rattr.va_mode = (mode_t)(S_IFDIR | 0777);
+ rattr.va_type = VDIR;
+ rattr.va_rdev = 0;
+ hp = hyprlofs_memalloc(sizeof (hlnode_t), HL_MUSTHAVE);
+ hyprlofs_node_init(hm, hp, &rattr, cr);
+
+ /* Get the mode, uid, and gid from the underlying mount point. */
+ rattr.va_mask = AT_MODE|AT_UID|AT_GID;
+ got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL);
+
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+ HLNTOV(hp)->v_flag |= VROOT;
+
+ /*
+ * If the getattr succeeded, use its results, otherwise allow the
+ * previously set defaults to prevail.
+ */
+ if (got_attrs == 0) {
+ hp->hln_mode = rattr.va_mode;
+ hp->hln_uid = rattr.va_uid;
+ hp->hln_gid = rattr.va_gid;
+ }
+
+ /*
+ * Initialize linked list of hlnodes so that the back pointer of the
+ * root hlnode always points to the last one on the list and the
+ * forward pointer of the last node is null
+ */
+ hp->hln_back = hp;
+ hp->hln_forw = NULL;
+ hp->hln_nlink = 0;
+ hm->hlm_rootnode = hp;
+
+ hyprlofs_dirinit(hp, hp);
+
+ rw_exit(&hp->hln_rwlock);
+
+ pn_free(&dpn);
+ error = 0;
+
+out:
+ return (error);
+}
+
+static int
+hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr)
+{
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+ hlnode_t *hnp, *cancel;
+ vnode_t *vp;
+ int error;
+
+ if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
+ return (error);
+ if (secpolicy_hyprlofs_control(cr) != 0)
+ return (EPERM);
+
+ /*
+	 * Forced unmount is not supported by this file system,
+	 * so return ENOTSUP.
+ */
+ if (flag & MS_FORCE)
+ return (ENOTSUP);
+
+ mutex_enter(&hm->hlm_contents);
+
+ /*
+ * If there are no open files, only the root node should have a ref cnt.
+ * With hlm_contents held, nothing can be added or removed. There may
+ * be some dirty pages. To prevent fsflush from disrupting the unmount,
+ * put a hold on each node while scanning. If we find a previously
+ * referenced node, undo the holds we have placed and fail EBUSY.
+ */
+ hnp = hm->hlm_rootnode;
+ if (HLNTOV(hnp)->v_count > 1) {
+ mutex_exit(&hm->hlm_contents);
+ return (EBUSY);
+ }
+
+ for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) {
+ if ((vp = HLNTOV(hnp))->v_count > 0) {
+ cancel = hm->hlm_rootnode->hln_forw;
+ while (cancel != hnp) {
+ vp = HLNTOV(cancel);
+ ASSERT(vp->v_count > 0);
+ VN_RELE(vp);
+ cancel = cancel->hln_forw;
+ }
+ mutex_exit(&hm->hlm_contents);
+ return (EBUSY);
+ }
+ VN_HOLD(vp);
+ }
+
+ /* We can drop the mutex now because no one can find this mount */
+ mutex_exit(&hm->hlm_contents);
+
+ /*
+ * Free all alloc'd memory associated with this FS. To do this, we go
+ * through the file list twice, once to remove all the dir entries, and
+ * then to remove all the files.
+ */
+
+ /* Remove all directory entries */
+ for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) {
+ rw_enter(&hnp->hln_rwlock, RW_WRITER);
+ if (hnp->hln_type == VDIR)
+ hyprlofs_dirtrunc(hnp);
+ rw_exit(&hnp->hln_rwlock);
+ }
+
+ ASSERT(hm->hlm_rootnode);
+
+ /*
+ * All links are gone, v_count is keeping nodes in place. VN_RELE
+ * should make the node disappear, unless somebody is holding pages
+ * against it. Wait and retry until it disappears.
+ *
+ * We re-acquire the lock to prevent others who have a HOLD on a hlnode
+ * from blowing it away (in hyprlofs_inactive) while we're trying to
+ * get to it here. Once we have a HOLD on it we know it'll stick around.
+ */
+ mutex_enter(&hm->hlm_contents);
+
+ /* Remove all the files (except the rootnode) backwards. */
+ while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) {
+ mutex_exit(&hm->hlm_contents);
+		/* Note we handled the link counts in the first pass above. */
+ vp = HLNTOV(hnp);
+ VN_RELE(vp);
+ mutex_enter(&hm->hlm_contents);
+ /*
+ * It's still there after the RELE. Someone else like pageout
+ * has a hold on it so wait a bit and then try again.
+ */
+ if (hnp == hm->hlm_rootnode->hln_back) {
+ VN_HOLD(vp);
+ mutex_exit(&hm->hlm_contents);
+ delay(hz / 4);
+ mutex_enter(&hm->hlm_contents);
+ }
+ }
+ mutex_exit(&hm->hlm_contents);
+
+ VN_RELE(HLNTOV(hm->hlm_rootnode));
+
+ ASSERT(hm->hlm_mntpath);
+
+ hyprlofs_memfree(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1);
+
+ mutex_destroy(&hm->hlm_contents);
+ hyprlofs_memfree(hm, sizeof (hlfsmount_t));
+
+ return (0);
+}
+
+/* Return root hlnode for given vnode */
+static int
+hyprlofs_root(vfs_t *vfsp, vnode_t **vpp)
+{
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+ hlnode_t *hp = hm->hlm_rootnode;
+ vnode_t *vp;
+
+ ASSERT(hp);
+
+ vp = HLNTOV(hp);
+ VN_HOLD(vp);
+ *vpp = vp;
+ return (0);
+}
+
+static int
+hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
+{
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+ ulong_t blocks;
+ dev32_t d32;
+ zoneid_t eff_zid;
+ struct zone *zp;
+
+ /*
+ * The FS may have been mounted by the GZ on behalf of the NGZ. In
+	 * that case, the mount's vfs_zone will be the global zone. We want
+ * to show the swap cap inside the zone in this case, even though the
+ * FS was mounted by the GZ.
+ */
+ if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID)
+ zp = curproc->p_zone;
+ else
+ zp = hm->hlm_vfsp->vfs_zone;
+
+ if (zp == NULL)
+ eff_zid = GLOBAL_ZONEUNIQID;
+ else
+ eff_zid = zp->zone_id;
+
+ sbp->f_bsize = PAGESIZE;
+ sbp->f_frsize = PAGESIZE;
+
+ /*
+ * Find the amount of available physical and memory swap
+ */
+ mutex_enter(&anoninfo_lock);
+ ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+ blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP;
+ mutex_exit(&anoninfo_lock);
+
+ if (blocks > hyprlofs_minfree)
+ sbp->f_bfree = blocks - hyprlofs_minfree;
+ else
+ sbp->f_bfree = 0;
+
+ sbp->f_bavail = sbp->f_bfree;
+
+ /*
+	 * Report the total number of blocks as just what's available.
+ */
+ sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree);
+
+ if (eff_zid != GLOBAL_ZONEUNIQID &&
+ zp->zone_max_swap_ctl != UINT64_MAX) {
+ /*
+ * If the fs is used by a NGZ with a swap cap, then report the
+ * capped size.
+ */
+ rctl_qty_t cap, used;
+ pgcnt_t pgcap, pgused;
+
+ mutex_enter(&zp->zone_mem_lock);
+ cap = zp->zone_max_swap_ctl;
+ used = zp->zone_max_swap;
+ mutex_exit(&zp->zone_mem_lock);
+
+ pgcap = btop(cap);
+ pgused = btop(used);
+
+ sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree);
+ sbp->f_bavail = sbp->f_bfree;
+ sbp->f_blocks = MIN(pgcap, sbp->f_blocks);
+ }
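+
+	/*
+	 * E.g. (illustrative, assuming 4K pages): a zone with a 1 GB swap
+	 * cap that has 256 MB in use yields pgcap = 262144 and pgused =
+	 * 65536, so at most 196608 pages are reported free here regardless
+	 * of the system-wide figure computed above.
+	 */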
+
+ /*
+ * This is fairly inaccurate since it doesn't take into account the
+ * names stored in the directory entries.
+ */
+ if (hyprlofs_maxkmem > hyprlofs_kmemspace)
+ sbp->f_ffree = (hyprlofs_maxkmem - hyprlofs_kmemspace) /
+ (sizeof (hlnode_t) + sizeof (hldirent_t));
+ else
+ sbp->f_ffree = 0;
+
+ sbp->f_files = hyprlofs_maxkmem /
+ (sizeof (hlnode_t) + sizeof (hldirent_t));
+ sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
+ (void) cmpldev(&d32, vfsp->vfs_dev);
+ sbp->f_fsid = d32;
+ (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name);
+ (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr));
+ /*
+ * ensure null termination
+ */
+ sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0';
+ sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
+ sbp->f_namemax = MAXNAMELEN - 1;
+ return (0);
+}
+
+static int
+hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp)
+{
+ hlfid_t *hfid;
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+ hlnode_t *hp = NULL;
+
+ hfid = (hlfid_t *)fidp;
+ *vpp = NULL;
+
+ mutex_enter(&hm->hlm_contents);
+ for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) {
+ mutex_enter(&hp->hln_tlock);
+ if (hp->hln_nodeid == hfid->hlfid_ino) {
+ /*
+ * If the gen numbers don't match we know the file
+ * won't be found since only one hlnode can have this
+ * number at a time.
+ */
+ if (hp->hln_gen != hfid->hlfid_gen ||
+ hp->hln_nlink == 0) {
+ mutex_exit(&hp->hln_tlock);
+ mutex_exit(&hm->hlm_contents);
+ return (0);
+ }
+ *vpp = (vnode_t *)HLNTOV(hp);
+
+ VN_HOLD(*vpp);
+
+ if ((hp->hln_mode & S_ISVTX) &&
+ !(hp->hln_mode & (S_IXUSR | S_IFDIR))) {
+ mutex_enter(&(*vpp)->v_lock);
+ (*vpp)->v_flag |= VISSWAP;
+ mutex_exit(&(*vpp)->v_lock);
+ }
+ mutex_exit(&hp->hln_tlock);
+ mutex_exit(&hm->hlm_contents);
+ return (0);
+ }
+ mutex_exit(&hp->hln_tlock);
+ }
+ mutex_exit(&hm->hlm_contents);
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c
new file mode 100644
index 0000000000..45b7d4db87
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c
@@ -0,0 +1,1375 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2012 Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/t_lock.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/user.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/flock.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/cred.h>
+#include <sys/dirent.h>
+#include <sys/pathname.h>
+#include <sys/fs/hyprlofs.h>
+#include <sys/fs/hyprlofs_info.h>
+#include <sys/mman.h>
+#include <vm/pvn.h>
+#include <sys/cmn_err.h>
+#include <sys/buf.h>
+#include <sys/policy.h>
+#include <fs/fs_subr.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+
+static int hyprlofs_add_entry(vnode_t *, char *, char *, cred_t *,
+ caller_context_t *);
+static int hyprlofs_rm_entry(vnode_t *, char *, cred_t *, caller_context_t *,
+ int);
+static int hyprlofs_rm_all(vnode_t *, cred_t *, caller_context_t *, int);
+static int hyprlofs_remove(vnode_t *, char *, cred_t *, caller_context_t *,
+ int);
+static int hyprlofs_get_all(vnode_t *, intptr_t, cred_t *, caller_context_t *,
+ int);
+
+/*
+ * This is a somewhat arbitrary upper limit on the number of entries we can
+ * pass in on a single add/rm ioctl call. This is only used to validate that
+ * the input list looks sane.
+ */
+#define MAX_IOCTL_PARAMS 100000
+
+static int
+hyprlofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+ vnode_t *rvp;
+ int error;
+
+ rvp = REALVP(*vpp);
+
+ if (VTOHLN(*vpp)->hln_looped == 0)
+ return (0);
+
+ /*
+	 * Looped back, so pass through to the real vnode. We need to hold a
+	 * new reference to rvp since VOP_OPEN() may decide to release it.
+ */
+ VN_HOLD(rvp);
+ error = VOP_OPEN(&rvp, flag, cr, ct);
+ ASSERT(rvp->v_count > 1);
+ VN_RELE(rvp);
+
+ return (error);
+}
+
+static int
+hyprlofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+ caller_context_t *ct)
+{
+ if (VTOHLN(vp)->hln_looped == 0) {
+ cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
+ cleanshares(vp, ttoproc(curthread)->p_pid);
+ return (0);
+ }
+
+ return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct));
+}
+
+static int
+hyprlofs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
+ caller_context_t *ct)
+{
+ return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct));
+}
+
+static int
+hyprlofs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
+ caller_context_t *ct)
+{
+ /* We don't support writing to non-regular files */
+ if (vp->v_type != VREG)
+ return (EINVAL);
+
+ if (vn_is_readonly(vp))
+ return (EROFS);
+
+ return (VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct));
+}
+
+/* ARGSUSED */
+static int
+hyprlofs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag,
+ cred_t *cr, int *rvalp, caller_context_t *ct)
+{
+ int len, cnt, error;
+ int i;
+ model_t model;
+ char path[MAXPATHLEN];
+ char nm[MAXPATHLEN];
+
+ /* We only support the hyprlofs ioctls on the root vnode */
+ if (!(vp->v_flag & VROOT))
+ return (ENOTTY);
+
+ /*
+ * Check if managing hyprlofs is allowed.
+ */
+ if (secpolicy_hyprlofs_control(cr) != 0)
+ return (EPERM);
+
+ if (cmd == HYPRLOFS_ADD_ENTRIES || cmd == HYPRLOFS_RM_ENTRIES) {
+ model = get_udatamodel();
+
+ if (model == DATAMODEL_NATIVE) {
+ hyprlofs_entries_t ebuf;
+ hyprlofs_entry_t *e;
+
+ if (copyin((void *)data, &ebuf, sizeof (ebuf)))
+ return (EFAULT);
+ cnt = ebuf.hle_len;
+			if (cnt <= 0 || cnt > MAX_IOCTL_PARAMS)
+ return (EINVAL);
+ len = sizeof (hyprlofs_entry_t) * cnt;
+
+ e = kmem_alloc(len, KM_SLEEP);
+ if (copyin((void *)(ebuf.hle_entries), e, len)) {
+ kmem_free(e, len);
+ return (EFAULT);
+ }
+
+ for (i = 0; i < cnt; i++) {
+				if (e[i].hle_nlen == 0 ||
+				    e[i].hle_nlen >= MAXPATHLEN) {
+					/* leave room for NUL; don't leak e */
+					kmem_free(e, len);
+					return (EINVAL);
+				}
+
+ if (copyin(e[i].hle_name, nm, e[i].hle_nlen)
+ != 0) {
+ kmem_free(e, len);
+ return (EFAULT);
+ }
+ nm[e[i].hle_nlen] = '\0';
+
+ if (cmd == HYPRLOFS_ADD_ENTRIES) {
+					if (e[i].hle_plen == 0 ||
+					    e[i].hle_plen >= MAXPATHLEN) {
+						kmem_free(e, len);
+						return (EINVAL);
+					}
+
+ if (copyin(e[i].hle_path, path,
+ e[i].hle_plen) != 0) {
+ kmem_free(e, len);
+ return (EFAULT);
+ }
+ path[e[i].hle_plen] = '\0';
+
+ if ((error = hyprlofs_add_entry(vp,
+ path, nm, cr, ct)) != 0) {
+ kmem_free(e, len);
+ return (error);
+ }
+ } else {
+ if ((error = hyprlofs_rm_entry(vp, nm,
+ cr, ct, flag)) != 0) {
+ kmem_free(e, len);
+ return (error);
+ }
+ }
+ }
+
+ kmem_free(e, len);
+ return (0);
+
+ } else {
+ hyprlofs_entries32_t ebuf32;
+ hyprlofs_entry32_t *e32;
+
+ if (copyin((void *)data, &ebuf32, sizeof (ebuf32)))
+ return (EFAULT);
+
+ cnt = ebuf32.hle_len;
+			if (cnt <= 0 || cnt > MAX_IOCTL_PARAMS)
+ return (EINVAL);
+ len = sizeof (hyprlofs_entry32_t) * cnt;
+
+ e32 = kmem_alloc(len, KM_SLEEP);
+ if (copyin((void *)(unsigned long)(ebuf32.hle_entries),
+ e32, len)) {
+ kmem_free(e32, len);
+ return (EFAULT);
+ }
+
+ for (i = 0; i < cnt; i++) {
+				if (e32[i].hle_nlen == 0 ||
+				    e32[i].hle_nlen >= MAXPATHLEN) {
+					/* leave room for NUL; don't leak */
+					kmem_free(e32, len);
+					return (EINVAL);
+				}
+
+ if (copyin((void *)(unsigned long)
+ e32[i].hle_name, nm,
+ e32[i].hle_nlen) != 0) {
+ kmem_free(e32, len);
+ return (EFAULT);
+ }
+ nm[e32[i].hle_nlen] = '\0';
+
+ if (cmd == HYPRLOFS_ADD_ENTRIES) {
+					if (e32[i].hle_plen == 0 ||
+					    e32[i].hle_plen >= MAXPATHLEN) {
+						kmem_free(e32, len);
+						return (EINVAL);
+					}
+
+ if (copyin((void *)(unsigned long)
+ e32[i].hle_path, path,
+ e32[i].hle_plen) != 0) {
+ kmem_free(e32, len);
+ return (EFAULT);
+ }
+ path[e32[i].hle_plen] = '\0';
+
+ if ((error = hyprlofs_add_entry(vp,
+ path, nm, cr, ct)) != 0) {
+ kmem_free(e32, len);
+ return (error);
+ }
+ } else {
+ if ((error = hyprlofs_rm_entry(vp, nm,
+ cr, ct, flag)) != 0) {
+ kmem_free(e32, len);
+ return (error);
+ }
+ }
+ }
+
+ kmem_free(e32, len);
+ return (0);
+ }
+ }
+
+ if (cmd == HYPRLOFS_RM_ALL) {
+ return (hyprlofs_rm_all(vp, cr, ct, flag));
+ }
+
+ if (cmd == HYPRLOFS_GET_ENTRIES) {
+ return (hyprlofs_get_all(vp, data, cr, ct, flag));
+ }
+
+ return (ENOTTY);
+}
+
+/*ARGSUSED2*/
+static int
+hyprlofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ hlnode_t *tp = (hlnode_t *)VTOHLN(vp);
+
+ mutex_enter(&tp->hln_tlock);
+ vap->va_type = vp->v_type;
+ vap->va_mode = tp->hln_mode & MODEMASK;
+ vap->va_uid = tp->hln_uid;
+ vap->va_gid = tp->hln_gid;
+ vap->va_fsid = tp->hln_fsid;
+ vap->va_nodeid = (ino64_t)tp->hln_nodeid;
+ vap->va_nlink = tp->hln_nlink;
+ vap->va_size = (u_offset_t)tp->hln_size;
+ vap->va_atime = tp->hln_atime;
+ vap->va_mtime = tp->hln_mtime;
+ vap->va_ctime = tp->hln_ctime;
+ vap->va_blksize = PAGESIZE;
+ vap->va_rdev = tp->hln_rdev;
+ vap->va_seq = tp->hln_seq;
+
+ vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size)));
+ mutex_exit(&tp->hln_tlock);
+ return (0);
+}
+
+/*ARGSUSED4*/
+static int
+hyprlofs_setattr(vnode_t *vp, vattr_t *vap, int flags,
+ cred_t *cr, caller_context_t *ct)
+{
+ hlnode_t *tp = (hlnode_t *)VTOHLN(vp);
+ int error = 0;
+ vattr_t *get;
+ long mask;
+
+ /*
+ * Cannot set these attributes
+ */
+ if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR))
+ return (EINVAL);
+
+ mutex_enter(&tp->hln_tlock);
+
+ get = &tp->hln_attr;
+ /*
+ * Change file access modes. Must be owner or have sufficient
+ * privileges.
+ */
+ error = secpolicy_vnode_setattr(cr, vp, vap, get, flags,
+ hyprlofs_taccess, tp);
+
+ if (error)
+ goto out;
+
+ mask = vap->va_mask;
+
+ if (mask & AT_MODE) {
+ get->va_mode &= S_IFMT;
+ get->va_mode |= vap->va_mode & ~S_IFMT;
+ }
+
+ if (mask & AT_UID)
+ get->va_uid = vap->va_uid;
+ if (mask & AT_GID)
+ get->va_gid = vap->va_gid;
+ if (mask & AT_ATIME)
+ get->va_atime = vap->va_atime;
+ if (mask & AT_MTIME)
+ get->va_mtime = vap->va_mtime;
+
+ if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME))
+ gethrestime(&tp->hln_ctime);
+
+out:
+ mutex_exit(&tp->hln_tlock);
+ return (error);
+}
+
+static int
+hyprlofs_access(vnode_t *vp, int mode, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ hlnode_t *tp = (hlnode_t *)VTOHLN(vp);
+ int error;
+
+ if (mode & VWRITE) {
+ if (vp->v_type == VREG && vn_is_readonly(vp))
+ return (EROFS);
+ }
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct));
+
+ mutex_enter(&tp->hln_tlock);
+ error = hyprlofs_taccess(tp, mode, cr);
+ mutex_exit(&tp->hln_tlock);
+ return (error);
+}
+
+/* ARGSUSED3 */
+static int
+hyprlofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ hlnode_t *tp = (hlnode_t *)VTOHLN(dvp);
+ hlnode_t *ntp = NULL;
+ int error;
+
+ if (VTOHLN(dvp)->hln_looped == 1)
+ return (VOP_LOOKUP(REALVP(dvp), nm, vpp, pnp, flags, rdir,
+ cr, ct, direntflags, realpnp));
+
+ if (flags & LOOKUP_XATTR)
+ return (EINVAL);
+
+ /* Null component name is a synonym for directory being searched. */
+ if (*nm == '\0') {
+ VN_HOLD(dvp);
+ *vpp = dvp;
+ return (0);
+ }
+ ASSERT(tp);
+
+ if ((error = hyprlofs_dirlookup(tp, nm, &ntp, cr)) == 0) {
+ ASSERT(ntp);
+ *vpp = HLNTOV(ntp);
+ }
+ return (error);
+}
+
+/*
+ * Create the loopback from the hyprlofs vnode to the real vnode.
+ */
+static int
+hyprlofs_loopback(vnode_t *dvp, vnode_t *rvp, char *nm, vattr_t *vap,
+ int mode, cred_t *cr, caller_context_t *ct)
+{
+ hlnode_t *parent;
+ hlfsmount_t *tm;
+ int error;
+ hlnode_t *oldtp;
+ vnode_t *vp;
+
+ parent = (hlnode_t *)VTOHLN(dvp);
+ tm = (hlfsmount_t *)VTOHLM(dvp);
+ error = 0;
+ oldtp = NULL;
+
+ if (vap->va_type == VREG && (vap->va_mode & VSVTX)) {
+ /* we don't support the sticky bit */
+ vap->va_mode &= ~VSVTX;
+ } else if (vap->va_type == VNON) {
+ return (EINVAL);
+ }
+
+ /* Null component name is a synonym for directory being searched. */
+ if (*nm == '\0') {
+ VN_HOLD(dvp);
+ oldtp = parent;
+ } else {
+ error = hyprlofs_dirlookup(parent, nm, &oldtp, cr);
+ }
+
+ if (error == 0) { /* name found */
+ ASSERT(oldtp);
+
+ rw_enter(&oldtp->hln_rwlock, RW_WRITER);
+
+		/*
+		 * Opening an existing dir read-only is allowed, but creating
+		 * over it (i.e. with write access) is not.
+		 */
+ if ((oldtp->hln_type == VDIR) && (mode & VWRITE))
+ error = EISDIR;
+ else {
+ error = hyprlofs_taccess(oldtp, mode, cr);
+ }
+
+ if (error) {
+ rw_exit(&oldtp->hln_rwlock);
+ hlnode_rele(oldtp);
+ return (error);
+ }
+
+ vp = HLNTOV(oldtp);
+ rw_exit(&oldtp->hln_rwlock);
+
+ if (vp->v_type == VREG) {
+ hlnode_rele(oldtp);
+ return (EEXIST);
+ }
+
+ vnevent_create(vp, ct);
+ return (0);
+ }
+
+ if (error != ENOENT)
+ return (error);
+
+ rw_enter(&parent->hln_rwlock, RW_WRITER);
+ error = hyprlofs_direnter(tm, parent, nm, DE_CREATE, rvp, vap, NULL,
+ cr);
+ rw_exit(&parent->hln_rwlock);
+
+ return (error);
+}
+
+/*
+ * Create an in-memory directory based on the add-entry ioctl name.
+ * If the dir exists, return EEXIST but still also return node in vpp.
+ */
+static int
+hyprlofs_mkdir(vnode_t *dvp, char *nm, vattr_t *va, vnode_t **vpp, cred_t *cr)
+{
+ hlnode_t *parent = (hlnode_t *)VTOHLN(dvp);
+ hlnode_t *self = NULL;
+ hlfsmount_t *tm = (hlfsmount_t *)VTOHLM(dvp);
+ int error;
+
+ /*
+ * Might be dangling directory. Catch it here, because a ENOENT return
+ * from hyprlofs_dirlookup() is a valid return.
+ */
+ if (parent->hln_nlink == 0)
+ return (ENOENT);
+
+ error = hyprlofs_dirlookup(parent, nm, &self, cr);
+ if (error == 0) {
+ ASSERT(self);
+		ASSERT(self);
+		/* We can't loop in under a looped in directory */
+		if (self->hln_looped) {
+			hlnode_rele(self);
+			return (EACCES);
+		}
+		/* Read from self before dropping our hold on it. */
+		*vpp = HLNTOV(self);
+		hlnode_rele(self);
+		return (EEXIST);
+ if (error != ENOENT)
+ return (error);
+
+ rw_enter(&parent->hln_rwlock, RW_WRITER);
+ error = hyprlofs_direnter(tm, parent, nm, DE_MKDIR, (vnode_t *)NULL,
+ va, &self, cr);
+ rw_exit(&parent->hln_rwlock);
+
+ if (error == 0 || error == EEXIST) {
+		/* Take *vpp from self before dropping the hold on it. */
+		*vpp = HLNTOV(self);
+		hlnode_rele(self);
+ }
+
+ return (error);
+}
+
+/*
+ * Loop in a file or directory into the namespace.
+ */
+static int
+hyprlofs_add_entry(vnode_t *vp, char *fspath, char *fsname,
+ cred_t *cr, caller_context_t *ct)
+{
+ int error;
+ char *p, *pnm;
+ vnode_t *realvp, *dvp;
+ vattr_t va;
+
+ /* Get vnode for the real file/dir. */
+ if (error = lookupname(fspath, UIO_SYSSPACE, FOLLOW, NULLVPP, &realvp))
+ return (error);
+
+ /* no devices allowed */
+	if (IS_DEVVP(realvp)) {
+		VN_RELE(realvp);
+		return (ENODEV);
+	}
+
+ /*
+ * realvp may be an AUTOFS node, in which case we perform a VOP_ACCESS
+ * to trigger the mount of the intended filesystem. This causes a
+ * loopback mount of the intended filesystem instead of the AUTOFS
+ * filesystem.
+ */
+ if ((error = VOP_ACCESS(realvp, 0, 0, cr, NULL)) != 0) {
+ VN_RELE(realvp);
+ return (error);
+ }
+
+ /*
+	 * We're interested in the topmost filesystem. This is especially
+ * important when fspath is a trigger AUTOFS node, since we're really
+ * interested in mounting the filesystem AUTOFS mounted as result of
+ * the VOP_ACCESS() call not the AUTOFS node itself.
+ */
+ if (vn_mountedvfs(realvp) != NULL) {
+ if (error = traverse(&realvp)) {
+ VN_RELE(realvp);
+ return (error);
+ }
+ }
+
+ va.va_type = VNON;
+ /*
+ * If the target name is a path, make sure we have all of the
+ * intermediate directories, creating them if necessary.
+ */
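+	/*
+	 * E.g. (an arbitrary example) a name of "a/b/c" walks the loop below
+	 * twice, creating the in-memory dirs "a" and "a/b" if needed, and
+	 * leaves pnm pointing at "c" for the loopback below.
+	 */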
+ dvp = vp;
+ pnm = p = fsname;
+
+ /* path cannot be absolute */
+	if (*p == '/') {
+		VN_RELE(realvp);
+		return (EINVAL);
+	}
+
+	for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) {
+		if (va.va_type == VNON) {
+			/* use the top-level dir as the template va for mkdir */
+			if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) != 0) {
+				VN_RELE(realvp);
+				return (error);
+			}
+		}
+
+		*p = '\0';
+
+		/* Path component cannot be empty or relative */
+		if (pnm[0] == '\0' || (pnm[0] == '.' && pnm[1] == '.')) {
+			VN_RELE(realvp);
+			return (EINVAL);
+		}
+
+		if ((error = hyprlofs_mkdir(dvp, pnm, &va, &dvp, cr)) != 0 &&
+		    error != EEXIST) {
+			VN_RELE(realvp);
+			return (error);
+		}
+
+		*p = '/';
+		pnm = p + 1;
+	}
+
+ /* The file name is required */
+	if (pnm[0] == '\0') {
+		VN_RELE(realvp);
+		return (EINVAL);
+	}
+
+	/* Now use the real file's va as the template va */
+	if ((error = VOP_GETATTR(realvp, &va, 0, cr, NULL)) != 0) {
+		VN_RELE(realvp);
+		return (error);
+	}
+
+ /* Make the vnode */
+ return (hyprlofs_loopback(dvp, realvp, pnm, &va, va.va_mode, cr, ct));
+}
+
+/*
+ * Remove a looped in file from the namespace.
+ */
+static int
+hyprlofs_rm_entry(vnode_t *dvp, char *fsname, cred_t *cr, caller_context_t *ct,
+ int flags)
+{
+ int error;
+ char *p, *pnm;
+ hlnode_t *parent;
+ hlnode_t *fndtp;
+
+ pnm = p = fsname;
+
+ /* path cannot be absolute */
+ if (*p == '/')
+ return (EINVAL);
+
+ /*
+ * If the target name is a path, get the containing dir and simple
+ * file name.
+ */
+ parent = (hlnode_t *)VTOHLN(dvp);
+ for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) {
+ *p = '\0';
+
+ /* Path component cannot be empty or relative */
+ if (pnm[0] == '\0' || (pnm[0] == '.' && pnm[1] == '.'))
+ return (EINVAL);
+
+		if ((error = hyprlofs_dirlookup(parent, pnm, &fndtp, cr)) != 0)
+			return (error);
+
+		/*
+		 * The lookup held fndtp; drop that hold now, since the dir
+		 * entry keeps the intermediate dir alive while we walk it.
+		 */
+		hlnode_rele(fndtp);
+		dvp = HLNTOV(fndtp);
+		parent = fndtp;
+		pnm = p + 1;
+ }
+
+ /* The file name is required */
+ if (pnm[0] == '\0')
+ return (EINVAL);
+
+ /* Remove the entry from the parent dir */
+ return (hyprlofs_remove(dvp, pnm, cr, ct, flags));
+}
+
+/*
+ * Remove all looped in files from the namespace.
+ */
+static int
+hyprlofs_rm_all(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
+ int flags)
+{
+ int error = 0;
+ hlnode_t *hp = (hlnode_t *)VTOHLN(dvp);
+ hldirent_t *hdp;
+
+ hlnode_hold(hp);
+
+ /*
+ * There's a window here where someone could have removed
+ * all the entries in the directory after we put a hold on the
+	 * vnode. Just return.
+ */
+ if (hp->hln_dir == NULL) {
+ if (hp->hln_nlink) {
+ panic("empty directory 0x%p", (void *)hp);
+ /*NOTREACHED*/
+ }
+ goto done;
+ }
+
+ hdp = hp->hln_dir;
+ while (hdp) {
+ hlnode_t *fndhp;
+
+ if (strcmp(hdp->hld_name, ".") == 0 ||
+ strcmp(hdp->hld_name, "..") == 0) {
+ hdp = hdp->hld_next;
+ continue;
+ }
+
+ /* This holds the fndhp vnode */
+ error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr);
+ if (error != 0)
+ goto done;
+ hlnode_rele(fndhp);
+
+ if (fndhp->hln_looped == 0) {
+ /* recursively remove contents of this subdir */
+ if (fndhp->hln_type == VDIR) {
+ vnode_t *tvp = HLNTOV(fndhp);
+
+ error = hyprlofs_rm_all(tvp, cr, ct, flags);
+ if (error != 0)
+ goto done;
+ }
+ }
+
+ /* remove the entry */
+ error = hyprlofs_remove(dvp, hdp->hld_name, cr, ct, flags);
+ if (error != 0)
+ goto done;
+
+ hdp = hp->hln_dir;
+ }
+
+done:
+ hlnode_rele(hp);
+ return (error);
+}
+
+/*
+ * Get a list of all looped in files in the namespace.
+ */
+static int
+hyprlofs_get_all_entries(vnode_t *dvp, hyprlofs_curr_entry_t *hcp,
+ char *prefix, int *pcnt, int n_max,
+ cred_t *cr, caller_context_t *ct, int flags)
+{
+ int error = 0;
+ int too_big = 0;
+ int cnt;
+ int len;
+ hlnode_t *hp = (hlnode_t *)VTOHLN(dvp);
+ hldirent_t *hdp;
+ char *path;
+
+ cnt = *pcnt;
+ path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ hlnode_hold(hp);
+
+ /*
+ * There's a window here where someone could have removed
+ * all the entries in the directory after we put a hold on the
+	 * vnode. Just return.
+ */
+ if (hp->hln_dir == NULL) {
+ if (hp->hln_nlink) {
+ panic("empty directory 0x%p", (void *)hp);
+ /*NOTREACHED*/
+ }
+ goto done;
+ }
+
+ hdp = hp->hln_dir;
+ while (hdp) {
+ hlnode_t *fndhp;
+ vnode_t *tvp;
+
+ if (strcmp(hdp->hld_name, ".") == 0 ||
+ strcmp(hdp->hld_name, "..") == 0) {
+ hdp = hdp->hld_next;
+ continue;
+ }
+
+ /* This holds the fndhp vnode */
+ error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr);
+ if (error != 0)
+ goto done;
+ hlnode_rele(fndhp);
+
+ if (fndhp->hln_looped == 0) {
+ /* recursively get contents of this subdir */
+ VERIFY(fndhp->hln_type == VDIR);
+ tvp = HLNTOV(fndhp);
+
+ if (*prefix == '\0')
+ (void) strlcpy(path, hdp->hld_name, MAXPATHLEN);
+ else
+ (void) snprintf(path, MAXPATHLEN, "%s/%s",
+ prefix, hdp->hld_name);
+
+ error = hyprlofs_get_all_entries(tvp, hcp, path,
+ &cnt, n_max, cr, ct, flags);
+
+ if (error == E2BIG) {
+ too_big = 1;
+ error = 0;
+ }
+ if (error != 0)
+ goto done;
+ } else {
+ if (cnt < n_max) {
+ char *p;
+
+ if (*prefix == '\0')
+ (void) strlcpy(path, hdp->hld_name,
+ MAXPATHLEN);
+ else
+ (void) snprintf(path, MAXPATHLEN,
+ "%s/%s", prefix, hdp->hld_name);
+
+ len = strlen(path);
+ ASSERT(len <= MAXPATHLEN);
+ if (copyout(path, (void *)(hcp[cnt].hce_name),
+ len)) {
+ error = EFAULT;
+ goto done;
+ }
+
+ tvp = REALVP(HLNTOV(fndhp));
+ if (tvp->v_path == NULL) {
+ p = "<unknown>";
+ } else {
+ p = tvp->v_path;
+ }
+ len = strlen(p);
+ ASSERT(len <= MAXPATHLEN);
+ if (copyout(p, (void *)(hcp[cnt].hce_path),
+ len)) {
+ error = EFAULT;
+ goto done;
+ }
+ }
+
+ cnt++;
+ if (cnt > n_max)
+ too_big = 1;
+ }
+
+ hdp = hdp->hld_next;
+ }
+
+done:
+ hlnode_rele(hp);
+ kmem_free(path, MAXPATHLEN);
+
+ *pcnt = cnt;
+ if (error == 0 && too_big == 1)
+ error = E2BIG;
+
+ return (error);
+}
+
+/*
+ * Return a list of all looped in files in the namespace.
+ */
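+/*
+ * Usage note (illustrative): since hce_cnt is copied back out with the
+ * total entry count even when E2BIG is returned, a caller can pass
+ * hce_cnt = 0 to learn how many entries exist, size its buffer, and then
+ * repeat the ioctl with that count.
+ */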
+static int
+hyprlofs_get_all(vnode_t *dvp, intptr_t data, cred_t *cr, caller_context_t *ct,
+ int flags)
+{
+ int limit, cnt, error;
+ model_t model;
+ hyprlofs_curr_entry_t *e;
+
+ model = get_udatamodel();
+
+ if (model == DATAMODEL_NATIVE) {
+ hyprlofs_curr_entries_t ebuf;
+
+ if (copyin((void *)data, &ebuf, sizeof (ebuf)))
+ return (EFAULT);
+ limit = ebuf.hce_cnt;
+ e = ebuf.hce_entries;
+ if (limit > MAX_IOCTL_PARAMS)
+ return (EINVAL);
+
+ } else {
+ hyprlofs_curr_entries32_t ebuf32;
+
+ if (copyin((void *)data, &ebuf32, sizeof (ebuf32)))
+ return (EFAULT);
+
+ limit = ebuf32.hce_cnt;
+ e = (hyprlofs_curr_entry_t *)(unsigned long)
+ (ebuf32.hce_entries);
+ if (limit > MAX_IOCTL_PARAMS)
+ return (EINVAL);
+ }
+
+ cnt = 0;
+ error = hyprlofs_get_all_entries(dvp, e, "", &cnt, limit, cr, ct,
+ flags);
+
+ if (error == 0 || error == E2BIG) {
+ if (model == DATAMODEL_NATIVE) {
+ hyprlofs_curr_entries_t ebuf;
+
+ ebuf.hce_cnt = cnt;
+ if (copyout(&ebuf, (void *)data, sizeof (ebuf)))
+ return (EFAULT);
+
+ } else {
+ hyprlofs_curr_entries32_t ebuf32;
+
+ ebuf32.hce_cnt = cnt;
+ if (copyout(&ebuf32, (void *)data, sizeof (ebuf32)))
+ return (EFAULT);
+ }
+ }
+
+ return (error);
+}
+
+/* ARGSUSED3 */
+static int
+hyprlofs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
+ int flags)
+{
+ hlnode_t *parent = (hlnode_t *)VTOHLN(dvp);
+ int error;
+ hlnode_t *hp = NULL;
+
+ /* This holds the hp vnode */
+ error = hyprlofs_dirlookup(parent, nm, &hp, cr);
+ if (error)
+ return (error);
+
+ ASSERT(hp);
+ rw_enter(&parent->hln_rwlock, RW_WRITER);
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+
+ error = hyprlofs_dirdelete(parent, hp, nm, DR_REMOVE, cr);
+
+ rw_exit(&hp->hln_rwlock);
+ rw_exit(&parent->hln_rwlock);
+ vnevent_remove(HLNTOV(hp), dvp, nm, ct);
+ hlnode_rele(hp);
+
+ return (error);
+}
+
+/* ARGSUSED4 */
+static int
+hyprlofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
+ caller_context_t *ct, int flags)
+{
+ hlnode_t *parent = (hlnode_t *)VTOHLN(dvp);
+ hlnode_t *self = NULL;
+ vnode_t *vp;
+ int error = 0;
+
+ /* Return error if removing . or .. */
+ if (strcmp(nm, ".") == 0)
+ return (EINVAL);
+ if (strcmp(nm, "..") == 0)
+ return (EEXIST); /* Should be ENOTEMPTY */
+ error = hyprlofs_dirlookup(parent, nm, &self, cr);
+ if (error)
+ return (error);
+
+ rw_enter(&parent->hln_rwlock, RW_WRITER);
+ rw_enter(&self->hln_rwlock, RW_WRITER);
+
+ vp = HLNTOV(self);
+ if (vp == dvp || vp == cdir) {
+ error = EINVAL;
+ goto done1;
+ }
+ if (self->hln_type != VDIR) {
+ error = ENOTDIR;
+ goto done1;
+ }
+
+ /*
+ * When a dir is looped in, we only remove the in-memory dir, not the
+ * backing dir.
+ */
+ if (self->hln_looped == 0) {
+ mutex_enter(&self->hln_tlock);
+ if (self->hln_nlink > 2) {
+ mutex_exit(&self->hln_tlock);
+ error = EEXIST;
+ goto done1;
+ }
+ mutex_exit(&self->hln_tlock);
+
+ if (vn_vfswlock(vp)) {
+ error = EBUSY;
+ goto done1;
+ }
+ if (vn_mountedvfs(vp) != NULL) {
+ error = EBUSY;
+ goto done;
+ }
+
+ /*
+ * Check for an empty directory, i.e. only includes entries for
+ * "." and ".."
+ */
+ if (self->hln_dirents > 2) {
+ error = EEXIST; /* SIGH should be ENOTEMPTY */
+ /*
+ * Update atime because checking hln_dirents is
+ * equivalent to reading the directory
+ */
+ gethrestime(&self->hln_atime);
+ goto done;
+ }
+
+ error = hyprlofs_dirdelete(parent, self, nm, DR_RMDIR, cr);
+ } else {
+ error = hyprlofs_dirdelete(parent, self, nm, DR_REMOVE, cr);
+ }
+
+done:
+ if (self->hln_looped == 0)
+ vn_vfsunlock(vp);
+done1:
+ rw_exit(&self->hln_rwlock);
+ rw_exit(&parent->hln_rwlock);
+ vnevent_rmdir(HLNTOV(self), dvp, nm, ct);
+ hlnode_rele(self);
+
+ return (error);
+}
+
+static int
+hyprlofs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ hlnode_t *hp = (hlnode_t *)VTOHLN(vp);
+ hldirent_t *hdp;
+ int error = 0;
+ size_t namelen;
+ struct dirent64 *dp;
+ ulong_t offset;
+ ulong_t total_bytes_wanted;
+ long outcount = 0;
+ long bufsize;
+ int reclen;
+ caddr_t outbuf;
+
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_READDIR(REALVP(vp), uiop, cr, eofp, ct, flags));
+
+ if (uiop->uio_loffset >= MAXOFF_T) {
+ if (eofp)
+ *eofp = 1;
+ return (0);
+ }
+	/* assume the caller has already acquired hln_rwlock via VOP_RWLOCK */
+ ASSERT(RW_READ_HELD(&hp->hln_rwlock));
+
+ if (uiop->uio_iovcnt != 1)
+ return (EINVAL);
+
+ if (vp->v_type != VDIR)
+ return (ENOTDIR);
+
+ /*
+ * There's a window here where someone could have removed
+ * all the entries in the directory after we put a hold on the
+ * vnode but before we grabbed the rwlock. Just return.
+ */
+ if (hp->hln_dir == NULL) {
+ if (hp->hln_nlink) {
+ panic("empty directory 0x%p", (void *)hp);
+ /*NOTREACHED*/
+ }
+ return (0);
+ }
+
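+	/*
+	 * Note: offsets here are the phony per-entry slot offsets assigned
+	 * at enter time, not byte offsets: "." is 0, ".." is 1, and d_off
+	 * for each entry is its slot plus one, so a resumed readdir picks
+	 * up at the next occupied slot even if earlier entries have since
+	 * been removed.
+	 */
+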
+ /* Get space for multiple dir entries */
+ total_bytes_wanted = uiop->uio_iov->iov_len;
+ bufsize = total_bytes_wanted + sizeof (struct dirent64);
+ outbuf = kmem_alloc(bufsize, KM_SLEEP);
+
+ dp = (struct dirent64 *)((uintptr_t)outbuf);
+
+ offset = 0;
+ hdp = hp->hln_dir;
+ while (hdp) {
+ namelen = strlen(hdp->hld_name); /* no +1 needed */
+ offset = hdp->hld_offset;
+ if (offset >= uiop->uio_offset) {
+ reclen = (int)DIRENT64_RECLEN(namelen);
+ if (outcount + reclen > total_bytes_wanted) {
+ if (!outcount)
+ /* Buffer too small for any entries. */
+ error = EINVAL;
+ break;
+ }
+ ASSERT(hdp->hld_hlnode != NULL);
+
+ /* zero out uninitialized bytes */
+ (void) strncpy(dp->d_name, hdp->hld_name,
+ DIRENT64_NAMELEN(reclen));
+ dp->d_reclen = (ushort_t)reclen;
+ dp->d_ino = (ino64_t)hdp->hld_hlnode->hln_nodeid;
+ dp->d_off = (offset_t)hdp->hld_offset + 1;
+ dp = (struct dirent64 *)
+ ((uintptr_t)dp + dp->d_reclen);
+ outcount += reclen;
+ ASSERT(outcount <= bufsize);
+ }
+ hdp = hdp->hld_next;
+ }
+
+ if (!error)
+ error = uiomove(outbuf, outcount, UIO_READ, uiop);
+
+ if (!error) {
+ /*
+ * If we reached the end of the list our offset should now be
+ * just past the end.
+ */
+ if (!hdp) {
+ offset += 1;
+ if (eofp)
+ *eofp = 1;
+ } else if (eofp)
+ *eofp = 0;
+ uiop->uio_offset = offset;
+ }
+ gethrestime(&hp->hln_atime);
+ kmem_free(outbuf, bufsize);
+ return (error);
+}
+
+static int
+hyprlofs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
+{
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct));
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+hyprlofs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ hlnode_t *hp = (hlnode_t *)VTOHLN(vp);
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vp->v_vfsp);
+
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+
+ mutex_enter(&hp->hln_tlock);
+ mutex_enter(&vp->v_lock);
+ ASSERT(vp->v_count >= 1);
+
+ /*
+ * If we don't have the last hold or the link count is non-zero,
+ * there's nothing to do except drop our hold.
+ */
+ if (vp->v_count > 1 || hp->hln_nlink != 0) {
+ vp->v_count--;
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&hp->hln_tlock);
+ rw_exit(&hp->hln_rwlock);
+ return;
+ }
+
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&hp->hln_tlock);
+
+ /* Here's our chance to send invalid event while we're between locks */
+ vn_invalid(HLNTOV(hp));
+
+ mutex_enter(&hm->hlm_contents);
+ if (hp->hln_forw == NULL)
+ hm->hlm_rootnode->hln_back = hp->hln_back;
+ else
+ hp->hln_forw->hln_back = hp->hln_back;
+ hp->hln_back->hln_forw = hp->hln_forw;
+ mutex_exit(&hm->hlm_contents);
+ rw_exit(&hp->hln_rwlock);
+ rw_destroy(&hp->hln_rwlock);
+ mutex_destroy(&hp->hln_tlock);
+ vn_free(HLNTOV(hp));
+ hyprlofs_memfree(hp, sizeof (hlnode_t));
+}
+
+static int
+hyprlofs_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct)
+{
+ hlnode_t *hp = (hlnode_t *)VTOHLN(vp);
+ hlfid_t *hfid;
+
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_FID(REALVP(vp), fidp, ct));
+
+ if (fidp->fid_len < (sizeof (hlfid_t) - sizeof (ushort_t))) {
+ fidp->fid_len = sizeof (hlfid_t) - sizeof (ushort_t);
+ return (ENOSPC);
+ }
+
+ hfid = (hlfid_t *)fidp;
+ bzero(hfid, sizeof (hlfid_t));
+ hfid->hlfid_len = (int)sizeof (hlfid_t) - sizeof (ushort_t);
+
+ hfid->hlfid_ino = hp->hln_nodeid;
+ hfid->hlfid_gen = hp->hln_gen;
+
+ return (0);
+}
+
+static int
+hyprlofs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
+ page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
+ cred_t *cr, caller_context_t *ct)
+{
+ ASSERT(VTOHLN(vp)->hln_looped == 1);
+ return (VOP_GETPAGE(REALVP(vp), off, len, protp, pl, plsz, seg, addr,
+ rw, cr, ct));
+}
+
+int
+hyprlofs_putpage(vnode_t *vp, offset_t off, size_t len, int flags,
+ cred_t *cr, caller_context_t *ct)
+{
+ ASSERT(VTOHLN(vp)->hln_looped == 1);
+ return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct));
+}
+
+static int
+hyprlofs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ ASSERT(VTOHLN(vp)->hln_looped == 1);
+ return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot, flags,
+ cr, ct));
+}
+
+static int
+hyprlofs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ ASSERT(VTOHLN(vp)->hln_looped == 1);
+ return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot, maxprot,
+ flags, cr, ct));
+}
+
+static int
+hyprlofs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+ size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ ASSERT(VTOHLN(vp)->hln_looped == 1);
+ return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot, maxprot,
+ flags, cr, ct));
+}
+
+static int
+hyprlofs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
+ offset_t offset, cred_t *cr, caller_context_t *ct)
+{
+ ASSERT(VTOHLN(vp)->hln_looped == 1);
+ return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct));
+}
+
+static int
+hyprlofs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
+ caller_context_t *ct)
+{
+ ASSERT(VTOHLN(vp)->hln_looped == 1);
+ return (VOP_SEEK(REALVP(vp), ooff, noffp, ct));
+}
+
+static int
+hyprlofs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct)
+{
+ hlnode_t *hp = VTOHLN(vp);
+
+ if (hp->hln_looped == 1)
+ return (VOP_RWLOCK(REALVP(vp), write_lock, ct));
+
+ if (write_lock) {
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+ } else {
+ rw_enter(&hp->hln_rwlock, RW_READER);
+ }
+ return (write_lock);
+}
+
+static void
+hyprlofs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct)
+{
+ hlnode_t *hp = VTOHLN(vp);
+
+ if (hp->hln_looped == 1) {
+ VOP_RWUNLOCK(REALVP(vp), write_lock, ct);
+ return;
+ }
+
+ rw_exit(&hp->hln_rwlock);
+}
+
+static int
+hyprlofs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
+ caller_context_t *ct)
+{
+ int error;
+
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct));
+
+ switch (cmd) {
+ case _PC_XATTR_ENABLED:
+ case _PC_XATTR_EXISTS:
+ case _PC_SATTR_ENABLED:
+ case _PC_SATTR_EXISTS:
+ error = EINVAL;
+ break;
+ case _PC_TIMESTAMP_RESOLUTION:
+ /* nanosecond timestamp resolution */
+ *valp = 1L;
+ error = 0;
+ break;
+ default:
+ error = fs_pathconf(vp, cmd, valp, cr, ct);
+ }
+ return (error);
+}
+
+
+struct vnodeops *hyprlofs_vnodeops;
+
+const fs_operation_def_t hyprlofs_vnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = hyprlofs_open },
+ VOPNAME_CLOSE, { .vop_close = hyprlofs_close },
+ VOPNAME_READ, { .vop_read = hyprlofs_read },
+ VOPNAME_WRITE, { .vop_write = hyprlofs_write },
+ VOPNAME_IOCTL, { .vop_ioctl = hyprlofs_ioctl },
+ VOPNAME_GETATTR, { .vop_getattr = hyprlofs_getattr },
+ VOPNAME_SETATTR, { .vop_setattr = hyprlofs_setattr },
+ VOPNAME_ACCESS, { .vop_access = hyprlofs_access },
+ VOPNAME_LOOKUP, { .vop_lookup = hyprlofs_lookup },
+ VOPNAME_CREATE, { .error = fs_error },
+ VOPNAME_REMOVE, { .vop_remove = hyprlofs_remove },
+ VOPNAME_LINK, { .error = fs_error },
+ VOPNAME_RENAME, { .error = fs_error },
+ VOPNAME_MKDIR, { .error = fs_error },
+ VOPNAME_RMDIR, { .vop_rmdir = hyprlofs_rmdir },
+ VOPNAME_READDIR, { .vop_readdir = hyprlofs_readdir },
+ VOPNAME_SYMLINK, { .error = fs_error },
+ VOPNAME_READLINK, { .error = fs_error },
+ VOPNAME_FSYNC, { .vop_fsync = hyprlofs_fsync },
+ VOPNAME_INACTIVE, { .vop_inactive = hyprlofs_inactive },
+ VOPNAME_FID, { .vop_fid = hyprlofs_fid },
+ VOPNAME_RWLOCK, { .vop_rwlock = hyprlofs_rwlock },
+ VOPNAME_RWUNLOCK, { .vop_rwunlock = hyprlofs_rwunlock },
+ VOPNAME_SEEK, { .vop_seek = hyprlofs_seek },
+ VOPNAME_SPACE, { .vop_space = hyprlofs_space },
+ VOPNAME_GETPAGE, { .vop_getpage = hyprlofs_getpage },
+ VOPNAME_PUTPAGE, { .vop_putpage = hyprlofs_putpage },
+ VOPNAME_MAP, { .vop_map = hyprlofs_map },
+ VOPNAME_ADDMAP, { .vop_addmap = hyprlofs_addmap },
+ VOPNAME_DELMAP, { .vop_delmap = hyprlofs_delmap },
+ VOPNAME_PATHCONF, { .vop_pathconf = hyprlofs_pathconf },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ NULL, NULL
+};
diff --git a/usr/src/uts/common/fs/lxproc/lxpr_subr.c b/usr/src/uts/common/fs/lxproc/lxpr_subr.c
new file mode 100644
index 0000000000..c6cfdd9d6a
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxpr_subr.c
@@ -0,0 +1,515 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/varargs.h>
+#include <sys/cpuvar.h>
+#include <sys/mman.h>
+#include <sys/vmsystm.h>
+#include <sys/prsystm.h>
+
+#include "lxproc.h"
+
+#define LXPRCACHE_NAME "lxpr_cache"
+
+static int lxpr_node_constructor(void *, void *, int);
+static void lxpr_node_destructor(void *, void *);
+
+static kmem_cache_t *lxpr_node_cache;
+
+struct lxpr_uiobuf {
+ uio_t *uiop;
+ char *buffer;
+ uint32_t buffsize;
+ char *pos;
+ size_t beg;
+ int error;
+};
+
+#define BUFSIZE 4000
+
+struct lxpr_uiobuf *
+lxpr_uiobuf_new(uio_t *uiop)
+{
+ /* Allocate memory for both lxpr_uiobuf and output buffer */
+ struct lxpr_uiobuf *uiobuf =
+ kmem_alloc(sizeof (struct lxpr_uiobuf) + BUFSIZE, KM_SLEEP);
+
+ uiobuf->uiop = uiop;
+ uiobuf->buffer = (char *)&uiobuf[1];
+ uiobuf->buffsize = BUFSIZE;
+ uiobuf->pos = uiobuf->buffer;
+ uiobuf->beg = 0;
+ uiobuf->error = 0;
+
+ return (uiobuf);
+}
+
+void
+lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf)
+{
+ ASSERT(uiobuf != NULL);
+ ASSERT(uiobuf->pos == uiobuf->buffer);
+
+ kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize);
+}
+
+void
+lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset)
+{
+ uiobuf->uiop->uio_offset = (off_t)offset;
+}
+
+void
+lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err)
+{
+ ASSERT(uiobuf->error == 0);
+
+ uiobuf->error = err;
+}
+
+int
+lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf)
+{
+ off_t off = uiobuf->uiop->uio_offset;
+ caddr_t uaddr = uiobuf->buffer;
+ size_t beg = uiobuf->beg;
+ size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr;
+
+ if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+ ASSERT(off >= beg);
+
+ if (beg + size > off && off >= 0)
+ uiobuf->error =
+ uiomove(uaddr + (off - beg), size - (off - beg),
+ UIO_READ, uiobuf->uiop);
+
+ uiobuf->beg += size;
+ }
+
+ uiobuf->pos = uaddr;
+
+ return (uiobuf->error);
+}
+
+void
+lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size)
+{
+ /* While we can still carry on */
+ while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+ uintptr_t remain = (uintptr_t)uiobuf->buffsize -
+ ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer);
+
+ /* Enough space in buffer? */
+ if (remain >= size) {
+ bcopy(buf, uiobuf->pos, size);
+ uiobuf->pos += size;
+ return;
+ }
+
+ /* Not enough space, so copy all we can and try again */
+ bcopy(buf, uiobuf->pos, remain);
+ uiobuf->pos += remain;
+ (void) lxpr_uiobuf_flush(uiobuf);
+ buf += remain;
+ size -= remain;
+ }
+}
+
+#define TYPBUFFSIZE 256
+
+void
+lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...)
+{
+ va_list args;
+ char buff[TYPBUFFSIZE];
+ int len;
+ char *buffer;
+
+ /* Can we still do any output? */
+ if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0)
+ return;
+
+ va_start(args, fmt);
+
+ /* Try using stack allocated buffer */
+ len = vsnprintf(buff, TYPBUFFSIZE, fmt, args);
+ if (len < TYPBUFFSIZE) {
+ va_end(args);
+ lxpr_uiobuf_write(uiobuf, buff, len);
+ return;
+ }
+
+ /* Not enough space in pre-allocated buffer */
+ buffer = kmem_alloc(len + 1, KM_SLEEP);
+
+ /*
+ * We know we allocated the correct amount of space
+ * so no check on the return value
+ */
+ (void) vsnprintf(buffer, len+1, fmt, args);
+ lxpr_uiobuf_write(uiobuf, buffer, len);
+ va_end(args);
+ kmem_free(buffer, len+1);
+}
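+
+/*
+ * The expected life cycle of a uiobuf, as used by lxpr_read() in
+ * lxpr_vnops.c: wrap the caller's uio, emit formatted output, flush the
+ * result out through uiomove(), and free the buffer. A minimal sketch
+ * (the "value" label is illustrative):
+ *
+ * struct lxpr_uiobuf *ub = lxpr_uiobuf_new(uiop);
+ * int error;
+ *
+ * lxpr_uiobuf_printf(ub, "value:\t%d\n", 42);
+ * error = lxpr_uiobuf_flush(ub);
+ * lxpr_uiobuf_free(ub);
+ */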
+
+/*
+ * lxpr_lock():
+ *
+ * Lookup process from pid and return with p_plock and P_PR_LOCK held.
+ */
+proc_t *
+lxpr_lock(pid_t pid)
+{
+ proc_t *p;
+ kmutex_t *mp;
+
+ ASSERT(!MUTEX_HELD(&pidlock));
+
+ for (;;) {
+ mutex_enter(&pidlock);
+
+ /*
+ * If the pid is 1, we really want the zone's init process
+ */
+ p = prfind((pid == 1) ?
+ curproc->p_zone->zone_proc_initpid : pid);
+
+ if (p == NULL || p->p_stat == SIDL) {
+ mutex_exit(&pidlock);
+ return (NULL);
+ }
+
+ /*
+ * p_lock is persistent, but p itself is not -- it could
+ * vanish during cv_wait(). Load p->p_lock now so we can
+ * drop it after cv_wait() without referencing p.
+ */
+ mp = &p->p_lock;
+ mutex_enter(mp);
+
+ mutex_exit(&pidlock);
+
+ if (!(p->p_proc_flag & P_PR_LOCK))
+ break;
+
+ cv_wait(&pr_pid_cv[p->p_slot], mp);
+ mutex_exit(mp);
+ }
+
+ p->p_proc_flag |= P_PR_LOCK;
+ THREAD_KPRI_REQUEST();
+ return (p);
+}
+
+/*
+ * lxpr_unlock()
+ *
+ * Unlock locked process
+ */
+void
+lxpr_unlock(proc_t *p)
+{
+ ASSERT(p->p_proc_flag & P_PR_LOCK);
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(!MUTEX_HELD(&pidlock));
+
+ cv_signal(&pr_pid_cv[p->p_slot]);
+ p->p_proc_flag &= ~P_PR_LOCK;
+ mutex_exit(&p->p_lock);
+ THREAD_KPRI_RELEASE();
+}
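+
+/*
+ * Typical usage, as in the lxpr_read_pid_*() handlers in lxpr_vnops.c:
+ *
+ * proc_t *p = lxpr_lock(pid);
+ *
+ * if (p == NULL)
+ * return; -- the process has gone away
+ * ... examine p with p_lock and P_PR_LOCK held ...
+ * lxpr_unlock(p);
+ */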
+
+void
+lxpr_initnodecache()
+{
+ lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME,
+ sizeof (lxpr_node_t), 0,
+ lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+lxpr_fininodecache()
+{
+ kmem_cache_destroy(lxpr_node_cache);
+}
+
+/* ARGSUSED */
+static int
+lxpr_node_constructor(void *buf, void *un, int kmflags)
+{
+ lxpr_node_t *lxpnp = buf;
+ vnode_t *vp;
+
+ vp = lxpnp->lxpr_vnode = vn_alloc(kmflags);
+ if (vp == NULL)
+ return (-1);
+
+ (void) vn_setops(vp, lxpr_vnodeops);
+ vp->v_data = lxpnp;
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+lxpr_node_destructor(void *buf, void *un)
+{
+ lxpr_node_t *lxpnp = buf;
+
+ vn_free(LXPTOV(lxpnp));
+}
+
+/*
+ * Calculate an inode number
+ *
+ * This takes various bits of info and munges them
+ * to give the inode number for an lxproc node
+ */
+ino_t
+lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd)
+{
+ if (pid == 1)
+ pid = curproc->p_zone->zone_proc_initpid;
+
+ switch (type) {
+ case LXPR_PIDDIR:
+ return (pid + 1);
+ case LXPR_PROCDIR:
+ return (maxpid + 2);
+ case LXPR_PID_FD_FD:
+ return (maxpid + 2 +
+ (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) +
+ LXPR_NFILES + fd);
+ default:
+ return (maxpid + 2 +
+ (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) +
+ type);
+ }
+}
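+
+/*
+ * For example, assuming maxpid is 30000: /proc itself gets inode 30002,
+ * the directory for pid 123 gets inode 124, the regular files for pid 123
+ * sit at 30002 + 123 * (LXPR_FD_PERPROC + LXPR_NFILES) + type, and its fd
+ * entries follow at a further offset of LXPR_NFILES + fd.
+ */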
+
+/*
+ * Return inode number of parent (directory)
+ */
+ino_t
+lxpr_parentinode(lxpr_node_t *lxpnp)
+{
+ /*
+ * If the input node is the root, the parent inode is the mounted-on
+ * inode, so just return our own inode number.
+ */
+ if (lxpnp->lxpr_type != LXPR_PROCDIR)
+ return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino);
+ else
+ return (lxpnp->lxpr_ino);
+}
+
+/*
+ * Allocate a new lxproc node
+ *
+ * This also allocates the vnode associated with it
+ */
+lxpr_node_t *
+lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd)
+{
+ lxpr_node_t *lxpnp;
+ vnode_t *vp;
+ user_t *up;
+ timestruc_t now;
+
+ /*
+ * Allocate a new node. It is deallocated in vop_inactive().
+ */
+ lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP);
+
+ /*
+ * Set defaults (may be overridden below)
+ */
+ gethrestime(&now);
+ lxpnp->lxpr_type = type;
+ lxpnp->lxpr_realvp = NULL;
+ lxpnp->lxpr_parent = dp;
+ VN_HOLD(dp);
+ if (p != NULL) {
+ lxpnp->lxpr_pid = ((p->p_pid ==
+ curproc->p_zone->zone_proc_initpid) ? 1 : p->p_pid);
+
+ lxpnp->lxpr_time = PTOU(p)->u_start;
+ lxpnp->lxpr_uid = crgetruid(p->p_cred);
+ lxpnp->lxpr_gid = crgetrgid(p->p_cred);
+ lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd);
+ } else {
+ /* Pretend files without a proc belong to sched */
+ lxpnp->lxpr_pid = 0;
+ lxpnp->lxpr_time = now;
+ lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0;
+ lxpnp->lxpr_ino = lxpr_inode(type, 0, 0);
+ }
+
+ /* initialize the vnode data */
+ vp = lxpnp->lxpr_vnode;
+ vn_reinit(vp);
+ vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
+ vp->v_vfsp = dp->v_vfsp;
+
+ /*
+ * Do node specific stuff
+ */
+ switch (type) {
+ case LXPR_PROCDIR:
+ vp->v_flag |= VROOT;
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0555; /* read-search by everyone */
+ break;
+
+ case LXPR_PID_CURDIR:
+ ASSERT(p != NULL);
+
+ /*
+ * Zombie check. p_stat is officially protected by pidlock,
+ * but we can't grab pidlock here because we already hold
+ * p_lock. Luckily if we look at the process exit code
+ * we see that p_stat only transitions from SRUN to SZOMB
+ * while p_lock is held. Aside from this, the only other
+ * p_stat transition that we need to be aware of is
+ * SIDL to SRUN, but that's not a problem since lxpr_lock()
+ * ignores processes in the SIDL state, so we'll never see a
+ * process that isn't already in the SRUN state.
+ */
+ if (p->p_stat == SZOMB) {
+ lxpnp->lxpr_realvp = NULL;
+ } else {
+ up = PTOU(p);
+ lxpnp->lxpr_realvp = up->u_cdir;
+ ASSERT(lxpnp->lxpr_realvp != NULL);
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777; /* anyone can do anything */
+ break;
+
+ case LXPR_PID_ROOTDIR:
+ ASSERT(p != NULL);
+ /* Zombie check. see locking comment above */
+ if (p->p_stat == SZOMB) {
+ lxpnp->lxpr_realvp = NULL;
+ } else {
+ up = PTOU(p);
+ lxpnp->lxpr_realvp =
+ up->u_rdir != NULL ? up->u_rdir : rootdir;
+ ASSERT(lxpnp->lxpr_realvp != NULL);
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777; /* anyone can do anything */
+ break;
+
+ case LXPR_PID_EXE:
+ ASSERT(p != NULL);
+ lxpnp->lxpr_realvp = p->p_exec;
+ if (lxpnp->lxpr_realvp != NULL) {
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777;
+ break;
+
+ case LXPR_SELF:
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777; /* anyone can do anything */
+ break;
+
+ case LXPR_PID_FD_FD:
+ ASSERT(p != NULL);
+ /* lxpr_realvp is set after we return */
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */
+ break;
+
+ case LXPR_PID_FDDIR:
+ ASSERT(p != NULL);
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0500; /* read-search by owner only */
+ break;
+
+ case LXPR_PIDDIR:
+ ASSERT(p != NULL);
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0511;
+ break;
+
+ case LXPR_NETDIR:
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0555; /* read-search by all */
+ break;
+
+ case LXPR_PID_ENV:
+ case LXPR_PID_MEM:
+ ASSERT(p != NULL);
+ /*FALLTHRU*/
+ case LXPR_KCORE:
+ vp->v_type = VREG;
+ lxpnp->lxpr_mode = 0400; /* read-only by owner only */
+ break;
+
+ default:
+ vp->v_type = VREG;
+ lxpnp->lxpr_mode = 0444; /* read-only by all */
+ break;
+ }
+
+ return (lxpnp);
+}
+
+
+/*
+ * Free the storage obtained from lxpr_getnode().
+ */
+void
+lxpr_freenode(lxpr_node_t *lxpnp)
+{
+ ASSERT(lxpnp != NULL);
+ ASSERT(LXPTOV(lxpnp) != NULL);
+
+ /*
+ * delete any association with realvp
+ */
+ if (lxpnp->lxpr_realvp != NULL)
+ VN_RELE(lxpnp->lxpr_realvp);
+
+ /*
+ * delete any association with parent vp
+ */
+ if (lxpnp->lxpr_parent != NULL)
+ VN_RELE(lxpnp->lxpr_parent);
+
+ /*
+ * Release the lxprnode.
+ */
+ kmem_cache_free(lxpr_node_cache, lxpnp);
+}
diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c
new file mode 100644
index 0000000000..1bb7bd3823
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c
@@ -0,0 +1,367 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/mode.h>
+#include <sys/signal.h>
+#include <sys/user.h>
+#include <sys/mount.h>
+#include <sys/bitmap.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+#include <sys/modctl.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+
+#include "lxproc.h"
+
+/* Module level parameters */
+static int lxprocfstype;
+static dev_t lxprocdev;
+static kmutex_t lxpr_mount_lock;
+
+int nproc_highbit; /* highbit(v.v_proc) */
+
+static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *);
+static int lxpr_unmount(vfs_t *, int, cred_t *);
+static int lxpr_root(vfs_t *, vnode_t **);
+static int lxpr_statvfs(vfs_t *, statvfs64_t *);
+static int lxpr_init(int, char *);
+
+static vfsdef_t vfw = {
+ VFSDEF_VERSION,
+ "lxproc",
+ lxpr_init,
+ VSW_ZMOUNT,
+ NULL
+};
+
+/*
+ * Module linkage information for the kernel.
+ */
+extern struct mod_ops mod_fsops;
+
+static struct modlfs modlfs = {
+ &mod_fsops, "generic linux procfs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, (void *)&modlfs, NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int retval;
+
+ /*
+ * attempt to unload the module
+ */
+ if ((retval = mod_remove(&modlinkage)) != 0)
+ goto done;
+
+ /*
+ * destroy lxpr_node cache
+ */
+ lxpr_fininodecache();
+
+ /*
+ * clean out the vfsops and vnodeops
+ */
+ (void) vfs_freevfsops_by_type(lxprocfstype);
+ vn_freevnodeops(lxpr_vnodeops);
+
+ mutex_destroy(&lxpr_mount_lock);
+done:
+ return (retval);
+}
+
+static int
+lxpr_init(int fstype, char *name)
+{
+ static const fs_operation_def_t lxpr_vfsops_template[] = {
+ VFSNAME_MOUNT, { .vfs_mount = lxpr_mount },
+ VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount },
+ VFSNAME_ROOT, { .vfs_root = lxpr_root },
+ VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs },
+ NULL, NULL
+ };
+ extern const fs_operation_def_t lxpr_vnodeops_template[];
+ int error;
+ major_t dev;
+
+ nproc_highbit = highbit(v.v_proc);
+ lxprocfstype = fstype;
+ ASSERT(lxprocfstype != 0);
+
+ mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ /*
+ * Associate VFS ops vector with this fstype.
+ */
+ error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL);
+ if (error != 0) {
+ cmn_err(CE_WARN, "lxpr_init: bad vfs ops template");
+ return (error);
+ }
+
+ /*
+ * Set up vnode ops vector too.
+ */
+ error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops);
+ if (error != 0) {
+ (void) vfs_freevfsops_by_type(fstype);
+ cmn_err(CE_WARN, "lxpr_init: bad vnode ops template");
+ return (error);
+ }
+
+ /*
+ * Assign a unique "device" number (used by stat(2)).
+ */
+ if ((dev = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN, "lxpr_init: can't get unique device number");
+ dev = 0;
+ }
+
+ /*
+ * Make the pseudo device
+ */
+ lxprocdev = makedevice(dev, 0);
+
+ /*
+ * Initialize cache for lxpr_nodes
+ */
+ lxpr_initnodecache();
+
+ return (0);
+}
+
+static int
+lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr)
+{
+ lxpr_mnt_t *lxpr_mnt;
+ zone_t *zone = curproc->p_zone;
+ ldi_ident_t li;
+ int err;
+
+ /*
+ * must be root to mount
+ */
+ if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
+ return (EPERM);
+
+ /*
+ * mount point must be a directory
+ */
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if (zone == global_zone) {
+ zone_t *mntzone;
+
+ mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
+ zone_rele(mntzone);
+ if (zone != mntzone)
+ return (EBUSY);
+ }
+
+ /*
+ * Having the resource be anything but "lxproc" doesn't make sense
+ */
+ vfs_setresource(vfsp, "lxproc", 0);
+
+ lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP);
+
+ if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) {
+ kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+ return (err);
+ }
+
+ lxpr_mnt->lxprm_li = li;
+
+ mutex_enter(&lxpr_mount_lock);
+
+ /*
+ * Ensure we don't allow overlaying mounts
+ */
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ mutex_exit(&lxpr_mount_lock);
+ kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+ return (EBUSY);
+ }
+ mutex_exit(&mvp->v_lock);
+
+ /*
+ * allocate the first vnode
+ */
+ zone_hold(lxpr_mnt->lxprm_zone = zone);
+
+ /* Arbitrarily set the parent vnode to the mounted-over directory */
+ lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0);
+
+ /* Correctly set the fs for the root node */
+ lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp;
+
+ vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype);
+ vfsp->vfs_bsize = DEV_BSIZE;
+ vfsp->vfs_fstype = lxprocfstype;
+ vfsp->vfs_data = (caddr_t)lxpr_mnt;
+ vfsp->vfs_dev = lxprocdev;
+
+ mutex_exit(&lxpr_mount_lock);
+
+ return (0);
+}
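+
+/*
+ * A mount is typically requested as (illustrative invocation; the
+ * mount point is arbitrary):
+ *
+ * mount -F lxproc lxproc /mountpoint
+ *
+ * Note that the resource string is forced to "lxproc" above regardless
+ * of what the caller passed.
+ */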
+
+static int
+lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr)
+{
+ lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data;
+ vnode_t *vp;
+ int count;
+
+ ASSERT(lxpr_mnt != NULL);
+ vp = LXPTOV(lxpr_mnt->lxprm_node);
+
+ mutex_enter(&lxpr_mount_lock);
+
+ /*
+ * must be root to unmount
+ */
+ if (secpolicy_fs_unmount(cr, vfsp) != 0) {
+ mutex_exit(&lxpr_mount_lock);
+ return (EPERM);
+ }
+
+ /*
+ * forced unmount is not supported by this file system
+ */
+ if (flag & MS_FORCE) {
+ mutex_exit(&lxpr_mount_lock);
+ return (ENOTSUP);
+ }
+
+ /*
+ * Ensure that no vnodes are in use on this mount point.
+ */
+ mutex_enter(&vp->v_lock);
+ count = vp->v_count;
+ mutex_exit(&vp->v_lock);
+ if (count > 1) {
+ mutex_exit(&lxpr_mount_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * purge the dnlc cache for vnode entries
+ * associated with this file system
+ */
+ count = dnlc_purge_vfsp(vfsp, 0);
+
+ /*
+ * free up the lxprnode
+ */
+ lxpr_freenode(lxpr_mnt->lxprm_node);
+ zone_rele(lxpr_mnt->lxprm_zone);
+ kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+
+ mutex_exit(&lxpr_mount_lock);
+
+ return (0);
+}
+
+static int
+lxpr_root(vfs_t *vfsp, vnode_t **vpp)
+{
+ lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node;
+ vnode_t *vp = LXPTOV(lxpnp);
+
+ VN_HOLD(vp);
+ *vpp = vp;
+ return (0);
+}
+
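+/*
+ * lxpr_statvfs(): fabricate file system statistics from the process
+ * table; for example, with v.v_proc == 30000 and nproc == 150 this
+ * reports f_files == 30002 and f_ffree == f_favail == 29850.
+ */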
+static int
+lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp)
+{
+ int n;
+ dev32_t d32;
+ extern uint_t nproc;
+
+ n = v.v_proc - nproc;
+
+ bzero((caddr_t)sp, sizeof (*sp));
+ sp->f_bsize = DEV_BSIZE;
+ sp->f_frsize = DEV_BSIZE;
+ sp->f_blocks = (fsblkcnt64_t)0;
+ sp->f_bfree = (fsblkcnt64_t)0;
+ sp->f_bavail = (fsblkcnt64_t)0;
+ sp->f_files = (fsfilcnt64_t)v.v_proc + 2;
+ sp->f_ffree = (fsfilcnt64_t)n;
+ sp->f_favail = (fsfilcnt64_t)n;
+ (void) cmpldev(&d32, vfsp->vfs_dev);
+ sp->f_fsid = d32;
+ /* It is guaranteed that vsw_name will fit in f_basetype */
+ (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name);
+ sp->f_flag = vf_to_stf(vfsp->vfs_flag);
+ sp->f_namemax = 64; /* quite arbitrary */
+
+ (void) strcpy(sp->f_fstr, "lxproc");
+
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c
new file mode 100644
index 0000000000..bd45e84311
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c
@@ -0,0 +1,3077 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * lxproc -- a loosely Linux-compatible /proc
+ *
+ * The aspiration here is to provide something that sufficiently approximates
+ * the Linux /proc implementation for purposes of offering some compatibility
+ * for simple Linux /proc readers (e.g., ps/top/htop). However, it is not
+ * intended to exactly mimic Linux semantics; when choosing between offering
+ * compatibility and telling the truth, we emphatically pick the truth. A
+ * particular glaring example of this is the Linux notion of "tasks" (that is,
+ * threads), which -- due to historical misadventures on Linux -- allocate their
+ * identifiers from the process identifier space. (That is, each thread has in
+ * effect a pid.) Some Linux /proc readers have come to depend on this
+ * attribute, and become confused when threads appear with proper identifiers,
+ * so we simply opt for the pre-2.6 behavior, and do not present the tasks
+ * directory at all. Similarly, when choosing between offering compatibility
+ * and remaining consistent with our broader security model, we (obviously)
+ * choose security over compatibility. In short, this is meant to be a best
+ * effort -- no more.
+ */
+
+#include <sys/cpupart.h>
+#include <sys/cpuvar.h>
+#include <sys/session.h>
+#include <sys/vmparam.h>
+#include <sys/mman.h>
+#include <vm/rm.h>
+#include <vm/seg_vn.h>
+#include <sys/sdt.h>
+#include <sys/strlog.h>
+#include <sys/stropts.h>
+#include <sys/cmn_err.h>
+#include <sys/x86_archext.h>
+#include <sys/archsystm.h>
+#include <sys/fp.h>
+#include <sys/pool_pset.h>
+#include <sys/pset.h>
+#include <sys/zone.h>
+#include <sys/pghw.h>
+#include <sys/vfs_opreg.h>
+
+/* Dependent on procfs */
+extern kthread_t *prchoose(proc_t *);
+
+#include "lxproc.h"
+
+extern pgcnt_t swapfs_minfree;
+extern time_t boot_time;
+
+/*
+ * Pointer to the vnode ops vector for this fs.
+ * This is instantiated in lxprinit() in lxpr_vfsops.c
+ */
+vnodeops_t *lxpr_vnodeops;
+
+static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *);
+static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *,
+ caller_context_t *);
+static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *);
+static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *,
+ caller_context_t *);
+static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *);
+static int lxpr_lookup(vnode_t *, char *, vnode_t **,
+ pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *,
+ pathname_t *);
+static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *,
+ caller_context_t *, int);
+static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *);
+static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *);
+static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *);
+static int lxpr_sync(void);
+static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *);
+
+static vnode_t *lxpr_lookup_procdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_piddir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_fddir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_netdir(vnode_t *, char *);
+
+static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *);
+
+static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *);
+
+/*
+ * Simple conversion
+ */
+#define btok(x) ((x) >> 10) /* bytes to kbytes */
+#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */
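+
+/*
+ * For example, on a kernel with 4K pages (PAGESHIFT == 12), ptok(x) is
+ * (x << 2), so an RSS of 10 pages is reported as 40 kbytes, while
+ * btok(8192) yields 8.
+ */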
+
+/*
+ * The lxproc vnode operations vector
+ */
+const fs_operation_def_t lxpr_vnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = lxpr_open },
+ VOPNAME_CLOSE, { .vop_close = lxpr_close },
+ VOPNAME_READ, { .vop_read = lxpr_read },
+ VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr },
+ VOPNAME_ACCESS, { .vop_access = lxpr_access },
+ VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup },
+ VOPNAME_READDIR, { .vop_readdir = lxpr_readdir },
+ VOPNAME_READLINK, { .vop_readlink = lxpr_readlink },
+ VOPNAME_FSYNC, { .error = lxpr_sync },
+ VOPNAME_SEEK, { .error = lxpr_sync },
+ VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive },
+ VOPNAME_CMP, { .vop_cmp = lxpr_cmp },
+ VOPNAME_REALVP, { .vop_realvp = lxpr_realvp },
+ NULL, NULL
+};
+
+/*
+ * Contents of the lxproc root directory.
+ */
+static lxpr_dirent_t lxpr_dir[] = {
+ { LXPR_CMDLINE, "cmdline" },
+ { LXPR_CPUINFO, "cpuinfo" },
+ { LXPR_DEVICES, "devices" },
+ { LXPR_DMA, "dma" },
+ { LXPR_FILESYSTEMS, "filesystems" },
+ { LXPR_INTERRUPTS, "interrupts" },
+ { LXPR_IOPORTS, "ioports" },
+ { LXPR_KCORE, "kcore" },
+ { LXPR_KMSG, "kmsg" },
+ { LXPR_LOADAVG, "loadavg" },
+ { LXPR_MEMINFO, "meminfo" },
+ { LXPR_MOUNTS, "mounts" },
+ { LXPR_NETDIR, "net" },
+ { LXPR_PARTITIONS, "partitions" },
+ { LXPR_SELF, "self" },
+ { LXPR_STAT, "stat" },
+ { LXPR_UPTIME, "uptime" },
+ { LXPR_VERSION, "version" }
+};
+
+#define PROCDIRFILES (sizeof (lxpr_dir) / sizeof (lxpr_dir[0]))
+
+/*
+ * Contents of a /lxproc/<pid> directory.
+ */
+static lxpr_dirent_t piddir[] = {
+ { LXPR_PID_CMDLINE, "cmdline" },
+ { LXPR_PID_CPU, "cpu" },
+ { LXPR_PID_CURDIR, "cwd" },
+ { LXPR_PID_ENV, "environ" },
+ { LXPR_PID_EXE, "exe" },
+ { LXPR_PID_MAPS, "maps" },
+ { LXPR_PID_MEM, "mem" },
+ { LXPR_PID_ROOTDIR, "root" },
+ { LXPR_PID_STAT, "stat" },
+ { LXPR_PID_STATM, "statm" },
+ { LXPR_PID_STATUS, "status" },
+ { LXPR_PID_FDDIR, "fd" }
+};
+
+#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0]))
+
+/*
+ * Contents of the /lxproc/net directory.
+ */
+static lxpr_dirent_t netdir[] = {
+ { LXPR_NET_ARP, "arp" },
+ { LXPR_NET_DEV, "dev" },
+ { LXPR_NET_DEV_MCAST, "dev_mcast" },
+ { LXPR_NET_IGMP, "igmp" },
+ { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" },
+ { LXPR_NET_IP_MR_VIF, "ip_mr_vif" },
+ { LXPR_NET_MCFILTER, "mcfilter" },
+ { LXPR_NET_NETSTAT, "netstat" },
+ { LXPR_NET_RAW, "raw" },
+ { LXPR_NET_ROUTE, "route" },
+ { LXPR_NET_RPC, "rpc" },
+ { LXPR_NET_RT_CACHE, "rt_cache" },
+ { LXPR_NET_SOCKSTAT, "sockstat" },
+ { LXPR_NET_SNMP, "snmp" },
+ { LXPR_NET_STAT, "stat" },
+ { LXPR_NET_TCP, "tcp" },
+ { LXPR_NET_UDP, "udp" },
+ { LXPR_NET_UNIX, "unix" }
+};
+
+#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0]))
+
+/*
+ * These are the major signal number differences between Linux and native:
+ *
+ * ====================================
+ * | Number | Linux | Native |
+ * | ====== | ========= | ========== |
+ * | 7 | SIGBUS | SIGEMT |
+ * | 10 | SIGUSR1 | SIGBUS |
+ * | 12 | SIGUSR2 | SIGSYS |
+ * | 16 | SIGSTKFLT | SIGUSR1 |
+ * | 17 | SIGCHLD | SIGUSR2 |
+ * | 18 | SIGCONT | SIGCHLD |
+ * | 19 | SIGSTOP | SIGPWR |
+ * | 20 | SIGTSTP | SIGWINCH |
+ * | 21 | SIGTTIN | SIGURG |
+ * | 22 | SIGTTOU | SIGPOLL |
+ * | 23 | SIGURG | SIGSTOP |
+ * | 24 | SIGXCPU | SIGTSTP |
+ * | 25 | SIGXFSZ | SIGCONT |
+ * | 26     | SIGVTALRM | SIGTTIN    |
+ * | 27 | SIGPROF | SIGTTOU |
+ * | 28     | SIGWINCH  | SIGVTALRM  |
+ * | 29 | SIGPOLL | SIGPROF |
+ * | 30 | SIGPWR | SIGXCPU |
+ * | 31 | SIGSYS | SIGXFSZ |
+ * ====================================
+ *
+ * Not every Linux signal maps to a native signal, nor does every native
+ * signal map to a Linux counterpart. However, when signals do map, the
+ * mapping is unique.
+ */
+static int
+lxpr_sigmap[NSIG] = {
+ 0,
+ LX_SIGHUP,
+ LX_SIGINT,
+ LX_SIGQUIT,
+ LX_SIGILL,
+ LX_SIGTRAP,
+ LX_SIGABRT,
+ LX_SIGSTKFLT,
+ LX_SIGFPE,
+ LX_SIGKILL,
+ LX_SIGBUS,
+ LX_SIGSEGV,
+ LX_SIGSYS,
+ LX_SIGPIPE,
+ LX_SIGALRM,
+ LX_SIGTERM,
+ LX_SIGUSR1,
+ LX_SIGUSR2,
+ LX_SIGCHLD,
+ LX_SIGPWR,
+ LX_SIGWINCH,
+ LX_SIGURG,
+ LX_SIGPOLL,
+ LX_SIGSTOP,
+ LX_SIGTSTP,
+ LX_SIGCONT,
+ LX_SIGTTIN,
+ LX_SIGTTOU,
+ LX_SIGVTALRM,
+ LX_SIGPROF,
+ LX_SIGXCPU,
+ LX_SIGXFSZ,
+ -1, /* 32: illumos SIGWAITING */
+ -1, /* 33: illumos SIGLWP */
+ -1, /* 34: illumos SIGFREEZE */
+ -1, /* 35: illumos SIGTHAW */
+ -1, /* 36: illumos SIGCANCEL */
+ -1, /* 37: illumos SIGLOST */
+ -1, /* 38: illumos SIGXRES */
+ -1, /* 39: illumos SIGJVM1 */
+ -1, /* 40: illumos SIGJVM2 */
+ LX_SIGRTMIN, /* 41: illumos _SIGRTMIN */
+ LX_SIGRTMIN + 1,
+ LX_SIGRTMIN + 2,
+ LX_SIGRTMIN + 3,
+ LX_SIGRTMIN + 4,
+ LX_SIGRTMIN + 5,
+ LX_SIGRTMIN + 6,
+ LX_SIGRTMIN + 7,
+ LX_SIGRTMIN + 8,
+ LX_SIGRTMIN + 9,
+ LX_SIGRTMIN + 10,
+ LX_SIGRTMIN + 11,
+ LX_SIGRTMIN + 12,
+ LX_SIGRTMIN + 13,
+ LX_SIGRTMIN + 14,
+ LX_SIGRTMIN + 15,
+ LX_SIGRTMIN + 16,
+ LX_SIGRTMIN + 17,
+ LX_SIGRTMIN + 18,
+ LX_SIGRTMIN + 19,
+ LX_SIGRTMIN + 20,
+ LX_SIGRTMIN + 21,
+ LX_SIGRTMIN + 22,
+ LX_SIGRTMIN + 23,
+ LX_SIGRTMIN + 24,
+ LX_SIGRTMIN + 25,
+ LX_SIGRTMIN + 26,
+ LX_SIGRTMIN + 27,
+ LX_SIGRTMIN + 28,
+ LX_SIGRTMIN + 29,
+ LX_SIGRTMIN + 30,
+ LX_SIGRTMAX,
+};
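+
+/*
+ * For example, native SIGCHLD (18) maps to LX_SIGCHLD (17 on Linux), so
+ * lxpr_sigmap[18] is consulted when rendering the SigPnd/SigIgn/SigCgt
+ * masks in lxpr_read_pid_status() below.
+ */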
+
+/*
+ * lxpr_open(): Vnode operation for VOP_OPEN()
+ */
+static int
+lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+ vnode_t *vp = *vpp;
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ vnode_t *rvp;
+ int error = 0;
+
+ /*
+ * We only allow reading in this file system
+ */
+ if (flag & FWRITE)
+ return (EROFS);
+
+ /*
+ * If we are opening a node with an underlying real vnode, an fd
+ * entry must refer to a regular file; reject the open otherwise.
+ * The current and root directories are simply passed through.
+ */
+ if (lxpnp->lxpr_realvp != NULL) {
+ rvp = lxpnp->lxpr_realvp;
+
+ if (type == LXPR_PID_FD_FD && rvp->v_type != VREG)
+ error = EACCES;
+ else {
+ /*
+ * Need to hold rvp since VOP_OPEN() may release it.
+ */
+ VN_HOLD(rvp);
+ error = VOP_OPEN(&rvp, flag, cr, ct);
+ if (error) {
+ VN_RELE(rvp);
+ } else {
+ *vpp = rvp;
+ VN_RELE(vp);
+ }
+ }
+ }
+
+ if (type == LXPR_KMSG) {
+ ldi_ident_t li = VTOLXPM(vp)->lxprm_li;
+ struct strioctl str;
+ int rv;
+
+ /*
+ * Open the zone's log device (/dev/log) using the layered
+ * driver interface.
+ */
+ if ((error = ldi_open_by_name("/dev/log", FREAD, cr,
+ &lxpnp->lxpr_cons_ldih, li)) != 0)
+ return (error);
+
+ /*
+ * Send an ioctl to the underlying console device, letting it
+ * know we're interested in getting console messages.
+ */
+ str.ic_cmd = I_CONSLOG;
+ str.ic_timout = 0;
+ str.ic_len = 0;
+ str.ic_dp = NULL;
+ if ((error = ldi_ioctl(lxpnp->lxpr_cons_ldih, I_STR,
+ (intptr_t)&str, FKIOCTL, cr, &rv)) != 0)
+ return (error);
+ }
+
+ return (error);
+}
+
+
+/*
+ * lxpr_close(): Vnode operation for VOP_CLOSE()
+ */
+/* ARGSUSED */
+static int
+lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+ caller_context_t *ct)
+{
+ lxpr_node_t *lxpr = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpr->lxpr_type;
+ int err;
+
+ /*
+ * we should never get here because the close is done on the realvp
+ * for these nodes
+ */
+ ASSERT(type != LXPR_PID_FD_FD &&
+ type != LXPR_PID_CURDIR &&
+ type != LXPR_PID_ROOTDIR &&
+ type != LXPR_PID_EXE);
+
+ if (type == LXPR_KMSG) {
+ if ((err = ldi_close(lxpr->lxpr_cons_ldih, 0, cr)) != 0)
+ return (err);
+ }
+
+ return (0);
+}
+
+static void (*lxpr_read_function[LXPR_NFILES])() = {
+ lxpr_read_isdir, /* /proc */
+ lxpr_read_isdir, /* /proc/<pid> */
+ lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */
+ lxpr_read_empty, /* /proc/<pid>/cpu */
+ lxpr_read_invalid, /* /proc/<pid>/cwd */
+ lxpr_read_empty, /* /proc/<pid>/environ */
+ lxpr_read_invalid, /* /proc/<pid>/exe */
+ lxpr_read_pid_maps, /* /proc/<pid>/maps */
+ lxpr_read_empty, /* /proc/<pid>/mem */
+ lxpr_read_invalid, /* /proc/<pid>/root */
+ lxpr_read_pid_stat, /* /proc/<pid>/stat */
+ lxpr_read_pid_statm, /* /proc/<pid>/statm */
+ lxpr_read_pid_status, /* /proc/<pid>/status */
+ lxpr_read_isdir, /* /proc/<pid>/fd */
+ lxpr_read_fd, /* /proc/<pid>/fd/nn */
+ lxpr_read_empty, /* /proc/cmdline */
+ lxpr_read_cpuinfo, /* /proc/cpuinfo */
+ lxpr_read_empty, /* /proc/devices */
+ lxpr_read_empty, /* /proc/dma */
+ lxpr_read_empty, /* /proc/filesystems */
+ lxpr_read_empty, /* /proc/interrupts */
+ lxpr_read_empty, /* /proc/ioports */
+ lxpr_read_empty, /* /proc/kcore */
+ lxpr_read_kmsg, /* /proc/kmsg */
+ lxpr_read_loadavg, /* /proc/loadavg */
+ lxpr_read_meminfo, /* /proc/meminfo */
+ lxpr_read_mounts, /* /proc/mounts */
+ lxpr_read_isdir, /* /proc/net */
+ lxpr_read_net_arp, /* /proc/net/arp */
+ lxpr_read_net_dev, /* /proc/net/dev */
+ lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */
+ lxpr_read_net_igmp, /* /proc/net/igmp */
+ lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */
+ lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */
+ lxpr_read_net_mcfilter, /* /proc/net/mcfilter */
+ lxpr_read_net_netstat, /* /proc/net/netstat */
+ lxpr_read_net_raw, /* /proc/net/raw */
+ lxpr_read_net_route, /* /proc/net/route */
+ lxpr_read_net_rpc, /* /proc/net/rpc */
+ lxpr_read_net_rt_cache, /* /proc/net/rt_cache */
+ lxpr_read_net_sockstat, /* /proc/net/sockstat */
+ lxpr_read_net_snmp, /* /proc/net/snmp */
+ lxpr_read_net_stat, /* /proc/net/stat */
+ lxpr_read_net_tcp, /* /proc/net/tcp */
+ lxpr_read_net_udp, /* /proc/net/udp */
+ lxpr_read_net_unix, /* /proc/net/unix */
+ lxpr_read_partitions, /* /proc/partitions */
+ lxpr_read_invalid, /* /proc/self */
+ lxpr_read_stat, /* /proc/stat */
+ lxpr_read_uptime, /* /proc/uptime */
+ lxpr_read_version, /* /proc/version */
+};
+
+/*
+ * Array of lookup functions, indexed by /lxproc file type.
+ */
+static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = {
+ lxpr_lookup_procdir, /* /proc */
+ lxpr_lookup_piddir, /* /proc/<pid> */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/root */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/status */
+ lxpr_lookup_fddir, /* /proc/<pid>/fd */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */
+ lxpr_lookup_not_a_dir, /* /proc/cmdline */
+ lxpr_lookup_not_a_dir, /* /proc/cpuinfo */
+ lxpr_lookup_not_a_dir, /* /proc/devices */
+ lxpr_lookup_not_a_dir, /* /proc/dma */
+ lxpr_lookup_not_a_dir, /* /proc/filesystems */
+ lxpr_lookup_not_a_dir, /* /proc/interrupts */
+ lxpr_lookup_not_a_dir, /* /proc/ioports */
+ lxpr_lookup_not_a_dir, /* /proc/kcore */
+ lxpr_lookup_not_a_dir, /* /proc/kmsg */
+ lxpr_lookup_not_a_dir, /* /proc/loadavg */
+ lxpr_lookup_not_a_dir, /* /proc/meminfo */
+ lxpr_lookup_not_a_dir, /* /proc/mounts */
+ lxpr_lookup_netdir, /* /proc/net */
+ lxpr_lookup_not_a_dir, /* /proc/net/arp */
+ lxpr_lookup_not_a_dir, /* /proc/net/dev */
+ lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */
+ lxpr_lookup_not_a_dir, /* /proc/net/igmp */
+ lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */
+ lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */
+ lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */
+ lxpr_lookup_not_a_dir, /* /proc/net/netstat */
+ lxpr_lookup_not_a_dir, /* /proc/net/raw */
+ lxpr_lookup_not_a_dir, /* /proc/net/route */
+ lxpr_lookup_not_a_dir, /* /proc/net/rpc */
+ lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */
+ lxpr_lookup_not_a_dir, /* /proc/net/sockstat */
+ lxpr_lookup_not_a_dir, /* /proc/net/snmp */
+ lxpr_lookup_not_a_dir, /* /proc/net/stat */
+ lxpr_lookup_not_a_dir, /* /proc/net/tcp */
+ lxpr_lookup_not_a_dir, /* /proc/net/udp */
+ lxpr_lookup_not_a_dir, /* /proc/net/unix */
+ lxpr_lookup_not_a_dir, /* /proc/partitions */
+ lxpr_lookup_not_a_dir, /* /proc/self */
+ lxpr_lookup_not_a_dir, /* /proc/stat */
+ lxpr_lookup_not_a_dir, /* /proc/uptime */
+ lxpr_lookup_not_a_dir, /* /proc/version */
+};
+
+/*
+ * Array of readdir functions, indexed by /lxproc file type.
+ */
+static int (*lxpr_readdir_function[LXPR_NFILES])() = {
+ lxpr_readdir_procdir, /* /proc */
+ lxpr_readdir_piddir, /* /proc/<pid> */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/root */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/status */
+ lxpr_readdir_fddir, /* /proc/<pid>/fd */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */
+ lxpr_readdir_not_a_dir, /* /proc/cmdline */
+ lxpr_readdir_not_a_dir, /* /proc/cpuinfo */
+ lxpr_readdir_not_a_dir, /* /proc/devices */
+ lxpr_readdir_not_a_dir, /* /proc/dma */
+ lxpr_readdir_not_a_dir, /* /proc/filesystems */
+ lxpr_readdir_not_a_dir, /* /proc/interrupts */
+ lxpr_readdir_not_a_dir, /* /proc/ioports */
+ lxpr_readdir_not_a_dir, /* /proc/kcore */
+ lxpr_readdir_not_a_dir, /* /proc/kmsg */
+ lxpr_readdir_not_a_dir, /* /proc/loadavg */
+ lxpr_readdir_not_a_dir, /* /proc/meminfo */
+ lxpr_readdir_not_a_dir, /* /proc/mounts */
+ lxpr_readdir_netdir, /* /proc/net */
+ lxpr_readdir_not_a_dir, /* /proc/net/arp */
+ lxpr_readdir_not_a_dir, /* /proc/net/dev */
+ lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */
+ lxpr_readdir_not_a_dir, /* /proc/net/igmp */
+ lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */
+ lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */
+ lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */
+ lxpr_readdir_not_a_dir, /* /proc/net/netstat */
+ lxpr_readdir_not_a_dir, /* /proc/net/raw */
+ lxpr_readdir_not_a_dir, /* /proc/net/route */
+ lxpr_readdir_not_a_dir, /* /proc/net/rpc */
+ lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */
+ lxpr_readdir_not_a_dir, /* /proc/net/sockstat */
+ lxpr_readdir_not_a_dir, /* /proc/net/snmp */
+ lxpr_readdir_not_a_dir, /* /proc/net/stat */
+ lxpr_readdir_not_a_dir, /* /proc/net/tcp */
+ lxpr_readdir_not_a_dir, /* /proc/net/udp */
+ lxpr_readdir_not_a_dir, /* /proc/net/unix */
+ lxpr_readdir_not_a_dir, /* /proc/partitions */
+ lxpr_readdir_not_a_dir, /* /proc/self */
+ lxpr_readdir_not_a_dir, /* /proc/stat */
+ lxpr_readdir_not_a_dir, /* /proc/uptime */
+ lxpr_readdir_not_a_dir, /* /proc/version */
+};
+
+
+/*
+ * lxpr_read(): Vnode operation for VOP_READ()
+ *
+ * Since all the files that can be read in lxproc are human-readable text
+ * rather than binary structures, no separate read variants are needed for
+ * 32- and 64-bit reading processes.
+ */
+/* ARGSUSED */
+static int
+lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
+ caller_context_t *ct)
+{
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop);
+ int error;
+
+ ASSERT(type >= 0 && type < LXPR_NFILES);
+
+ lxpr_read_function[type](lxpnp, uiobuf);
+
+ error = lxpr_uiobuf_flush(uiobuf);
+ lxpr_uiobuf_free(uiobuf);
+
+ return (error);
+}
+
+/*
+ * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty()
+ *
+ * Various special case reads:
+ * - trying to read a directory
+ * - invalid file (used to mean a file that should be implemented,
+ * but isn't yet)
+ * - empty file (a file that will never have anything to read)
+ */
+/* ARGSUSED */
+static void
+lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_seterr(uiobuf, EISDIR);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_pid_cmdline():
+ *
+ * This is not precisely compatible with Linux: the Linux cmdline returns argv
+ * with the correct separation using \0 between the arguments, but we cannot do
+ * that without copying the real argv from the correct process context. This
+ * is too difficult to attempt so we pretend that the entire cmdline is just
+ * argv[0]. This is good enough for ps and htop to display correctly, but might
+ * cause some other things not to work correctly.
+ */
+static void
+lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ char *buf;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ buf = PTOU(p)->u_argv != 0 ? PTOU(p)->u_psargs : PTOU(p)->u_comm;
+
+ lxpr_uiobuf_write(uiobuf, buf, strlen(buf) + 1);
+ lxpr_unlock(p);
+}
+
+/*
+ * lxpr_read_pid_maps(): memory map file
+ */
+static void
+lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ struct as *as;
+ struct seg *seg;
+ char *buf;
+ int buflen = MAXPATHLEN;
+ struct print_data {
+ caddr_t saddr;
+ caddr_t eaddr;
+ int type;
+ char prot[5];
+ uint32_t offset;
+ vnode_t *vp;
+ struct print_data *next;
+ } *print_head = NULL;
+ struct print_data **print_tail = &print_head;
+ struct print_data *pbuf;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ as = p->p_as;
+
+ if (as == &kas) {
+ lxpr_unlock(p);
+ return;
+ }
+
+ mutex_exit(&p->p_lock);
+
+ /* Iterate over all segments in the address space */
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
+ vnode_t *vp;
+ uint_t protbits;
+
+ pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP);
+
+ pbuf->saddr = seg->s_base;
+ pbuf->eaddr = seg->s_base + seg->s_size;
+ pbuf->type = SEGOP_GETTYPE(seg, seg->s_base);
+
+ /*
+ * Cheat and only use the protection bits of the first page
+ * in the segment
+ */
+ (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot));
+ (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits);
+
+ if (protbits & PROT_READ) pbuf->prot[0] = 'r';
+ if (protbits & PROT_WRITE) pbuf->prot[1] = 'w';
+ if (protbits & PROT_EXEC) pbuf->prot[2] = 'x';
+ if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's';
+ else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p';
+
+ if (seg->s_ops == &segvn_ops &&
+ SEGOP_GETVP(seg, seg->s_base, &vp) == 0 &&
+ vp != NULL && vp->v_type == VREG) {
+ VN_HOLD(vp);
+ pbuf->vp = vp;
+ } else {
+ pbuf->vp = NULL;
+ }
+
+ pbuf->offset = (uint32_t)SEGOP_GETOFFSET(seg, pbuf->saddr);
+
+ pbuf->next = NULL;
+ *print_tail = pbuf;
+ print_tail = &pbuf->next;
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+
+ buf = kmem_alloc(buflen, KM_SLEEP);
+
+ /* print the data we've extracted */
+ pbuf = print_head;
+ while (pbuf != NULL) {
+ struct print_data *pbuf_next;
+ vattr_t vattr;
+
+ int maj = 0;
+ int min = 0;
+ u_longlong_t inode = 0;
+
+ *buf = '\0';
+ if (pbuf->vp != NULL) {
+ vattr.va_mask = AT_FSID | AT_NODEID;
+ if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(),
+ NULL) == 0) {
+ maj = getmajor(vattr.va_fsid);
+ min = getminor(vattr.va_fsid);
+ inode = vattr.va_nodeid;
+ }
+ (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED());
+ VN_RELE(pbuf->vp);
+ }
+
+ if (*buf != '\0') {
+ lxpr_uiobuf_printf(uiobuf,
+ "%08x-%08x %s %08x %02d:%03d %lld %s\n",
+ pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset,
+ maj, min, inode, buf);
+ } else {
+ lxpr_uiobuf_printf(uiobuf,
+ "%08x-%08x %s %08x %02d:%03d %lld\n",
+ pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset,
+ maj, min, inode);
+ }
+
+ pbuf_next = pbuf->next;
+ kmem_free(pbuf, sizeof (*pbuf));
+ pbuf = pbuf_next;
+ }
+
+ kmem_free(buf, buflen);
+}
+
+/*
+ * lxpr_read_pid_statm(): memory status file
+ */
+static void
+lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ struct as *as;
+ size_t vsize;
+ size_t rss;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ as = p->p_as;
+
+ mutex_exit(&p->p_lock);
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ vsize = btopr(as->a_resvsize);
+ rss = rm_asrss(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%lu %lu %lu %lu %lu %lu %lu\n",
+ vsize, rss, 0l, rss, 0l, 0l, 0l);
+}
+
+/*
+ * lxpr_read_pid_status(): status file
+ */
+static void
+lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ kthread_t *t;
+ user_t *up;
+ cred_t *cr;
+ const gid_t *groups;
+ int ngroups;
+ struct as *as;
+ char *status;
+ pid_t pid, ppid;
+ size_t vsize;
+ size_t rss;
+ k_sigset_t current, ignore, handle;
+ int i, lx_sig;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ pid = p->p_pid;
+
+ /*
+ * Convert pid to the Linux default of 1 if we're the zone's init
+ * process
+ */
+ if (pid == curproc->p_zone->zone_proc_initpid) {
+ pid = 1;
+ ppid = 0; /* parent pid for init is 0 */
+ } else {
+ /*
+ * Make sure not to reference parent PIDs that reside outside
+ * the zone
+ */
+ ppid = ((p->p_flag & SZONETOP)
+ ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid);
+
+ /*
+ * Convert ppid to the Linux default of 1 if our parent is the
+ * zone's init process
+ */
+ if (ppid == curproc->p_zone->zone_proc_initpid)
+ ppid = 1;
+ }
+
+ t = prchoose(p);
+ if (t != NULL) {
+ switch (t->t_state) {
+ case TS_SLEEP:
+ status = "S (sleeping)";
+ break;
+ case TS_RUN:
+ case TS_ONPROC:
+ status = "R (running)";
+ break;
+ case TS_ZOMB:
+ status = "Z (zombie)";
+ break;
+ case TS_STOPPED:
+ status = "T (stopped)";
+ break;
+ default:
+ status = "! (unknown)";
+ break;
+ }
+ thread_unlock(t);
+ } else {
+ /*
+ * There is a hole in the exit code where a proc can have
+ * no threads but has not yet been flagged SZOMB, so we
+ * assume the process is about to become a zombie.
+ */
+ status = "Z (zombie)";
+ }
+
+ up = PTOU(p);
+ mutex_enter(&p->p_crlock);
+ crhold(cr = p->p_cred);
+ mutex_exit(&p->p_crlock);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "Name:\t%s\n"
+ "State:\t%s\n"
+ "Tgid:\t%d\n"
+ "Pid:\t%d\n"
+ "PPid:\t%d\n"
+ "TracerPid:\t%d\n"
+ "Uid:\t%u\t%u\t%u\t%u\n"
+ "Gid:\t%u\t%u\t%u\t%u\n"
+ "FDSize:\t%d\n"
+ "Groups:\t",
+ up->u_comm,
+ status,
+ pid, /* thread group id - same as pid */
+ pid,
+ ppid,
+ 0,
+ crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr),
+ crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr),
+ p->p_fno_ctl);
+
+ ngroups = crgetngroups(cr);
+ groups = crgetgroups(cr);
+ for (i = 0; i < ngroups; i++) {
+ lxpr_uiobuf_printf(uiobuf,
+ "%u ",
+ groups[i]);
+ }
+ crfree(cr);
+
+ as = p->p_as;
+ if ((p->p_stat != SZOMB) && !(p->p_flag & SSYS) && (as != &kas)) {
+ mutex_exit(&p->p_lock);
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ vsize = as->a_resvsize;
+ rss = rm_asrss(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ mutex_enter(&p->p_lock);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "\n"
+ "VmSize:\t%8lu kB\n"
+ "VmLck:\t%8lu kB\n"
+ "VmRSS:\t%8lu kB\n"
+ "VmData:\t%8lu kB\n"
+ "VmStk:\t%8lu kB\n"
+ "VmExe:\t%8lu kB\n"
+ "VmLib:\t%8lu kB",
+ btok(vsize),
+ 0l,
+ ptok(rss),
+ 0l,
+ btok(p->p_stksize),
+ ptok(rss),
+ 0l);
+ }
+
+ sigemptyset(&current);
+ sigemptyset(&ignore);
+ sigemptyset(&handle);
+
+ for (i = 1; i < NSIG; i++) {
+ lx_sig = lxpr_sigmap[i];
+
+ if ((lx_sig > 0) && (lx_sig < LX_NSIG)) {
+ if (sigismember(&p->p_sig, i))
+ sigaddset(&current, lx_sig);
+
+ if (up->u_signal[i - 1] == SIG_IGN)
+ sigaddset(&ignore, lx_sig);
+ else if (up->u_signal[i - 1] != SIG_DFL)
+ sigaddset(&handle, lx_sig);
+ }
+ }
+
+ lxpr_uiobuf_printf(uiobuf,
+ "\n"
+ "SigPnd:\t%08x%08x\n"
+ "SigBlk:\t%08x%08x\n"
+ "SigIgn:\t%08x%08x\n"
+ "SigCgt:\t%08x%08x\n"
+ "CapInh:\t%016x\n"
+ "CapPrm:\t%016x\n"
+ "CapEff:\t%016x\n",
+ current.__sigbits[1], current.__sigbits[0],
+ 0, 0, /* signals blocked on per thread basis */
+ ignore.__sigbits[1], ignore.__sigbits[0],
+ handle.__sigbits[1], handle.__sigbits[0],
+ /* Can't do anything with linux capabilities */
+ 0,
+ 0,
+ 0);
+
+ lxpr_unlock(p);
+}
+
+
+/*
+ * lxpr_read_pid_stat(): pid stat file
+ */
+static void
+lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ kthread_t *t;
+ struct as *as;
+ char stat;
+ pid_t pid, ppid, pgpid, spid;
+ gid_t psgid;
+ dev_t psdev;
+ size_t rss, vsize;
+ int nice, pri;
+ caddr_t wchan;
+ processorid_t cpu;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ pid = p->p_pid;
+
+ /*
+ * Set Linux defaults if we're the zone's init process
+ */
+ if (pid == curproc->p_zone->zone_proc_initpid) {
+ pid = 1; /* PID for init */
+ ppid = 0; /* parent PID for init is 0 */
+ pgpid = 0; /* process group for init is 0 */
+ psgid = (gid_t)-1; /* credential GID for init is -1 */
+ spid = 0; /* session id for init is 0 */
+ psdev = 0; /* session device for init is 0 */
+ } else {
+ /*
+ * Make sure not to reference parent PIDs that reside outside
+ * the zone
+ */
+ ppid = ((p->p_flag & SZONETOP) ?
+ curproc->p_zone->zone_zsched->p_pid : p->p_ppid);
+
+ /*
+ * Convert ppid to the Linux default of 1 if our parent is the
+ * zone's init process
+ */
+ if (ppid == curproc->p_zone->zone_proc_initpid)
+ ppid = 1;
+
+ pgpid = p->p_pgrp;
+
+ mutex_enter(&p->p_splock);
+ mutex_enter(&p->p_sessp->s_lock);
+ spid = p->p_sessp->s_sid;
+ psdev = p->p_sessp->s_dev;
+ if (p->p_sessp->s_cred)
+ psgid = crgetgid(p->p_sessp->s_cred);
+ else
+ psgid = crgetgid(p->p_cred);
+
+ mutex_exit(&p->p_sessp->s_lock);
+ mutex_exit(&p->p_splock);
+ }
+
+ t = prchoose(p);
+ if (t != NULL) {
+ switch (t->t_state) {
+ case TS_SLEEP:
+ stat = 'S'; break;
+ case TS_RUN:
+ case TS_ONPROC:
+ stat = 'R'; break;
+ case TS_ZOMB:
+ stat = 'Z'; break;
+ case TS_STOPPED:
+ stat = 'T'; break;
+ default:
+ stat = '!'; break;
+ }
+
+ if (CL_DONICE(t, NULL, 0, &nice) != 0)
+ nice = 0;
+
+ pri = t->t_pri;
+ wchan = t->t_wchan;
+ cpu = t->t_cpu->cpu_id;
+ thread_unlock(t);
+ } else {
+ /* Only zombies have no threads */
+ stat = 'Z';
+ nice = 0;
+ pri = 0;
+ wchan = 0;
+ cpu = 0;
+ }
+ as = p->p_as;
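+	/* Drop p_lock while we take the address space lock, which can sleep */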
+ mutex_exit(&p->p_lock);
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ vsize = as->a_resvsize;
+ rss = rm_asrss(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ mutex_enter(&p->p_lock);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%d (%s) %c %d %d %d %d %d "
+ "%lu %lu %lu %lu %lu "
+ "%lu %lu %ld %ld "
+ "%d %d %d "
+ "%lu "
+ "%lu "
+ "%lu %ld %llu "
+ "%lu %lu %u "
+ "%lu %lu "
+ "%lu %lu %lu %lu "
+ "%lu "
+ "%lu %lu "
+ "%d "
+ "%d"
+ "\n",
+ pid, PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid,
+ 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */
+ p->p_utime, p->p_stime, p->p_cutime, p->p_cstime,
+ pri, nice, p->p_lwpcnt,
+ 0l, /* itrealvalue (time before next SIGALRM) */
+ PTOU(p)->u_ticks,
+ vsize, rss, p->p_vmem_ctl,
+ 0l, 0l, USRSTACK, /* startcode, endcode, startstack */
+ 0l, 0l, /* kstkesp, kstkeip */
+ 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */
+ wchan,
+ 0l, 0l, /* nswap, cnswap */
+ 0, /* exit_signal */
+ cpu);
+
+ lxpr_unlock(p);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_printf(uiobuf, "Inter-| Receive "
+ " | Transmit\n");
+ lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo"
+ " frame compressed multicast|bytes packets errs drop fifo"
+ " colls carrier compressed\n");
+
+ /*
+ * Data about each interface should go here, but that shouldn't be added
+ * unless there is an lxproc reader that actually makes use of it (and
+ * doesn't need anything else that we refuse to provide)...
+ */
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_kmsg(): read the contents of the kernel message queue. We
+ * translate this into the reception of console messages for this zone; each
+ * read copies out a single zone console message, or blocks until the next one
+ * is produced.
+ */
+
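+/*
+ * Linux tags each /proc/kmsg line with a "<priority>" prefix; we always
+ * claim priority 0.
+ */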
+#define LX_KMSG_PRI "<0>"
+
+static void
+lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf)
+{
+ ldi_handle_t lh = lxpnp->lxpr_cons_ldih;
+ mblk_t *mp;
+
+ if (ldi_getmsg(lh, &mp, NULL) == 0) {
+ /*
+ * lxproc doesn't like successive reads to the same file
+ * descriptor unless we do an explicit rewind each time.
+ */
+ lxpr_uiobuf_seek(uiobuf, 0);
+
+ lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI,
+ mp->b_cont->b_rptr);
+
+ freemsg(mp);
+ }
+}
+
+/*
+ * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just
+ * enough for uptime and other simple lxproc readers to work
+ */
+extern int nthread;
+
+static void
+lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ulong_t avenrun1;
+ ulong_t avenrun5;
+ ulong_t avenrun15;
+ ulong_t avenrun1_cs;
+ ulong_t avenrun5_cs;
+ ulong_t avenrun15_cs;
+ int loadavg[3];
+ int *loadbuf;
+ cpupart_t *cp;
+ zone_t *zone = LXPTOZ(lxpnp);
+
+ uint_t nrunnable = 0;
+ rctl_qty_t nlwps;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG);
+
+ mutex_enter(&cpu_lock);
+
+ /*
+ * Need to add up values over all CPU partitions. If pools are active,
+ * only report the values of the zone's partition, which by definition
+ * includes the current CPU.
+ */
+ if (pool_pset_enabled()) {
+ psetid_t psetid = zone_pset_get(curproc->p_zone);
+
+ ASSERT(curproc->p_zone != &zone0);
+ cp = CPU->cpu_part;
+
+ nrunnable = cp->cp_nrunning + cp->cp_nrunnable;
+ (void) cpupart_get_loadavg(psetid, &loadavg[0], 3);
+ loadbuf = &loadavg[0];
+ } else {
+ cp = cp_list_head;
+ do {
+ nrunnable += cp->cp_nrunning + cp->cp_nrunnable;
+ } while ((cp = cp->cp_next) != cp_list_head);
+
+ loadbuf = zone == global_zone ?
+ &avenrun[0] : zone->zone_avenrun;
+ }
+
+	/*
+	 * If we're in a non-global zone, we'll report the total number of
+	 * LWPs in the zone for the "nproc" parameter of /proc/loadavg;
+	 * otherwise we'll just use nthread (which will include kernel
+	 * threads, but should be good enough for lxproc).
+	 */
+ nlwps = zone == global_zone ? nthread : zone->zone_nlwps;
+
+ mutex_exit(&cpu_lock);
+
+ avenrun1 = loadbuf[0] >> FSHIFT;
+ avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT;
+ avenrun5 = loadbuf[1] >> FSHIFT;
+ avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT;
+ avenrun15 = loadbuf[2] >> FSHIFT;
+ avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT;
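+
+	/*
+	 * For example (assuming FSHIFT == 8, so FSCALE == 256): a raw
+	 * average of 320 (1.25 * FSCALE) yields avenrun1 == 1 and
+	 * avenrun1_cs == 25, printed below as "1.25".
+	 */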
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n",
+ avenrun1, avenrun1_cs,
+ avenrun5, avenrun5_cs,
+ avenrun15, avenrun15_cs,
+ nrunnable, nlwps, 0);
+}
+
+/*
+ * lxpr_read_meminfo(): read the contents of the "meminfo" file.
+ */
+static void
+lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ int global = zone == global_zone;
+ long total_mem, free_mem, total_swap, used_swap;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO);
+
+ if (global || zone->zone_phys_mem_ctl == UINT64_MAX) {
+ total_mem = physmem * PAGESIZE;
+ free_mem = freemem * PAGESIZE;
+ } else {
+ total_mem = zone->zone_phys_mem_ctl;
+ free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem;
+ }
+
+ if (global || zone->zone_max_swap_ctl == UINT64_MAX) {
+ total_swap = k_anoninfo.ani_max * PAGESIZE;
+ used_swap = k_anoninfo.ani_phys_resv * PAGESIZE;
+ } else {
+ mutex_enter(&zone->zone_mem_lock);
+ total_swap = zone->zone_max_swap_ctl;
+ used_swap = zone->zone_max_swap;
+ mutex_exit(&zone->zone_mem_lock);
+ }
+
+ lxpr_uiobuf_printf(uiobuf,
+ " total: used: free: shared: buffers: cached:\n"
+ "Mem: %8lu %8lu %8lu %8u %8u %8u\n"
+ "Swap: %8lu %8lu %8lu\n"
+ "MemTotal: %8lu kB\n"
+ "MemFree: %8lu kB\n"
+ "MemShared: %8u kB\n"
+ "Buffers: %8u kB\n"
+ "Cached: %8u kB\n"
+ "SwapCached:%8u kB\n"
+ "Active: %8u kB\n"
+ "Inactive: %8u kB\n"
+ "HighTotal: %8u kB\n"
+ "HighFree: %8u kB\n"
+ "LowTotal: %8u kB\n"
+ "LowFree: %8u kB\n"
+ "SwapTotal: %8lu kB\n"
+ "SwapFree: %8lu kB\n",
+ total_mem, total_mem - free_mem, free_mem, 0, 0, 0,
+ total_swap, used_swap, total_swap - used_swap,
+ btok(total_mem), /* MemTotal */
+ btok(free_mem), /* MemFree */
+ 0, /* MemShared */
+ 0, /* Buffers */
+ 0, /* Cached */
+ 0, /* SwapCached */
+ 0, /* Active */
+ 0, /* Inactive */
+ 0, /* HighTotal */
+ 0, /* HighFree */
+ btok(total_mem), /* LowTotal */
+ btok(free_mem), /* LowFree */
+ btok(total_swap), /* SwapTotal */
+ btok(total_swap - used_swap)); /* SwapFree */
+}
+
+/*
+ * lxpr_read_mounts():
+ */
+/* ARGSUSED */
+static void
+lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ struct vfs *vfsp;
+ struct vfs *vfslist;
+ zone_t *zone = LXPTOZ(lxpnp);
+ struct print_data {
+ refstr_t *vfs_mntpt;
+ refstr_t *vfs_resource;
+ uint_t vfs_flag;
+ int vfs_fstype;
+ struct print_data *next;
+ } *print_head = NULL;
+ struct print_data **print_tail = &print_head;
+ struct print_data *printp;
+
+ vfs_list_read_lock();
+
+ if (zone == global_zone) {
+ vfsp = vfslist = rootvfs;
+ } else {
+ vfsp = vfslist = zone->zone_vfslist;
+ /*
+ * If the zone has a root entry, it will be the first in
+ * the list. If it doesn't, we conjure one up.
+ */
+ if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt),
+ zone->zone_rootpath) != 0) {
+ struct vfs *tvfsp;
+ /*
+ * The root of the zone is not a mount point. The vfs
+ * we want to report is that of the zone's root vnode.
+ */
+ tvfsp = zone->zone_rootvp->v_vfsp;
+
+ lxpr_uiobuf_printf(uiobuf,
+ "/ / %s %s 0 0\n",
+ vfssw[tvfsp->vfs_fstype].vsw_name,
+ tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw");
+
+ }
+ if (vfslist == NULL) {
+ vfs_list_unlock();
+ return;
+ }
+ }
+
+	/*
+	 * Later on we have to do a lookupname, which can end up causing
+	 * another vfs_list_read_lock() to be called, which can lead to a
+	 * deadlock. To avoid this, we extract the data we need into a local
+	 * list, which we can then walk without holding vfs_list_read_lock().
+	 * We keep the list in the same order as the vfs_list.
+	 */
+ do {
+ /* Skip mounts we shouldn't show */
+ if (vfsp->vfs_flag & VFS_NOMNTTAB) {
+ goto nextfs;
+ }
+
+ printp = kmem_alloc(sizeof (*printp), KM_SLEEP);
+ refstr_hold(vfsp->vfs_mntpt);
+ printp->vfs_mntpt = vfsp->vfs_mntpt;
+ refstr_hold(vfsp->vfs_resource);
+ printp->vfs_resource = vfsp->vfs_resource;
+ printp->vfs_flag = vfsp->vfs_flag;
+ printp->vfs_fstype = vfsp->vfs_fstype;
+ printp->next = NULL;
+
+ *print_tail = printp;
+ print_tail = &printp->next;
+
+nextfs:
+ vfsp = (zone == global_zone) ?
+ vfsp->vfs_next : vfsp->vfs_zone_next;
+
+ } while (vfsp != vfslist);
+
+ vfs_list_unlock();
+
+	/*
+	 * Now we can run through what we've extracted without holding
+	 * vfs_list_read_lock().
+	 */
+ printp = print_head;
+ while (printp != NULL) {
+ struct print_data *printp_next;
+ const char *resource;
+ char *mntpt;
+ struct vnode *vp;
+ int error;
+
+ mntpt = (char *)refstr_value(printp->vfs_mntpt);
+ resource = refstr_value(printp->vfs_resource);
+
+ if (mntpt != NULL && mntpt[0] != '\0')
+ mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
+ else
+ mntpt = "-";
+
+ error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
+
+ if (error != 0)
+ goto nextp;
+
+ if (!(vp->v_flag & VROOT)) {
+ VN_RELE(vp);
+ goto nextp;
+ }
+ VN_RELE(vp);
+
+ if (resource != NULL && resource[0] != '\0') {
+ if (resource[0] == '/') {
+ resource = ZONE_PATH_VISIBLE(resource, zone) ?
+ ZONE_PATH_TRANSLATE(resource, zone) :
+ mntpt;
+ }
+ } else {
+ resource = "-";
+ }
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%s %s %s %s 0 0\n",
+ resource, mntpt, vfssw[printp->vfs_fstype].vsw_name,
+ printp->vfs_flag & VFS_RDONLY ? "ro" : "rw");
+
+nextp:
+ printp_next = printp->next;
+ refstr_rele(printp->vfs_mntpt);
+ refstr_rele(printp->vfs_resource);
+ kmem_free(printp, sizeof (*printp));
+ printp = printp_next;
+
+ }
+}
+
+/*
+ * lxpr_read_partitions():
+ *
+ * We don't support partitions in a local zone because it requires access to
+ * physical devices. But we need to fake up enough of the file to show that we
+ * have no partitions.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_printf(uiobuf,
+ "major minor #blocks name rio rmerge rsect ruse "
+ "wio wmerge wsect wuse running use aveq\n\n");
+}
+
+/*
+ * lxpr_read_version(): read the contents of the "version" file. Note that
+ * we don't lie here -- we don't pretend that we're Linux. If lxproc is to
+ * be used in a Linux-branded zone, there will need to be a mount option to
+ * indicate that Linux should be more fully mimicked.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_printf(uiobuf,
+ "%s version %s (%s version %d.%d.%d) "
+ "#%s SMP %s\n",
+ utsname.sysname, utsname.release,
+#if defined(__GNUC__)
+ "gcc",
+ __GNUC__,
+ __GNUC_MINOR__,
+ __GNUC_PATCHLEVEL__,
+#else
+ "Sun C",
+ __SUNPRO_C / 0x100,
+ (__SUNPRO_C & 0xff) / 0x10,
+ __SUNPRO_C & 0xf,
+#endif
+ utsname.version,
+ "00:00:00 00/00/00");
+}
+
+/*
+ * lxpr_read_stat(): read the contents of the "stat" file.
+ *
+ */
+/* ARGSUSED */
+static void
+lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ cpu_t *cp, *cpstart;
+ int pools_enabled;
+ ulong_t idle_cum = 0;
+ ulong_t sys_cum = 0;
+ ulong_t user_cum = 0;
+ ulong_t irq_cum = 0;
+ uint_t cpu_nrunnable_cum = 0;
+ uint_t w_io_cum = 0;
+
+ ulong_t pgpgin_cum = 0;
+ ulong_t pgpgout_cum = 0;
+ ulong_t pgswapout_cum = 0;
+ ulong_t pgswapin_cum = 0;
+ ulong_t intr_cum = 0;
+ ulong_t pswitch_cum = 0;
+ ulong_t forks_cum = 0;
+ hrtime_t msnsecs[NCMSTATES];
+
+ /* temporary variable since scalehrtime modifies data in place */
+ hrtime_t tmptime;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_STAT);
+
+ mutex_enter(&cpu_lock);
+ pools_enabled = pool_pset_enabled();
+
+ /* Calculate cumulative stats */
+ cp = cpstart = CPU->cpu_part->cp_cpulist;
+ do {
+ int i;
+
+ /*
+ * Don't count CPUs that aren't even in the system
+ * or aren't up yet.
+ */
+ if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+ continue;
+ }
+
+ get_cpu_mstate(cp, msnsecs);
+
+ idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]);
+ sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]);
+ user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]);
+
+ pgpgin_cum += CPU_STATS(cp, vm.pgpgin);
+ pgpgout_cum += CPU_STATS(cp, vm.pgpgout);
+ pgswapin_cum += CPU_STATS(cp, vm.pgswapin);
+ pgswapout_cum += CPU_STATS(cp, vm.pgswapout);
+
+ cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable;
+ w_io_cum += CPU_STATS(cp, sys.iowait);
+ for (i = 0; i < NCMSTATES; i++) {
+ tmptime = cp->cpu_intracct[i];
+ scalehrtime(&tmptime);
+ irq_cum += NSEC_TO_TICK(tmptime);
+ }
+
+ for (i = 0; i < PIL_MAX; i++)
+ intr_cum += CPU_STATS(cp, sys.intr[i]);
+
+ pswitch_cum += CPU_STATS(cp, sys.pswitch);
+ forks_cum += CPU_STATS(cp, sys.sysfork);
+ forks_cum += CPU_STATS(cp, sys.sysvfork);
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+
+ lxpr_uiobuf_printf(uiobuf, "cpu %ld %ld %ld %ld %ld %ld %ld\n",
+ user_cum, 0, sys_cum, idle_cum, 0, irq_cum, 0);
+
+ /* Do per processor stats */
+ do {
+ int i;
+
+ ulong_t idle_ticks;
+ ulong_t sys_ticks;
+ ulong_t user_ticks;
+ ulong_t irq_ticks = 0;
+
+ /*
+ * Don't count CPUs that aren't even in the system
+ * or aren't up yet.
+ */
+ if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+ continue;
+ }
+
+ get_cpu_mstate(cp, msnsecs);
+
+ idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]);
+ sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]);
+ user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]);
+
+ for (i = 0; i < NCMSTATES; i++) {
+ tmptime = cp->cpu_intracct[i];
+ scalehrtime(&tmptime);
+ irq_ticks += NSEC_TO_TICK(tmptime);
+ }
+
+ lxpr_uiobuf_printf(uiobuf,
+ "cpu%d %ld %ld %ld %ld %ld %ld %ld\n",
+ cp->cpu_id, user_ticks, 0, sys_ticks, idle_ticks,
+ 0, irq_ticks, 0);
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+
+ mutex_exit(&cpu_lock);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "page %lu %lu\n"
+ "swap %lu %lu\n"
+ "intr %lu\n"
+ "ctxt %lu\n"
+ "btime %lu\n"
+ "processes %lu\n"
+ "procs_running %lu\n"
+ "procs_blocked %lu\n",
+ pgpgin_cum, pgpgout_cum,
+ pgswapin_cum, pgswapout_cum,
+ intr_cum,
+ pswitch_cum,
+ boot_time,
+ forks_cum,
+ cpu_nrunnable_cum,
+ w_io_cum);
+}
+
+/*
+ * lxpr_read_uptime(): read the contents of the "uptime" file.
+ *
+ * Format is: "%.2lf %.2lf", uptime_secs, idle_secs.
+ * We use fixed point arithmetic to get 2 decimal places.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ cpu_t *cp, *cpstart;
+ int pools_enabled;
+ ulong_t idle_cum = 0;
+ ulong_t cpu_count = 0;
+ ulong_t idle_s;
+ ulong_t idle_cs;
+ ulong_t up_s;
+ ulong_t up_cs;
+ hrtime_t birthtime;
+ hrtime_t centi_sec = 10000000; /* 10^7 */
+
+ ASSERT(lxpnp->lxpr_type == LXPR_UPTIME);
+
+ /* Calculate cumulative stats */
+ mutex_enter(&cpu_lock);
+ pools_enabled = pool_pset_enabled();
+
+ cp = cpstart = CPU;
+ do {
+ /*
+ * Don't count CPUs that aren't even in the system
+ * or aren't up yet.
+ */
+ if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+ continue;
+ }
+
+ idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle);
+ idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait);
+ cpu_count += 1;
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+ mutex_exit(&cpu_lock);
+
+	/* Get the zone's zsched process startup time */
+ birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart;
+ up_cs = (gethrtime() - birthtime) / centi_sec;
+ up_s = up_cs / 100;
+ up_cs %= 100;
+
+ ASSERT(cpu_count > 0);
+ idle_cum /= cpu_count;
+ idle_s = idle_cum / hz;
+ idle_cs = idle_cum % hz;
+ idle_cs *= 100;
+ idle_cs /= hz;
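+	/*
+	 * For example, with hz == 100 (a common setting), an average of
+	 * 12345 idle ticks yields idle_s == 123 and idle_cs == 45, printed
+	 * below as "123.45".
+	 */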
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs);
+}
+
+static const char *amd_x_edx[] = {
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, "syscall",
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, "mp",
+ "nx", NULL, "mmxext", NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, "lm", "3dnowext", "3dnow"
+};
+
+static const char *amd_x_ecx[] = {
+ "lahf_lm", NULL, "svm", NULL,
+ "altmovcr8"
+};
+
+static const char *tm_x_edx[] = {
+ "recovery", "longrun", NULL, "lrti"
+};
+
+/*
+ * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx."
+ */
+static const char *intc_x_edx[] = {
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, "syscall",
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ "nx", NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, "lm", NULL, NULL
+};
+
+static const char *intc_edx[] = {
+ "fpu", "vme", "de", "pse",
+ "tsc", "msr", "pae", "mce",
+ "cx8", "apic", NULL, "sep",
+ "mtrr", "pge", "mca", "cmov",
+ "pat", "pse36", "pn", "clflush",
+ NULL, "dts", "acpi", "mmx",
+ "fxsr", "sse", "sse2", "ss",
+ "ht", "tm", "ia64", "pbe"
+};
+
+/*
+ * "sse3" on linux is called "pni" (Prescott New Instructions).
+ */
+static const char *intc_ecx[] = {
+ "pni", NULL, NULL, "monitor",
+ "ds_cpl", NULL, NULL, "est",
+ "tm2", NULL, "cid", NULL,
+ NULL, "cx16", "xtpr"
+};
+
+static void
+lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ int i;
+ uint32_t bits;
+ cpu_t *cp, *cpstart;
+ int pools_enabled;
+ const char **fp;
+ char brandstr[CPU_IDSTRLEN];
+ struct cpuid_regs cpr;
+ int maxeax;
+ int std_ecx, std_edx, ext_ecx, ext_edx;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO);
+
+ mutex_enter(&cpu_lock);
+ pools_enabled = pool_pset_enabled();
+
+ cp = cpstart = CPU;
+ do {
+ /*
+ * This returns the maximum eax value for standard cpuid
+ * functions in eax.
+ */
+ cpr.cp_eax = 0;
+ (void) cpuid_insn(cp, &cpr);
+ maxeax = cpr.cp_eax;
+
+ /*
+ * Get standard x86 feature flags.
+ */
+ cpr.cp_eax = 1;
+ (void) cpuid_insn(cp, &cpr);
+ std_ecx = cpr.cp_ecx;
+ std_edx = cpr.cp_edx;
+
+ /*
+ * Now get extended feature flags.
+ */
+ cpr.cp_eax = 0x80000001;
+ (void) cpuid_insn(cp, &cpr);
+ ext_ecx = cpr.cp_ecx;
+ ext_edx = cpr.cp_edx;
+
+ (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "processor\t: %d\n"
+ "vendor_id\t: %s\n"
+ "cpu family\t: %d\n"
+ "model\t\t: %d\n"
+ "model name\t: %s\n"
+ "stepping\t: %d\n"
+ "cpu MHz\t\t: %u.%03u\n",
+ cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp),
+ cpuid_getmodel(cp), brandstr, cpuid_getstep(cp),
+ (uint32_t)(cpu_freq_hz / 1000000),
+ ((uint32_t)(cpu_freq_hz / 1000)) % 1000);
+
+ lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n",
+ getl2cacheinfo(cp, NULL, NULL, NULL) / 1024);
+
+ if (is_x86_feature(x86_featureset, X86FSET_HTT)) {
+ /*
+ * 'siblings' is used for HT-style threads
+ */
+ lxpr_uiobuf_printf(uiobuf,
+ "physical id\t: %lu\n"
+ "siblings\t: %u\n",
+ pg_plat_hw_instance_id(cp, PGHW_CHIP),
+ cpuid_get_ncpu_per_chip(cp));
+ }
+
+ /*
+ * Since we're relatively picky about running on older hardware,
+ * we can be somewhat cavalier about the answers to these ones.
+ *
+ * In fact, given the hardware we support, we just say:
+ *
+ * fdiv_bug : no (if we're on a 64-bit kernel)
+ * hlt_bug : no
+ * f00f_bug : no
+ * coma_bug : no
+		 *	wp	 : yes	(write protect in supervisor mode)
+ */
+ lxpr_uiobuf_printf(uiobuf,
+ "fdiv_bug\t: %s\n"
+ "hlt_bug \t: no\n"
+ "f00f_bug\t: no\n"
+ "coma_bug\t: no\n"
+ "fpu\t\t: %s\n"
+ "fpu_exception\t: %s\n"
+ "cpuid level\t: %d\n"
+ "flags\t\t:",
+#if defined(__i386)
+ fpu_pentium_fdivbug ? "yes" : "no",
+#else
+ "no",
+#endif /* __i386 */
+ fpu_exists ? "yes" : "no", fpu_exists ? "yes" : "no",
+ maxeax);
+
+ for (bits = std_edx, fp = intc_edx, i = 0;
+ i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+ /*
+ * name additional features where appropriate
+ */
+ switch (x86_vendor) {
+ case X86_VENDOR_Intel:
+ for (bits = ext_edx, fp = intc_x_edx, i = 0;
+ i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]);
+ fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+ break;
+
+ case X86_VENDOR_AMD:
+ for (bits = ext_edx, fp = amd_x_edx, i = 0;
+ i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]);
+ fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+ for (bits = ext_ecx, fp = amd_x_ecx, i = 0;
+ i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]);
+ fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+ break;
+
+ case X86_VENDOR_TM:
+ for (bits = ext_edx, fp = tm_x_edx, i = 0;
+ i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]);
+ fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+ break;
+ default:
+ break;
+ }
+
+ for (bits = std_ecx, fp = intc_ecx, i = 0;
+ i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+ lxpr_uiobuf_printf(uiobuf, "\n\n");
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+
+ mutex_exit(&cpu_lock);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD);
+ lxpr_uiobuf_seterr(uiobuf, EFAULT);
+}
+
+/*
+ * lxpr_getattr(): Vnode operation for VOP_GETATTR()
+ */
+static int
+lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ register lxpr_node_t *lxpnp = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ extern uint_t nproc;
+ int error;
+
+	/*
+	 * Return the attributes of the underlying vnode if ATTR_REAL,
+	 * but keep fd files with the symlink permissions.
+	 */
+ if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) {
+ vnode_t *rvp = lxpnp->lxpr_realvp;
+
+		/*
+		 * Withhold attribute information from anyone but the
+		 * owner or root.
+		 */
+ if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) {
+ return (error);
+ }
+
+		/*
+		 * Now fetch its attributes.
+		 */
+ if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) {
+ return (error);
+ }
+
+ /*
+ * if it's a file in lx /proc/pid/fd/xx then set its
+ * mode and keep it looking like a symlink
+ */
+ if (type == LXPR_PID_FD_FD) {
+ vap->va_mode = lxpnp->lxpr_mode;
+ vap->va_type = vp->v_type;
+ vap->va_size = 0;
+ vap->va_nlink = 1;
+ }
+ return (0);
+ }
+
+ /* Default attributes, that may be overridden below */
+ bzero(vap, sizeof (*vap));
+ vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time;
+ vap->va_nlink = 1;
+ vap->va_type = vp->v_type;
+ vap->va_mode = lxpnp->lxpr_mode;
+ vap->va_fsid = vp->v_vfsp->vfs_dev;
+ vap->va_blksize = DEV_BSIZE;
+ vap->va_uid = lxpnp->lxpr_uid;
+ vap->va_gid = lxpnp->lxpr_gid;
+ vap->va_nodeid = lxpnp->lxpr_ino;
+
+ switch (type) {
+ case LXPR_PROCDIR:
+ vap->va_nlink = nproc + 2 + PROCDIRFILES;
+ vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE;
+ break;
+ case LXPR_PIDDIR:
+ vap->va_nlink = PIDDIRFILES;
+ vap->va_size = PIDDIRFILES * LXPR_SDSIZE;
+ break;
+ case LXPR_SELF:
+ vap->va_uid = crgetruid(curproc->p_cred);
+ vap->va_gid = crgetrgid(curproc->p_cred);
+ break;
+ default:
+ break;
+ }
+
+ vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size);
+ return (0);
+}
+
+/*
+ * lxpr_access(): Vnode operation for VOP_ACCESS()
+ */
+static int
+lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
+{
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ int shift = 0;
+ proc_t *tp;
+
+ /* lx /proc is a read only file system */
+ if (mode & VWRITE)
+ return (EROFS);
+
+ /*
+ * If this is a restricted file, check access permissions.
+ */
+ switch (lxpnp->lxpr_type) {
+ case LXPR_PIDDIR:
+ return (0);
+ case LXPR_PID_CURDIR:
+ case LXPR_PID_ENV:
+ case LXPR_PID_EXE:
+ case LXPR_PID_MAPS:
+ case LXPR_PID_MEM:
+ case LXPR_PID_ROOTDIR:
+ case LXPR_PID_FDDIR:
+ case LXPR_PID_FD_FD:
+ if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL)
+ return (ENOENT);
+ if (tp != curproc && secpolicy_proc_access(cr) != 0 &&
+ priv_proc_cred_perm(cr, tp, NULL, mode) != 0) {
+ lxpr_unlock(tp);
+ return (EACCES);
+ }
+ lxpr_unlock(tp);
+ default:
+ break;
+ }
+
+ if (lxpnp->lxpr_realvp != NULL) {
+ /*
+ * For these we use the underlying vnode's accessibility.
+ */
+ return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct));
+ }
+
+ /* If user is root allow access regardless of permission bits */
+ if (secpolicy_proc_access(cr) == 0)
+ return (0);
+
+ /*
+ * Access check is based on only one of owner, group, public. If not
+ * owner, then check group. If not a member of the group, then check
+ * public access.
+ */
+ if (crgetuid(cr) != lxpnp->lxpr_uid) {
+ shift += 3;
+ if (!groupmember((uid_t)lxpnp->lxpr_gid, cr))
+ shift += 3;
+ }
+
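+	/*
+	 * VREAD/VWRITE/VEXEC are the owner-position bits (0400/0200/0100),
+	 * so shifting the file mode left by 3 (group) or 6 (other) lines
+	 * the relevant permission bits up with the requested mode.
+	 */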
+ mode &= ~(lxpnp->lxpr_mode << shift);
+
+ if (mode == 0)
+ return (0);
+
+ return (EACCES);
+}
+
+/* ARGSUSED */
+static vnode_t *
+lxpr_lookup_not_a_dir(vnode_t *dp, char *comp)
+{
+ return (NULL);
+}
+
+/*
+ * lxpr_lookup(): Vnode operation for VOP_LOOKUP()
+ */
+/* ARGSUSED */
+static int
+lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ lxpr_node_t *lxpnp = VTOLXP(dp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ int error;
+
+ ASSERT(dp->v_type == VDIR);
+ ASSERT(type >= 0 && type < LXPR_NFILES);
+
+ /*
+ * we should never get here because the lookup
+ * is done on the realvp for these nodes
+ */
+ ASSERT(type != LXPR_PID_FD_FD &&
+ type != LXPR_PID_CURDIR &&
+ type != LXPR_PID_ROOTDIR);
+
+ /*
+ * restrict lookup permission to owner or root
+ */
+ if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) {
+ return (error);
+ }
+
+ /*
+ * Just return the parent vnode if that's where we are trying to go.
+ */
+ if (strcmp(comp, "..") == 0) {
+ VN_HOLD(lxpnp->lxpr_parent);
+ *vpp = lxpnp->lxpr_parent;
+ return (0);
+ }
+
+ /*
+ * Special handling for directory searches. Note: null component name
+ * denotes that the current directory is being searched.
+ */
+ if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) {
+ VN_HOLD(dp);
+ *vpp = dp;
+ return (0);
+ }
+
+ *vpp = (lxpr_lookup_function[type](dp, comp));
+ return ((*vpp == NULL) ? ENOENT : 0);
+}
+
+/*
+ * Do a sequential search on the given directory table
+ */
+static vnode_t *
+lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p,
+ lxpr_dirent_t *dirtab, int dirtablen)
+{
+ lxpr_node_t *lxpnp;
+ int count;
+
+ for (count = 0; count < dirtablen; count++) {
+ if (strcmp(dirtab[count].d_name, comp) == 0) {
+ lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0);
+ dp = LXPTOV(lxpnp);
+ ASSERT(dp != NULL);
+ return (dp);
+ }
+ }
+ return (NULL);
+}
+
+static vnode_t *
+lxpr_lookup_piddir(vnode_t *dp, char *comp)
+{
+ proc_t *p;
+
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR);
+
+ p = lxpr_lock(VTOLXP(dp)->lxpr_pid);
+ if (p == NULL)
+ return (NULL);
+
+ dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES);
+
+ lxpr_unlock(p);
+
+ return (dp);
+}
+
+/*
+ * Lookup one of the process's open files.
+ */
+static vnode_t *
+lxpr_lookup_fddir(vnode_t *dp, char *comp)
+{
+ lxpr_node_t *dlxpnp = VTOLXP(dp);
+ lxpr_node_t *lxpnp;
+ vnode_t *vp = NULL;
+ proc_t *p;
+ file_t *fp;
+ uint_t fd;
+ int c;
+ uf_entry_t *ufp;
+ uf_info_t *fip;
+
+ ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR);
+
+ /*
+ * convert the string rendition of the filename
+ * to a file descriptor
+ */
+ fd = 0;
+ while ((c = *comp++) != '\0') {
+ int ofd;
+ if (c < '0' || c > '9')
+ return (NULL);
+
+ ofd = fd;
+ fd = 10*fd + c - '0';
+ /* integer overflow */
+ if (fd / 10 != ofd)
+ return (NULL);
+ }
+
+ /*
+ * get the proc to work with and lock it
+ */
+ p = lxpr_lock(dlxpnp->lxpr_pid);
+	if (p == NULL)
+ return (NULL);
+
+ /*
+ * If the process is a zombie or system process
+ * it can't have any open files.
+ */
+ if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) {
+ lxpr_unlock(p);
+ return (NULL);
+ }
+
+ /*
+ * get us a fresh node/vnode
+ */
+ lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd);
+
+ /*
+ * get open file info
+ */
+ fip = (&(p)->p_user.u_finfo);
+ mutex_enter(&fip->fi_lock);
+
+ /*
+ * got the fd data so now done with this proc
+ */
+ lxpr_unlock(p);
+
+ if (fd < fip->fi_nfiles) {
+ UF_ENTER(ufp, fip, fd);
+		/*
+		 * Ensure the fd is still kosher; it may have gone away
+		 * between the readdir and the lookup.
+		 */
+ if (fip->fi_list[fd].uf_file == NULL) {
+ mutex_exit(&fip->fi_lock);
+ UF_EXIT(ufp);
+ lxpr_freenode(lxpnp);
+ return (NULL);
+ }
+
+ if ((fp = ufp->uf_file) != NULL)
+ vp = fp->f_vnode;
+ UF_EXIT(ufp);
+ }
+ mutex_exit(&fip->fi_lock);
+
+ if (vp == NULL) {
+ lxpr_freenode(lxpnp);
+ return (NULL);
+ } else {
+ /*
+ * Fill in the lxpr_node so future references will be able to
+ * find the underlying vnode. The vnode is held on the realvp.
+ */
+ lxpnp->lxpr_realvp = vp;
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+
+ dp = LXPTOV(lxpnp);
+ ASSERT(dp != NULL);
+
+ return (dp);
+}
+
+static vnode_t *
+lxpr_lookup_netdir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR);
+
+ dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES);
+
+ return (dp);
+}
+
+static vnode_t *
+lxpr_lookup_procdir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR);
+
+	/*
+	 * We know all the names of files & dirs in our file system structure
+	 * except those that are pid names. These change as pids are created/
+	 * deleted etc., so we just look for a number as the first char to see
+	 * if we are doing a pid lookup.
+	 *
+	 * Don't need to check for "self" as it is implemented as a symlink.
+	 */
+ if (*comp >= '0' && *comp <= '9') {
+ pid_t pid = 0;
+ lxpr_node_t *lxpnp = NULL;
+ proc_t *p;
+ int c;
+
+ while ((c = *comp++) != '\0')
+ pid = 10 * pid + c - '0';
+
+ /*
+ * Can't continue if the process is still loading or it doesn't
+ * really exist yet (or maybe it just died!)
+ */
+ p = lxpr_lock(pid);
+ if (p == NULL)
+ return (NULL);
+
+ if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+ lxpr_unlock(p);
+ return (NULL);
+ }
+
+ /*
+ * allocate and fill in a new lxpr node
+ */
+ lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0);
+
+ lxpr_unlock(p);
+
+ dp = LXPTOV(lxpnp);
+ ASSERT(dp != NULL);
+
+ return (dp);
+ }
+
+ /* Lookup fixed names */
+ return (lxpr_lookup_common(dp, comp, NULL, lxpr_dir, PROCDIRFILES));
+}
+
+/*
+ * lxpr_readdir(): Vnode operation for VOP_READDIR()
+ */
+/* ARGSUSED */
+static int
+lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ lxpr_node_t *lxpnp = VTOLXP(dp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ ssize_t uresid;
+ off_t uoffset;
+ int error;
+
+ ASSERT(dp->v_type == VDIR);
+ ASSERT(type >= 0 && type < LXPR_NFILES);
+
+ /*
+ * we should never get here because the readdir
+ * is done on the realvp for these nodes
+ */
+ ASSERT(type != LXPR_PID_FD_FD &&
+ type != LXPR_PID_CURDIR &&
+ type != LXPR_PID_ROOTDIR);
+
+ /*
+ * restrict readdir permission to owner or root
+ */
+ if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0)
+ return (error);
+
+ uoffset = uiop->uio_offset;
+ uresid = uiop->uio_resid;
+
+ /* can't do negative reads */
+ if (uoffset < 0 || uresid <= 0)
+ return (EINVAL);
+
+ /* can't read directory entries that don't exist! */
+ if (uoffset % LXPR_SDSIZE)
+ return (ENOENT);
+
+ return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp));
+}
+
+/* ARGSUSED */
+static int
+lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ return (ENOTDIR);
+}
+
+/*
+ * This has the common logic for returning directory entries
+ */
+static int
+lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp,
+ lxpr_dirent_t *dirtab, int dirtablen)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+
+ oresid = uiop->uio_resid;
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /*
+ * Satisfy user request
+ */
+ while ((uresid = uiop->uio_resid) > 0) {
+ int dirindex;
+ off_t uoffset;
+ int reclen;
+ int error;
+
+ uoffset = uiop->uio_offset;
+ dirindex = (uoffset / LXPR_SDSIZE) - 2;
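+		/* dirindex is the table slot; the "- 2" skips "." and ".." */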
+
+ if (uoffset == 0) {
+
+ dirent->d_ino = lxpnp->lxpr_ino;
+ dirent->d_name[0] = '.';
+ dirent->d_name[1] = '\0';
+ reclen = DIRENT64_RECLEN(1);
+
+ } else if (uoffset == LXPR_SDSIZE) {
+
+ dirent->d_ino = lxpr_parentinode(lxpnp);
+ dirent->d_name[0] = '.';
+ dirent->d_name[1] = '.';
+ dirent->d_name[2] = '\0';
+ reclen = DIRENT64_RECLEN(2);
+
+ } else if (dirindex < dirtablen) {
+ int slen = strlen(dirtab[dirindex].d_name);
+
+ dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type,
+ lxpnp->lxpr_pid, 0);
+
+ ASSERT(slen < LXPNSIZ);
+ (void) strcpy(dirent->d_name, dirtab[dirindex].d_name);
+ reclen = DIRENT64_RECLEN(slen);
+
+ } else {
+ /* Run out of table entries */
+ if (eofp) {
+ *eofp = 1;
+ }
+ return (0);
+ }
+
+ dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+ dirent->d_reclen = (ushort_t)reclen;
+
+		/*
+		 * If the size of the data to transfer is greater than that
+		 * requested, then we can't do it in this transfer.
+		 */
+ if (reclen > uresid) {
+ /*
+ * Error if no entries have been returned yet.
+ */
+ if (uresid == oresid) {
+ return (EINVAL);
+ }
+ break;
+ }
+
+ /*
+ * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+ * by the same amount. But we want uiop->uio_offset to change
+ * in increments of LXPR_SDSIZE, which is different from the
+ * number of bytes being returned to the user. So we set
+ * uiop->uio_offset separately, ignoring what uiomove() does.
+ */
+ if (error = uiomove((caddr_t)dirent, reclen, UIO_READ, uiop)) {
+ return (error);
+ }
+
+ uiop->uio_offset = uoffset + LXPR_SDSIZE;
+ }
+
+ /* Have run out of space, but could have just done last table entry */
+ if (eofp) {
+ *eofp =
+ (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0;
+ }
+ return (0);
+}
+
+
+static int
+lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+ off_t uoffset;
+ zoneid_t zoneid;
+ pid_t pid;
+ int error;
+ int ceof;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR);
+
+ oresid = uiop->uio_resid;
+ zoneid = LXPTOZ(lxpnp)->zone_id;
+
+	/*
+	 * We return directory entries in the order: "." and ".." then the
+	 * unique lxproc files, then the directories corresponding to the
+	 * running processes. We have defined this as the ordering because
+	 * it allows us to more easily keep track of where we are between
+	 * calls to getdents(). Even if the number of processes changes
+	 * between calls, we can't lose our place in the fixed lxproc files.
+	 */
+
+ /* Do the fixed entries */
+ error = lxpr_readdir_common(lxpnp, uiop, &ceof, lxpr_dir,
+ PROCDIRFILES);
+
+ /* Finished if we got an error or if we couldn't do all the table */
+ if (error != 0 || ceof == 0)
+ return (error);
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /* Do the process entries */
+ while ((uresid = uiop->uio_resid) > 0) {
+ proc_t *p;
+ int len;
+ int reclen;
+ int i;
+
+ uoffset = uiop->uio_offset;
+
+ /*
+ * Stop when entire proc table has been examined.
+ */
+ i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES;
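+		/*
+		 * (The first 2 + PROCDIRFILES slots are ".", "..", and the
+		 * fixed lxproc files.)
+		 */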
+ if (i >= v.v_proc) {
+ /* Run out of table entries */
+ if (eofp) {
+ *eofp = 1;
+ }
+ return (0);
+ }
+ mutex_enter(&pidlock);
+
+ /*
+ * Skip indices for which there is no pid_entry, PIDs for
+ * which there is no corresponding process, a PID of 0,
+ * and anything the security policy doesn't allow
+ * us to look at.
+ */
+ if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL ||
+ p->p_pid == 0 ||
+ secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+ mutex_exit(&pidlock);
+ goto next;
+ }
+ mutex_exit(&pidlock);
+
+ /*
+ * Convert pid to the Linux default of 1 if we're the zone's
+ * init process, otherwise use the value from the proc
+ * structure
+ */
+ pid = ((p->p_pid != curproc->p_zone->zone_proc_initpid) ?
+ p->p_pid : 1);
+
+ /*
+ * If this /proc was mounted in the global zone, view
+ * all procs; otherwise, only view zone member procs.
+ */
+ if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) {
+ goto next;
+ }
+
+ ASSERT(p->p_stat != 0);
+
+ dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0);
+ len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid);
+ ASSERT(len < LXPNSIZ);
+ reclen = DIRENT64_RECLEN(len);
+
+ dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+ dirent->d_reclen = (ushort_t)reclen;
+
+		/*
+		 * If the size of the data to transfer is greater than that
+		 * requested, then we can't do it in this transfer.
+		 */
+ if (reclen > uresid) {
+ /*
+ * Error if no entries have been returned yet.
+ */
+ if (uresid == oresid)
+ return (EINVAL);
+ break;
+ }
+
+		/*
+		 * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+		 * by the same amount. But we want uiop->uio_offset to change
+		 * in increments of LXPR_SDSIZE, which is different from the
+		 * number of bytes being returned to the user. So we set
+		 * uiop->uio_offset separately, at the bottom of the loop,
+		 * ignoring what uiomove() does.
+		 */
+ if (error = uiomove((caddr_t)dirent, reclen, UIO_READ, uiop))
+ return (error);
+next:
+ uiop->uio_offset = uoffset + LXPR_SDSIZE;
+ }
+
+ if (eofp != NULL) {
+ *eofp = (uiop->uio_offset >=
+ ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0;
+ }
+
+ return (0);
+}
+
+static int
+lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ proc_t *p;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR);
+
+ /* can't read its contents if it died */
+ mutex_enter(&pidlock);
+
+ p = prfind((lxpnp->lxpr_pid == 1) ?
+ curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid);
+
+ if (p == NULL || p->p_stat == SIDL) {
+ mutex_exit(&pidlock);
+ return (ENOENT);
+ }
+ mutex_exit(&pidlock);
+
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES));
+}
+
+static int
+lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_NETDIR);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES));
+}
+
+static int
+lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+ off_t uoffset;
+ int error;
+ int ceof;
+ proc_t *p;
+ int fddirsize;
+ uf_info_t *fip;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR);
+
+ oresid = uiop->uio_resid;
+
+ /* can't read its contents if it died */
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL)
+ return (ENOENT);
+
+ /* Get open file info */
+ fip = (&(p)->p_user.u_finfo);
+
+ if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) {
+ fddirsize = 0;
+ } else {
+ fddirsize = fip->fi_nfiles;
+ }
+
+ mutex_enter(&fip->fi_lock);
+ lxpr_unlock(p);
+
+ /* Do the fixed entries (in this case just "." & "..") */
+	error = lxpr_readdir_common(lxpnp, uiop, &ceof, NULL, 0);
+
+ /* Finished if we got an error or if we couldn't do all the table */
+ if (error != 0 || ceof == 0)
+ return (error);
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /*
+ * Loop until user's request is satisfied or until
+ * all file descriptors have been examined.
+ */
+ for (; (uresid = uiop->uio_resid) > 0;
+ uiop->uio_offset = uoffset + LXPR_SDSIZE) {
+ int reclen;
+ int fd;
+ int len;
+
+ uoffset = uiop->uio_offset;
+
+ /*
+ * Stop at the end of the fd list
+ */
+ fd = (uoffset / LXPR_SDSIZE) - 2;
+ if (fd >= fddirsize) {
+ if (eofp) {
+ *eofp = 1;
+ }
+ goto out;
+ }
+
+ if (fip->fi_list[fd].uf_file == NULL)
+ continue;
+
+ dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd);
+ len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd);
+ ASSERT(len < LXPNSIZ);
+ reclen = DIRENT64_RECLEN(len);
+
+ dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+ dirent->d_reclen = (ushort_t)reclen;
+
+ if (reclen > uresid) {
+ /*
+ * Error if no entries have been returned yet.
+ */
+ if (uresid == oresid)
+ error = EINVAL;
+ goto out;
+ }
+
+ if (error = uiomove((caddr_t)dirent, reclen, UIO_READ, uiop))
+ goto out;
+ }
+
+ if (eofp != NULL) {
+ *eofp =
+ (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0;
+ }
+
+out:
+ mutex_exit(&fip->fi_lock);
+ return (error);
+}
+
+
+/*
+ * lxpr_readlink(): Vnode operation for VOP_READLINK()
+ */
+/* ARGSUSED */
+static int
+lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct)
+{
+ char bp[MAXPATHLEN + 1];
+ size_t buflen = sizeof (bp);
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ vnode_t *rvp = lxpnp->lxpr_realvp;
+ pid_t pid;
+ int error = 0;
+
+ /* must be a symbolic link file */
+ if (vp->v_type != VLNK)
+ return (EINVAL);
+
+ /* Try to produce a symlink name for anything that has a realvp */
+ if (rvp != NULL) {
+ if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0)
+ return (error);
+ if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0)
+ return (error);
+ } else {
+ switch (lxpnp->lxpr_type) {
+ case LXPR_SELF:
+ /*
+ * Convert pid to the Linux default of 1 if we're the
+ * zone's init process
+ */
+ pid = ((curproc->p_pid !=
+ curproc->p_zone->zone_proc_initpid)
+ ? curproc->p_pid : 1);
+
+ /*
+ * Don't need to check result as every possible int
+ * will fit within MAXPATHLEN bytes.
+ */
+ (void) snprintf(bp, buflen, "%d", pid);
+ break;
+ case LXPR_PID_CURDIR:
+ case LXPR_PID_ROOTDIR:
+ case LXPR_PID_EXE:
+ return (EACCES);
+ default:
+			/*
+			 * Need to return an error so that nothing thinks
+			 * the symlink is empty and hence ".".
+			 */
+ return (EINVAL);
+ }
+ }
+
+ /* copy the link data to user space */
+ return (uiomove(bp, strlen(bp), UIO_READ, uiop));
+}
+
+/*
+ * lxpr_inactive(): Vnode operation for VOP_INACTIVE()
+ * Vnode is no longer referenced, deallocate the file
+ * and all its resources.
+ */
+/* ARGSUSED */
+static void
+lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ lxpr_freenode(VTOLXP(vp));
+}
+
+/*
+ * lxpr_sync(): Vnode operation for VOP_SYNC()
+ */
+static int
+lxpr_sync()
+{
+ /*
+ * Nothing to sync but this function must never fail
+ */
+ return (0);
+}
+
+/*
+ * lxpr_cmp(): Vnode operation for VOP_CMP()
+ */
+static int
+lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
+{
+ vnode_t *rvp;
+
+ while (vn_matchops(vp1, lxpr_vnodeops) &&
+ (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) {
+ vp1 = rvp;
+ }
+
+ while (vn_matchops(vp2, lxpr_vnodeops) &&
+ (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) {
+ vp2 = rvp;
+ }
+
+ if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops))
+ return (vp1 == vp2);
+
+ return (VOP_CMP(vp1, vp2, ct));
+}
+
+/*
+ * lxpr_realvp(): Vnode operation for VOP_REALVP()
+ */
+static int
+lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
+{
+ vnode_t *rvp;
+
+ if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) {
+ vp = rvp;
+ if (VOP_REALVP(vp, &rvp, ct) == 0)
+ vp = rvp;
+ }
+
+ *vpp = vp;
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/lxproc/lxproc.h b/usr/src/uts/common/fs/lxproc/lxproc.h
new file mode 100644
index 0000000000..a06bef1570
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxproc.h
@@ -0,0 +1,275 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _LXPROC_H
+#define _LXPROC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * lxproc.h: declarations, data structures and macros for lxprocfs
+ */
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/debug.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/user.h>
+#include <sys/t_lock.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/dnlc.h>
+#include <sys/atomic.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <vm/as.h>
+#include <vm/anon.h>
+
+#define LX_SIGHUP 1
+#define LX_SIGINT 2
+#define LX_SIGQUIT 3
+#define LX_SIGILL 4
+#define LX_SIGTRAP 5
+#define LX_SIGABRT 6
+#define LX_SIGIOT 6
+#define LX_SIGBUS 7
+#define LX_SIGFPE 8
+#define LX_SIGKILL 9
+#define LX_SIGUSR1 10
+#define LX_SIGSEGV 11
+#define LX_SIGUSR2 12
+#define LX_SIGPIPE 13
+#define LX_SIGALRM 14
+#define LX_SIGTERM 15
+#define LX_SIGSTKFLT 16
+#define LX_SIGCHLD 17
+#define LX_SIGCONT 18
+#define LX_SIGSTOP 19
+#define LX_SIGTSTP 20
+#define LX_SIGTTIN 21
+#define LX_SIGTTOU 22
+#define LX_SIGURG 23
+#define LX_SIGXCPU 24
+#define LX_SIGXFSZ 25
+#define LX_SIGVTALRM 26
+#define LX_SIGPROF 27
+#define LX_SIGWINCH 28
+#define LX_SIGIO 29
+#define LX_SIGPOLL LX_SIGIO
+#define LX_SIGPWR 30
+#define LX_SIGSYS 31
+#define LX_SIGUNUSED 31
+
+#define LX_NSIG_WORDS 2
+#define LX_NBPW 32
+#define LX_NSIG ((LX_NBPW * LX_NSIG_WORDS) + 1)
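+/* LX_NSIG is (32 * 2) + 1 == 65; the extra slot allows 1-based indexing */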
+
+#define LX_SIGRTMIN 32
+#define	LX_SIGRTMAX	(LX_NSIG - 1)
+
+/*
+ * Convert a vnode into an lxpr_mnt_t
+ */
+#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data)
+
+/*
+ * convert a vnode into an lxpr_node
+ */
+#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data)
+
+/*
+ * convert a lxprnode into a vnode
+ */
+#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode)
+
+/*
+ * convert a lxpr_node into zone for fs
+ */
+#define LXPTOZ(lxpnp) \
+ (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone)
+
+#define LXPNSIZ 256 /* max size of lx /proc file name entries */
+
+/*
+ * Pretend that a directory entry takes 16 bytes
+ */
+#define LXPR_SDSIZE 16
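+/*
+ * With this fixed size, readdir offsets map directly to entries: offset 0
+ * is ".", offset 16 is "..", and offset (n + 2) * 16 is the n'th entry of
+ * the directory's table (see lxpr_readdir_common()).
+ */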
+
+/*
+ * Node/file types for lx /proc files
+ * (directories and files contained therein).
+ */
+typedef enum lxpr_nodetype {
+ LXPR_PROCDIR, /* /proc */
+ LXPR_PIDDIR, /* /proc/<pid> */
+ LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */
+ LXPR_PID_CPU, /* /proc/<pid>/cpu */
+ LXPR_PID_CURDIR, /* /proc/<pid>/cwd */
+ LXPR_PID_ENV, /* /proc/<pid>/environ */
+ LXPR_PID_EXE, /* /proc/<pid>/exe */
+ LXPR_PID_MAPS, /* /proc/<pid>/maps */
+ LXPR_PID_MEM, /* /proc/<pid>/mem */
+ LXPR_PID_ROOTDIR, /* /proc/<pid>/root */
+ LXPR_PID_STAT, /* /proc/<pid>/stat */
+ LXPR_PID_STATM, /* /proc/<pid>/statm */
+ LXPR_PID_STATUS, /* /proc/<pid>/status */
+ LXPR_PID_FDDIR, /* /proc/<pid>/fd */
+ LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */
+ LXPR_CMDLINE, /* /proc/cmdline */
+ LXPR_CPUINFO, /* /proc/cpuinfo */
+ LXPR_DEVICES, /* /proc/devices */
+ LXPR_DMA, /* /proc/dma */
+ LXPR_FILESYSTEMS, /* /proc/filesystems */
+ LXPR_INTERRUPTS, /* /proc/interrupts */
+ LXPR_IOPORTS, /* /proc/ioports */
+ LXPR_KCORE, /* /proc/kcore */
+ LXPR_KMSG, /* /proc/kmsg */
+ LXPR_LOADAVG, /* /proc/loadavg */
+ LXPR_MEMINFO, /* /proc/meminfo */
+ LXPR_MOUNTS, /* /proc/mounts */
+ LXPR_NETDIR, /* /proc/net */
+ LXPR_NET_ARP, /* /proc/net/arp */
+ LXPR_NET_DEV, /* /proc/net/dev */
+ LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */
+ LXPR_NET_IGMP, /* /proc/net/igmp */
+ LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */
+ LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */
+ LXPR_NET_MCFILTER, /* /proc/net/mcfilter */
+ LXPR_NET_NETSTAT, /* /proc/net/netstat */
+ LXPR_NET_RAW, /* /proc/net/raw */
+ LXPR_NET_ROUTE, /* /proc/net/route */
+ LXPR_NET_RPC, /* /proc/net/rpc */
+ LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */
+ LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */
+ LXPR_NET_SNMP, /* /proc/net/snmp */
+ LXPR_NET_STAT, /* /proc/net/stat */
+ LXPR_NET_TCP, /* /proc/net/tcp */
+ LXPR_NET_UDP, /* /proc/net/udp */
+ LXPR_NET_UNIX, /* /proc/net/unix */
+ LXPR_PARTITIONS, /* /proc/partitions */
+ LXPR_SELF, /* /proc/self */
+ LXPR_STAT, /* /proc/stat */
+ LXPR_UPTIME, /* /proc/uptime */
+ LXPR_VERSION, /* /proc/version */
+ LXPR_NFILES /* number of lx /proc file types */
+} lxpr_nodetype_t;
+
+/*
+ * Number of fds allowed for in the inode number calculation
+ * per process (if a process has more fds than this, inode
+ * numbers may be duplicated).
+ */
+#define LXPR_FD_PERPROC 2000
+
+/*
+ * external dirent characteristics
+ */
+#define LXPRMAXNAMELEN 14
+typedef struct {
+ lxpr_nodetype_t d_type;
+ char d_name[LXPRMAXNAMELEN];
+} lxpr_dirent_t;
+
+/*
+ * This is the lxprocfs private data object
+ * which is attached to v_data in the vnode structure
+ */
+typedef struct lxpr_node {
+ lxpr_nodetype_t lxpr_type; /* type of this node */
+ vnode_t *lxpr_vnode; /* vnode for the node */
+ vnode_t *lxpr_parent; /* parent directory */
+ vnode_t *lxpr_realvp; /* real vnode, file in dirs */
+ timestruc_t lxpr_time; /* creation etc time for file */
+ mode_t lxpr_mode; /* file mode bits */
+ uid_t lxpr_uid; /* file owner */
+ gid_t lxpr_gid; /* file group owner */
+ pid_t lxpr_pid; /* pid of proc referred to */
+ ino_t lxpr_ino; /* node id */
+ ldi_handle_t lxpr_cons_ldih; /* ldi handle for console device */
+} lxpr_node_t;
+
+struct zone; /* forward declaration */
+
+/*
+ * This is the lxprocfs private data object
+ * which is attached to vfs_data in the vfs structure
+ */
+typedef struct lxpr_mnt {
+ lxpr_node_t *lxprm_node; /* node at root of proc mount */
+ struct zone *lxprm_zone; /* zone for this mount */
+ ldi_ident_t lxprm_li; /* ident for ldi */
+} lxpr_mnt_t;
+
+extern vnodeops_t *lxpr_vnodeops;
+extern int nproc_highbit; /* highbit(v.v_nproc) */
+
+typedef struct mounta mounta_t;
+
+extern void lxpr_initnodecache();
+extern void lxpr_fininodecache();
+extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *);
+extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int);
+extern ino_t lxpr_parentinode(lxpr_node_t *);
+extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int);
+extern void lxpr_freenode(lxpr_node_t *);
+
+typedef struct lxpr_uiobuf lxpr_uiobuf_t;
+extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *);
+extern void lxpr_uiobuf_free(lxpr_uiobuf_t *);
+extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *);
+extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t);
+extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t);
+extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...);
+extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int);
+
+proc_t *lxpr_lock(pid_t);
+void lxpr_unlock(proc_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LXPROC_H */
diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c
index ad2fed01dc..8473788d8a 100644
--- a/usr/src/uts/common/fs/nfs/nfs_server.c
+++ b/usr/src/uts/common/fs/nfs/nfs_server.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Bayard G. Bell. All rights reserved.
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
*/
/*
@@ -2520,6 +2521,9 @@ nfs_srvinit(void)
{
int error;
+ if (getzoneid() != GLOBAL_ZONEID)
+ return (EACCES);
+
error = nfs_exportinit();
if (error != 0)
return (error);
diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c
index 55a48bb2cc..53709139cc 100644
--- a/usr/src/uts/common/fs/proc/prcontrol.c
+++ b/usr/src/uts/common/fs/proc/prcontrol.c
@@ -24,6 +24,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/param.h>
@@ -935,7 +939,7 @@ pr_control32(int32_t cmd, arg32_t *argp, prnode_t *pnp, cred_t *cr)
case PCREAD: /* read from the address space */
case PCWRITE: /* write to the address space */
- if (PROCESS_NOT_32BIT(p))
+ if (PROCESS_NOT_32BIT(p) || (pnp->pr_flags & PR_OFFMAX))
error = EOVERFLOW;
else {
enum uio_rw rw = (cmd == PCREAD)? UIO_READ : UIO_WRITE;
diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h
index 1294421f9f..ce925778f2 100644
--- a/usr/src/uts/common/fs/proc/prdata.h
+++ b/usr/src/uts/common/fs/proc/prdata.h
@@ -23,6 +23,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
@@ -183,6 +187,7 @@ typedef struct prnode {
#define PR_INVAL 0x01 /* vnode is invalidated */
#define PR_ISSELF 0x02 /* vnode is a self-open */
#define PR_AOUT 0x04 /* vnode is for an a.out path */
+#define PR_OFFMAX 0x08 /* vnode is a large file open */
/*
* Conversion macros.
diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c
index a3e95a60fc..7831c1f9ea 100644
--- a/usr/src/uts/common/fs/proc/prvnops.c
+++ b/usr/src/uts/common/fs/proc/prvnops.c
@@ -23,6 +23,10 @@
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
*/
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
@@ -337,6 +341,15 @@ propen(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
}
/*
+ * If this is a large file open, indicate that in our flags -- some
+ * procfs structures are not off_t-neutral (e.g., priovec_t), and
+ * the open will need to be differentiated where 32-bit processes
+ * pass these structures across the user/kernel boundary.
+ */
+ if (flag & FOFFMAX)
+ pnp->pr_flags |= PR_OFFMAX;
+
+ /*
* Do file-specific things.
*/
switch (type) {
diff --git a/usr/src/uts/common/fs/swapfs/swap_subr.c b/usr/src/uts/common/fs/swapfs/swap_subr.c
index 74c4302da9..a4d983665b 100644
--- a/usr/src/uts/common/fs/swapfs/swap_subr.c
+++ b/usr/src/uts/common/fs/swapfs/swap_subr.c
@@ -110,9 +110,11 @@ swapfs_recalc(pgcnt_t pgs)
* memory that can be used as swap space should do so by
* setting swapfs_desfree at boot time, not swapfs_minfree.
* However, swapfs_minfree is tunable by install as a
- * workaround for bugid 1147463.
+ * workaround for bugid 1147463. Note that swapfs_minfree is
+ * set to 1/8th of physical memory, but clamped at 256 MB.
*/
- new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3);
+ new_swapfs_minfree = MIN(MAX(btopr(2 * 1024 * 1024), pgs >> 3),
+ btopr(256 * 1024 * 1024));
}
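The clamp above computes MIN(MAX(2 MB, physmem/8), 256 MB) in pages. A minimal user-land sketch of the same arithmetic (a hypothetical standalone program, not part of the patch, assuming a 4 KB page size):

    #include <stdio.h>

    #define PAGESIZE    4096
    #define BTOPR(x)    (((x) + PAGESIZE - 1) / PAGESIZE) /* bytes to pages, rounded up */
    #define MAX(a, b)   ((a) > (b) ? (a) : (b))
    #define MIN(a, b)   ((a) < (b) ? (a) : (b))

    int
    main(void)
    {
        unsigned long long mem[] = { 1ULL << 30, 8ULL << 30, 64ULL << 30 };

        for (int i = 0; i < 3; i++) {
            unsigned long long pgs = mem[i] / PAGESIZE;
            unsigned long long minfree = MIN(MAX(BTOPR(2ULL << 20),
                pgs >> 3), BTOPR(256ULL << 20));
            printf("%llu GB RAM -> swapfs_minfree = %llu pages (%llu MB)\n",
                mem[i] >> 30, minfree, (minfree * PAGESIZE) >> 20);
        }
        return (0);
    }

With these inputs the 1/8th term dominates at 1 GB (128 MB), and the 256 MB cap takes effect for any system with more than 2 GB of memory.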
/*
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
index f8a36a528f..f22cc3ecf0 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -76,7 +77,7 @@ static vfsdef_t vfw = {
VFSDEF_VERSION,
"tmpfs",
tmpfsinit,
- VSW_HASPROTO|VSW_STATS|VSW_ZMOUNT,
+ VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT,
&tmpfs_proto_opttbl
};
@@ -249,7 +250,7 @@ tmp_mount(
return (ENOTDIR);
mutex_enter(&mvp->v_lock);
- if ((uap->flags & MS_OVERLAY) == 0 &&
+ if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 &&
(mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
mutex_exit(&mvp->v_lock);
return (EBUSY);
@@ -286,6 +287,21 @@ tmp_mount(
(uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn))
goto out;
+ if (uap->flags & MS_REMOUNT) {
+ tm = (struct tmount *)VFSTOTM(vfsp);
+
+ /*
+ * If the new size is less than what is currently being used,
+ * we allow that. The file system will simply be full until
+ * enough files have been removed to get below the new max.
+ */
+ mutex_enter(&tm->tm_contents);
+ tm->tm_anonmax = anonmax;
+ mutex_exit(&tm->tm_contents);
+ goto out;
+ }
+
if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) {
pn_free(&dpn);
error = ENOMEM;
diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c
index e24f2d3b32..8442894ecd 100644
--- a/usr/src/uts/common/fs/vfs.c
+++ b/usr/src/uts/common/fs/vfs.c
@@ -3877,6 +3877,8 @@ vfs_to_modname(const char *vfstype)
vfstype = "fdfs";
} else if (strncmp(vfstype, "nfs", 3) == 0) {
vfstype = "nfs";
+ } else if (strcmp(vfstype, "lxproc") == 0) {
+ vfstype = "lxprocfs";
}
return (vfstype);
diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c
index 382369c7fc..67f21866ec 100644
--- a/usr/src/uts/common/fs/vnode.c
+++ b/usr/src/uts/common/fs/vnode.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent Inc. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -65,6 +66,7 @@
#include <fs/fs_subr.h>
#include <sys/taskq.h>
#include <fs/fs_reparse.h>
+#include <sys/time.h>
/* Determine if this vnode is a file that is read-only */
#define ISROFILE(vp) \
@@ -199,6 +201,11 @@ static void (**vsd_destructor)(void *);
cr = crgetmapped(cr); \
}
+#define VOP_LATENCY_10MS 10000000
+#define VOP_LATENCY_100MS 100000000
+#define VOP_LATENCY_1S 1000000000
+#define VOP_LATENCY_10S 10000000000
+
/*
* Convert stat(2) formats to vnode types and vice versa. (Knows about
* numerical order of S_IFMT and vnode types.)
@@ -3220,14 +3227,57 @@ fop_read(
cred_t *cr,
caller_context_t *ct)
{
- int err;
ssize_t resid_start = uiop->uio_resid;
+ zone_t *zonep = curzone;
+ zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
+
+ hrtime_t start, lat;
+ ssize_t len;
+ int err;
+
+ if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) {
+ start = gethrtime();
+
+ mutex_enter(&zonep->zone_vfs_lock);
+ kstat_runq_enter(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+ }
VOPXID_MAP_CR(vp, cr);
err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
- VOPSTATS_UPDATE_IO(vp, read,
- read_bytes, (resid_start - uiop->uio_resid));
+ len = resid_start - uiop->uio_resid;
+
+ VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
+
+ if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) {
+ mutex_enter(&zonep->zone_vfs_lock);
+ zonep->zone_vfs_rwstats.reads++;
+ zonep->zone_vfs_rwstats.nread += len;
+ kstat_runq_exit(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+
+ lat = gethrtime() - start;
+
+ if (lat >= VOP_LATENCY_10MS) {
+ if (lat < VOP_LATENCY_100MS)
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ else if (lat < VOP_LATENCY_1S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ } else if (lat < VOP_LATENCY_10S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ } else {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
+ }
+ }
+ }
+
return (err);
}
@@ -3239,14 +3289,62 @@ fop_write(
cred_t *cr,
caller_context_t *ct)
{
- int err;
ssize_t resid_start = uiop->uio_resid;
+ zone_t *zonep = curzone;
+ zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
+
+ hrtime_t start, lat;
+ ssize_t len;
+ int err;
+
+ /*
+ * For the purposes of VFS kstat consumers, the "waitq" calculation is
+ * repurposed as the active queue for VFS write operations. There's no
+ * actual wait queue for VFS operations.
+ */
+ if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) {
+ start = gethrtime();
+
+ mutex_enter(&zonep->zone_vfs_lock);
+ kstat_waitq_enter(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+ }
VOPXID_MAP_CR(vp, cr);
err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
- VOPSTATS_UPDATE_IO(vp, write,
- write_bytes, (resid_start - uiop->uio_resid));
+ len = resid_start - uiop->uio_resid;
+
+ VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
+
+ if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) {
+ mutex_enter(&zonep->zone_vfs_lock);
+ zonep->zone_vfs_rwstats.writes++;
+ zonep->zone_vfs_rwstats.nwritten += len;
+ kstat_waitq_exit(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+
+ lat = gethrtime() - start;
+
+ if (lat >= VOP_LATENCY_10MS) {
+ if (lat < VOP_LATENCY_100MS)
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ else if (lat < VOP_LATENCY_1S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ } else if (lat < VOP_LATENCY_10S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ } else {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
+ }
+ }
+ }
+
return (err);
}
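Both hunks bucket each operation's latency cumulatively: an op that took 1.5 s increments the 10 ms, 100 ms, and 1 s counters but not the 10 s one. A minimal user-land sketch of that bucketing (hypothetical plain counters standing in for the zone kstats; the kernel uses atomic_inc_64() on the zv_*_ops values):

    #include <stdio.h>

    #define VOP_LATENCY_10MS    10000000LL
    #define VOP_LATENCY_100MS   100000000LL
    #define VOP_LATENCY_1S      1000000000LL
    #define VOP_LATENCY_10S     10000000000LL

    static unsigned long long ops_10ms, ops_100ms, ops_1s, ops_10s;

    /* Cumulative buckets: a slow op bumps every threshold at or below its latency. */
    static void
    record_latency(long long lat)
    {
        if (lat < VOP_LATENCY_10MS)
            return;
        ops_10ms++;
        if (lat < VOP_LATENCY_100MS)
            return;
        ops_100ms++;
        if (lat < VOP_LATENCY_1S)
            return;
        ops_1s++;
        if (lat >= VOP_LATENCY_10S)
            ops_10s++;
    }

    int
    main(void)
    {
        record_latency(1500000000LL);  /* 1.5 s: 10ms, 100ms and 1s buckets */
        record_latency(20000000LL);    /* 20 ms: 10ms bucket only */
        printf("10ms=%llu 100ms=%llu 1s=%llu 10s=%llu\n",
            ops_10ms, ops_100ms, ops_1s, ops_10s);
        return (0);
    }

This prints 10ms=2 100ms=1 1s=1 10s=0, matching the nested if/else chains in the patch.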
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index e039b9cac4..98aad58025 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/*
@@ -125,6 +126,7 @@
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+#include <sys/zfs_zone.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
@@ -2017,6 +2019,16 @@ arc_reclaim_needed(void)
if (availrmem < swapfs_minfree + swapfs_reserve + extra)
return (1);
+ /*
+ * Check that we have enough availrmem that memory locking (e.g., via
+ * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
+ * stores the number of pages that cannot be locked; when availrmem
+ * drops below pages_pp_maximum, page locking mechanisms such as
+ * page_pp_lock() will fail.)
+ */
+ if (availrmem <= pages_pp_maximum)
+ return (1);
+
#if defined(__i386)
/*
* If we're on an i386 platform, it's possible that we'll exhaust the
@@ -2917,6 +2929,14 @@ top:
rzio = zio_read(pio, spa, bp, buf->b_data, size,
arc_read_done, buf, priority, zio_flags, zb);
+ /*
+ * At this point, this read I/O has already missed in the ARC
+ * and will be going through to the disk. The I/O throttle
+ * should delay this I/O if this zone is using more than its I/O
+ * priority allows.
+ */
+ zfs_zone_io_throttle(ZFS_ZONE_IOP_READ);
+
if (*arc_flags & ARC_WAIT)
return (zio_wait(rzio));
@@ -3484,9 +3504,6 @@ arc_init(void)
if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
arc_meta_limit = zfs_arc_meta_limit;
- if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
- arc_c_min = arc_meta_limit / 2;
-
if (zfs_arc_grow_retry > 0)
arc_grow_retry = zfs_arc_grow_retry;
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 7a0abd22b5..16e42b951a 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -2703,7 +2703,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dt.dl.dr_copies);
mutex_exit(&db->db_mtx);
} else if (db->db_state == DB_NOFILL) {
- ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+ ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
+ zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(zio, os->os_spa, txg,
db->db_blkptr, NULL, db->db.db_size, &zp,
dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 39234eba53..743f5c4656 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011, Joyent, Inc. All rights reserved.
*/
#include <sys/dmu.h>
@@ -950,6 +951,7 @@ xuio_stat_wbuf_nocopy()
}
#ifdef _KERNEL
+
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
@@ -1562,7 +1564,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
if (wp & WP_NOFILL) {
ASSERT(!ismd && level == 0);
- checksum = ZIO_CHECKSUM_OFF;
+ checksum = ZIO_CHECKSUM_NOPARITY;
compress = ZIO_COMPRESS_OFF;
dedup = B_FALSE;
}
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index b4579e278c..2301942907 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -39,11 +39,11 @@
#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
#include <sys/varargs.h>
+#include <sys/zfs_zone.h>
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
uint64_t arg1, uint64_t arg2);
-
dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
@@ -223,6 +223,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
if (len == 0)
return;
+ zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE);
+
min_bs = SPA_MINBLOCKSHIFT;
max_bs = SPA_MAXBLOCKSHIFT;
min_ibs = DN_MIN_INDBLKSHIFT;
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index 5ef7f54af1..aeeefd178e 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#include <sys/dmu_objset.h>
@@ -4133,9 +4134,13 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
dsl_dataset_t *snap;
uint64_t used, comp, uncomp;
- err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
- if (err != 0)
- break;
+ if (snapobj == new->ds_object) {
+ snap = new;
+ } else {
+ err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
+ if (err != 0)
+ break;
+ }
if (snap->ds_phys->ds_prev_snap_txg ==
oldsnap->ds_phys->ds_creation_txg) {
@@ -4164,7 +4169,8 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
* was not a snapshot of/before new.
*/
snapobj = snap->ds_phys->ds_prev_snap_obj;
- dsl_dataset_rele(snap, FTAG);
+ if (snap != new)
+ dsl_dataset_rele(snap, FTAG);
if (snapobj == 0) {
err = EINVAL;
break;
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
index 1cd49c8274..b6af7598e2 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -36,6 +36,7 @@
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/sunddi.h>
+#include <sys/zfs_zone.h>
#include "zfs_namecheck.h"
static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
@@ -839,7 +840,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
} else {
if (err == EAGAIN) {
- txg_delay(dd->dd_pool, tx->tx_txg, 1);
+ txg_delay(dd->dd_pool, tx->tx_txg,
+ zfs_zone_txg_delay());
err = ERESTART;
}
dsl_pool_memory_pressure(dd->dd_pool);
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index 418a04c7c2..316b37cebd 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -40,6 +40,7 @@
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
+#include <sys/zfs_zone.h>
int zfs_no_write_throttle = 0;
int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
@@ -529,11 +530,11 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
/*
* If this transaction group is over 7/8ths capacity, delay
- * the caller 1 clock tick. This will slow down the "fill"
- * rate until the sync process can catch up with us.
+ * the caller some number of clock ticks. This will slow down the
+ * "fill" rate until the sync process can catch up with us.
*/
if (reserved && reserved > (write_limit - (write_limit >> 3)))
- txg_delay(dp, tx->tx_txg, 1);
+ txg_delay(dp, tx->tx_txg, zfs_zone_txg_delay());
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
index b748571ea0..ffca0a7dcb 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
@@ -21,13 +21,12 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_VDEV_DISK_H
#define _SYS_VDEV_DISK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/vdev.h>
#ifdef _KERNEL
#include <sys/buf.h>
@@ -40,14 +39,22 @@
extern "C" {
#endif
+#ifdef _KERNEL
typedef struct vdev_disk {
ddi_devid_t vd_devid;
char *vd_minor;
ldi_handle_t vd_lh;
} vdev_disk_t;
+#endif
+extern int vdev_disk_physio(vdev_t *, caddr_t, size_t, uint64_t, int);
+
+/*
+ * Since vdev_disk.c is not compiled into libzpool, this function should only be
+ * defined in the zfs kernel module.
+ */
#ifdef _KERNEL
-extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
+extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
#endif
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 1df61a587d..c297ae165c 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -102,6 +102,7 @@ struct vdev_queue {
avl_tree_t vq_read_tree;
avl_tree_t vq_write_tree;
avl_tree_t vq_pending_tree;
+ zoneid_t vq_last_zone_id;
kmutex_t vq_lock;
};
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h
new file mode 100644
index 0000000000..496b718bd6
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h
@@ -0,0 +1,49 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2011 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_RAIDZ_H
+#define _SYS_VDEV_RAIDZ_H
+
+#include <sys/vdev.h>
+#include <sys/semaphore.h>
+#include <sys/buf.h>
+#ifdef _KERNEL
+#include <sys/ddi.h>
+#include <sys/sunldi.h>
+#include <sys/sunddi.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+extern int vdev_raidz_physio(vdev_t *,
+ caddr_t, size_t, uint64_t, uint64_t, boolean_t);
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_RAIDZ_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h
new file mode 100644
index 0000000000..069ec004f3
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h
@@ -0,0 +1,62 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_ZONE_H
+#define _SYS_FS_ZFS_ZONE_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ ZFS_ZONE_IOP_READ = 0,
+ ZFS_ZONE_IOP_WRITE,
+ ZFS_ZONE_IOP_LOGICAL_WRITE,
+} zfs_zone_iop_type_t;
+
+extern void zfs_zone_io_throttle(zfs_zone_iop_type_t);
+
+extern void zfs_zone_zio_init(zio_t *);
+extern void zfs_zone_zio_start(zio_t *);
+extern void zfs_zone_zio_done(zio_t *);
+extern void zfs_zone_zio_dequeue(zio_t *);
+extern void zfs_zone_zio_enqueue(zio_t *);
+extern void zfs_zone_report_txg_sync(void *);
+extern int zfs_zone_txg_delay();
+#ifdef _KERNEL
+extern zio_t *zfs_zone_schedule(vdev_queue_t *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_ZONE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index f6cf3f5349..032b77715f 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -24,6 +24,7 @@
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#ifndef _ZIO_H
@@ -79,6 +80,8 @@ enum zio_checksum {
ZIO_CHECKSUM_FLETCHER_4,
ZIO_CHECKSUM_SHA256,
ZIO_CHECKSUM_ZILOG2,
+ ZIO_CHECKSUM_SHA256_MAC,
+ ZIO_CHECKSUM_NOPARITY,
ZIO_CHECKSUM_FUNCTIONS
};
@@ -421,6 +424,9 @@ struct zio {
zio_cksum_report_t *io_cksum_report;
uint64_t io_ena;
+ zoneid_t io_zoneid; /* zone which originated this I/O */
+ hrtime_t io_start; /* time I/O entered zio pipeline */
+ hrtime_t io_dispatched; /* time I/O was dispatched to disk */
/* Taskq dispatching state */
taskq_ent_t io_tqent;
};
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
index 55b1f3884b..2269ef271e 100644
--- a/usr/src/uts/common/fs/zfs/txg.c
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -30,6 +30,7 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/callb.h>
+#include <sys/zfs_zone.h>
/*
* Pool-wide transaction groups.
@@ -411,6 +412,8 @@ txg_sync_thread(dsl_pool_t *dp)
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
mutex_exit(&tx->tx_sync_lock);
+ zfs_zone_report_txg_sync(dp);
+
start = ddi_get_lbolt();
spa_sync(spa, txg);
delta = ddi_get_lbolt() - start;
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index d7417736b4..f78580d0f1 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -20,9 +20,11 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
+#include <sys/zfs_zone.h>
#include <sys/spa_impl.h>
#include <sys/refcount.h>
#include <sys/vdev_disk.h>
@@ -325,8 +327,18 @@ vdev_disk_close(vdev_t *vd)
}
int
-vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size,
- uint64_t offset, int flags)
+vdev_disk_physio(vdev_t *vd, caddr_t data,
+ size_t size, uint64_t offset, int flags)
+{
+ vdev_disk_t *dvd = vd->vdev_tsd;
+
+ ASSERT(vd->vdev_ops == &vdev_disk_ops);
+ return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
+}
+
+int
+vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
+ size_t size, uint64_t offset, int flags)
{
buf_t *bp;
int error = 0;
@@ -479,6 +491,8 @@ vdev_disk_io_start(zio_t *zio)
bp->b_bufsize = zio->io_size;
bp->b_iodone = (int (*)())vdev_disk_io_intr;
+ zfs_zone_zio_start(zio);
+
/* ldi_strategy() will return non-zero only on programming errors */
VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
@@ -490,6 +504,8 @@ vdev_disk_io_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
+ zfs_zone_zio_done(zio);
+
/*
* If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
* the device has been removed. If this is the case, then we trigger an
@@ -574,7 +590,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
/* read vdev label */
offset = vdev_label_offset(size, l, 0);
- if (vdev_disk_physio(vd_lh, (caddr_t)label,
+ if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
continue;
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 5a0d3ee970..4ea958a9f6 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -21,12 +21,14 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>
+#include <sys/zfs_zone.h>
/*
* These tunables are for performance analysis.
@@ -120,6 +122,8 @@ vdev_queue_init(vdev_t *vd)
avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+ vq->vq_last_zone_id = 0;
}
void
@@ -139,6 +143,7 @@ static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
avl_add(&vq->vq_deadline_tree, zio);
+ zfs_zone_zio_enqueue(zio);
avl_add(zio->io_vdev_tree, zio);
}
@@ -146,6 +151,7 @@ static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
avl_remove(&vq->vq_deadline_tree, zio);
+ zfs_zone_zio_dequeue(zio);
avl_remove(zio->io_vdev_tree, zio);
}
@@ -188,7 +194,11 @@ again:
avl_numnodes(&vq->vq_deadline_tree) == 0)
return (NULL);
+#ifdef _KERNEL
+ fio = lio = zfs_zone_schedule(vq);
+#else
fio = lio = avl_first(&vq->vq_deadline_tree);
+#endif
t = fio->io_vdev_tree;
flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index 4b0f5602c1..6094e01876 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -21,11 +21,15 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
@@ -152,6 +156,8 @@ typedef struct raidz_map {
VDEV_RAIDZ_64MUL_2((x), mask); \
}
+#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
+
/*
* Force reconstruction to use the general purpose method.
*/
@@ -431,12 +437,12 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
};
static raidz_map_t *
-vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
- uint64_t nparity)
+vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
+ uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
raidz_map_t *rm;
- uint64_t b = zio->io_offset >> unit_shift;
- uint64_t s = zio->io_size >> unit_shift;
+ uint64_t b = offset >> unit_shift;
+ uint64_t s = size >> unit_shift;
uint64_t f = b % dcols;
uint64_t o = (b / dcols) << unit_shift;
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
@@ -506,7 +512,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
for (c = 0; c < rm->rm_firstdatacol; c++)
rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
- rm->rm_col[c].rc_data = zio->io_data;
+ rm->rm_col[c].rc_data = data;
for (c = c + 1; c < acols; c++)
rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
@@ -535,7 +541,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
ASSERT(rm->rm_cols >= 2);
ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
- if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
+ if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
devidx = rm->rm_col[0].rc_devidx;
o = rm->rm_col[0].rc_offset;
rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
@@ -547,8 +553,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
rm->rm_skipstart = 1;
}
- zio->io_vsd = rm;
- zio->io_vsd_ops = &vdev_raidz_vsd_ops;
return (rm);
}
@@ -1491,6 +1495,104 @@ vdev_raidz_close(vdev_t *vd)
vdev_close(vd->vdev_child[c]);
}
+/*
+ * Handle a read or write request to a RAID-Z dump device.
+ *
+ * Unlike the normal RAID-Z codepath in vdev_raidz_io_start(), the on-disk
+ * layout of the dump zvol is computed across a full 128KB block. As a
+ * result, an individual I/O may not span all columns in the RAID-Z map;
+ * moreover, a small I/O may span only a single column.
+ *
+ * Note that since there are no parity bits calculated or written, this format
+ * remains the same no matter how many parity bits are used in a normal RAID-Z
+ * stripe.
+ */
+int
+vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
+ uint64_t offset, uint64_t origoffset, boolean_t doread)
+{
+ vdev_t *tvd = vd->vdev_top;
+ vdev_t *cvd;
+ raidz_map_t *rm;
+ raidz_col_t *rc;
+ int c, err = 0;
+
+ uint64_t start, end, colstart, colend;
+ uint64_t coloffset, colsize, colskip;
+
+ int flags = doread ? B_READ : B_WRITE;
+
+#ifdef _KERNEL
+
+ /*
+ * Don't write past the end of the block
+ */
+ VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
+
+ /*
+ * Even if this I/O operation doesn't span the full block size, treat the
+ * on-disk format as if it consisted solely of complete 128KB blocks.
+ */
+ start = offset;
+ end = start + size;
+
+ /*
+ * Allocate a RAID-Z map for this block. Note that this block starts
+ * from the "original" offset, this is, the offset of the extent which
+ * contains the requisite offset of the data being read or written.
+ */
+ rm = vdev_raidz_map_alloc(data - (offset - origoffset),
+ SPA_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, vd->vdev_children,
+ vd->vdev_nparity);
+
+ coloffset = origoffset;
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols;
+ c++, coloffset += rc->rc_size) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+
+ /*
+ * Find the start and end of this column in the RAID-Z matrix,
+ * keeping in mind that the stated size and offset of the
+ * operation may not fill the entire column for this vdev.
+ *
+ * If any portion of the data being read or written spans this
+ * column, issue the appropriate operation to the child vdev.
+ */
+ if (coloffset + rc->rc_size <= start)
+ continue;
+ if (coloffset >= end)
+ continue;
+
+ colstart = MAX(coloffset, start);
+ colend = MIN(end, coloffset + rc->rc_size);
+ colsize = colend - colstart;
+ colskip = colstart - coloffset;
+
+ VERIFY3U(colsize, <=, rc->rc_size);
+ VERIFY3U(colskip, <=, rc->rc_size);
+
+ /*
+ * Note that the child vdev will have a vdev label at the start
+ * of its range of offsets, hence the need for
+ * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
+ * example of why this calculation is needed.
+ */
+ if ((err = vdev_disk_physio(cvd,
+ ((char *)rc->rc_data) + colskip, colsize,
+ VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
+ flags)) != 0)
+ break;
+ }
+
+ vdev_raidz_map_free(rm);
+#endif /* _KERNEL */
+
+ return (err);
+}
+
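The colstart/colend/colskip/colsize arithmetic above is an interval intersection per column. A minimal user-land sketch with equal-sized, purely hypothetical columns (a real raidz_map also carries parity columns and per-column size variation):

    #include <stdio.h>

    #define MAX(a, b)   ((a) > (b) ? (a) : (b))
    #define MIN(a, b)   ((a) < (b) ? (a) : (b))

    int
    main(void)
    {
        /* Hypothetical 4-wide map of 32K data columns covering one block. */
        unsigned long long colsz = 32768, ncols = 4, origoffset = 0;
        unsigned long long start = 40960, end = start + 20480; /* 20K I/O at 40K */

        for (unsigned long long c = 0; c < ncols; c++) {
            unsigned long long coloffset = origoffset + c * colsz;

            if (coloffset + colsz <= start || coloffset >= end)
                continue;   /* column not touched by this I/O */

            unsigned long long colstart = MAX(coloffset, start);
            unsigned long long colend = MIN(end, coloffset + colsz);
            printf("col %llu: skip %llu, size %llu\n",
                c, colstart - coloffset, colend - colstart);
        }
        return (0);
    }

Here only column 1 intersects the request, with an 8K skip into the column and a 20K transfer, which is exactly the case the hunk's two continue statements and the MAX/MIN pair handle.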
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
@@ -1526,9 +1628,13 @@ vdev_raidz_io_start(zio_t *zio)
raidz_col_t *rc;
int c, i;
- rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
+ rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
+ tvd->vdev_ashift, vd->vdev_children,
vd->vdev_nparity);
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+
ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
if (zio->io_type == ZIO_TYPE_WRITE) {
@@ -1659,6 +1765,13 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
int c, ret = 0;
raidz_col_t *rc;
+ blkptr_t *bp = zio->io_bp;
+ uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+ (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+
+ if (checksum == ZIO_CHECKSUM_NOPARITY)
+ return (ret);
+
for (c = 0; c < rm->rm_firstdatacol; c++) {
rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 929fc06296..baffc223a3 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -23,6 +23,7 @@
* Portions Copyright 2011 Martin Matuska
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -1757,7 +1758,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
}
static int
-zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os,
+ boolean_t cachedpropsonly)
{
int error = 0;
nvlist_t *nv;
@@ -1775,7 +1777,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
* XXX reading with out owning
*/
if (!zc->zc_objset_stats.dds_inconsistent &&
- dmu_objset_type(os) == DMU_OST_ZVOL) {
+ dmu_objset_type(os) == DMU_OST_ZVOL &&
+ !cachedpropsonly) {
error = zvol_get_stats(os, nv);
if (error == EIO)
return (error);
@@ -1802,13 +1805,25 @@ static int
zfs_ioc_objset_stats(zfs_cmd_t *zc)
{
objset_t *os = NULL;
+ nvlist_t *nvl = NULL;
+ boolean_t cachedpropsonly = B_FALSE;
int error;
- if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
+ if (zc->zc_nvlist_src != NULL &&
+ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &nvl) != 0))
return (error);
- error = zfs_ioc_objset_stats_impl(zc, os);
+ if (nvl != NULL) {
+ (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly",
+ &cachedpropsonly);
+ nvlist_free(nvl);
+ }
+
+ if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
+ return (error);
+ error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly);
dmu_objset_rele(os, FTAG);
return (error);
@@ -2022,8 +2037,21 @@ static int
zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
{
objset_t *os;
+ nvlist_t *nvl = NULL;
+ boolean_t cachedpropsonly = B_FALSE;
int error;
+ if (zc->zc_nvlist_src != NULL &&
+ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &nvl) != 0))
+ return (error);
+
+ if (nvl != NULL) {
+ (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly",
+ &cachedpropsonly);
+ nvlist_free(nvl);
+ }
+
top:
if (zc->zc_cookie == 0)
(void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
@@ -2072,8 +2100,10 @@ top:
objset_t *ossnap;
error = dmu_objset_from_ds(ds, &ossnap);
- if (error == 0)
- error = zfs_ioc_objset_stats_impl(zc, ossnap);
+ if (error == 0) {
+ error = zfs_ioc_objset_stats_impl(zc,
+ ossnap, cachedpropsonly);
+ }
dsl_dataset_rele(ds, FTAG);
}
} else if (error == ENOENT) {
@@ -2789,6 +2819,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
uint64_t sense = ZFS_PROP_UNDEFINED;
uint64_t norm = ZFS_PROP_UNDEFINED;
uint64_t u8 = ZFS_PROP_UNDEFINED;
+ int error;
ASSERT(zplprops != NULL);
@@ -2832,8 +2863,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
- if (norm == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
+ if (norm == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
@@ -2842,13 +2874,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
*/
if (norm)
u8 = 1;
- if (u8 == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
+ if (u8 == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);
- if (sense == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
+ if (sense == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 0c39274caf..9fae31fa6b 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -25,6 +25,10 @@
/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */
+/*
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ */
+
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
@@ -4145,6 +4149,8 @@ top:
&zp->z_pflags, 8);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
B_TRUE);
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
}
dmu_tx_commit(tx);
@@ -4655,27 +4661,6 @@ zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
return (0);
}
-/*
- * The reason we push dirty pages as part of zfs_delmap() is so that we get a
- * more accurate mtime for the associated file. Since we don't have a way of
- * detecting when the data was actually modified, we have to resort to
- * heuristics. If an explicit msync() is done, then we mark the mtime when the
- * last page is pushed. The problem occurs when the msync() call is omitted,
- * which by far the most common case:
- *
- * open()
- * mmap()
- * <modify memory>
- * munmap()
- * close()
- * <time lapse>
- * putpage() via fsflush
- *
- * If we wait until fsflush to come along, we can have a modification time that
- * is some arbitrary point in the future. In order to prevent this in the
- * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
- * torn down.
- */
/* ARGSUSED */
static int
zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
@@ -4687,10 +4672,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
- if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
- vn_has_cached_data(vp))
- (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
-
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c
new file mode 100644
index 0000000000..08f4f38e04
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_zone.c
@@ -0,0 +1,1179 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_zone.h>
+
+#ifndef _KERNEL
+
+/*
+ * Stubs for when compiling for user-land.
+ */
+
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+}
+
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_dequeue(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_enqueue(zio_t *zp)
+{
+}
+
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+}
+
+int
+zfs_zone_txg_delay()
+{
+ return (1);
+}
+
+#else
+
+/*
+ * The real code.
+ */
+
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/atomic.h>
+#include <sys/zio.h>
+#include <sys/zone.h>
+#include <sys/avl.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+
+/*
+ * The zone throttle delays read and write operations from certain zones based
+ * on each zone's IO utilization. Once per cycle (defined by zfs_zone_cycle_time
+ * below), the delays for each zone are recalculated based on the utilization
+ * over the previous window.
+ */
+boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */
+uint16_t zfs_zone_delay_step = 5; /* amount to change delay */
+uint16_t zfs_zone_delay_ceiling = 100; /* longest possible delay */
+
+hrtime_t zfs_zone_last_checked = 0;
+
+boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */
+
+/*
+ * For certain workloads, one zone may be issuing primarily sequential I/O and
+ * another primarily random I/O. The sequential I/O will complete much more
+ * quickly than the random I/O, driving the average system latency for those
+ * operations way down. As a result, the random I/O may be throttled back, even
+ * though the sequential I/O should be throttled to allow the random I/O more
+ * access to the disk.
+ *
+ * This tunable limits the discrepancy between the read and write system
+ * latency. If one becomes excessively high, this tunable prevents the I/O
+ * throttler from exacerbating the imbalance.
+ */
+uint_t zfs_zone_rw_lat_limit = 10;
+
+
+/*
+ * The I/O throttle will only start delaying zones when it detects disk
+ * utilization has reached a certain level. This tunable controls the threshold
+ * at which the throttle will start delaying zones. The calculation should
+ * correspond closely with the %b column from iostat.
+ */
+uint_t zfs_zone_util_threshold = 80;
+
+/*
+ * Throughout this subsystem, our timestamps are in microseconds. Our system
+ * average cycle is one second or 1 million microseconds. Our zone counter
+ * update cycle is two seconds or 2 million microseconds. We use a longer
+ * duration for that cycle because some ops can see a little over two seconds of
+ * latency when they are being starved by another zone.
+ */
+uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */
+uint_t zfs_zone_cycle_time = 2000000; /* 2 s */
+
+uint_t zfs_zone_adjust_time = 250000; /* 250 ms */
+
+typedef struct {
+ hrtime_t cycle_start;
+ int cycle_cnt;
+ hrtime_t cycle_lat;
+ hrtime_t sys_avg_lat;
+} sys_lat_cycle_t;
+
+typedef struct {
+ hrtime_t zi_now;
+ uint_t zi_avgrlat;
+ uint_t zi_avgwlat;
+ uint64_t zi_totpri;
+ uint64_t zi_totutil;
+ int zi_active;
+ uint_t zi_diskutil;
+} zoneio_stats_t;
+
+static sys_lat_cycle_t rd_lat;
+static sys_lat_cycle_t wr_lat;
+
+/*
+ * Some basic disk stats to determine disk utilization.
+ */
+kmutex_t zfs_disk_lock;
+uint_t zfs_disk_rcnt;
+hrtime_t zfs_disk_rtime = 0;
+hrtime_t zfs_disk_rlastupdate = 0;
+
+hrtime_t zfs_disk_last_rtime = 0;
+
+/*
+ * Data used to keep track of how often txg flush is running.
+ */
+extern int zfs_txg_timeout;
+static uint_t txg_last_check;
+static uint_t txg_cnt;
+static uint_t txg_flush_rate;
+
+boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */
+/*
+ * Threshold for when zio scheduling should kick in.
+ *
+ * This threshold is based on 1/2 of the zfs_vdev_max_pending value for the
+ * number of I/Os that can be pending on a device. If there are more than a
+ * few ops already queued up, beyond those already issued to the vdev, then
+ * use scheduling to get the next zio.
+ */
+int zfs_zone_schedule_thresh = 5;
+
+/*
+ * Tunables for delay throttling when TxG flush is occurring.
+ */
+int zfs_zone_txg_throttle_scale = 2;
+int zfs_zone_txg_delay_ticks = 2;
+
+typedef struct {
+ int zq_qdepth;
+ int zq_priority;
+ int zq_wt;
+ zoneid_t zq_zoneid;
+} zone_q_bump_t;
+
+/*
+ * This uses gethrtime() but returns a value in usecs.
+ */
+#define GET_USEC_TIME (gethrtime() / 1000)
+#define NANO_TO_MICRO(x) ((x) / (NANOSEC / MICROSEC))
+
+/*
+ * Keep track of the zone's ZFS IOPs.
+ *
+ * If the number of ops is two or more, we can just use that value. However,
+ * if it is less than two, we might have a zone which is trying to do IO but
+ * is not able to get any ops through the system. We don't want to lose track
+ * of this zone, so we factor its decayed count into the current count.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last update
+ * was made. If it was more than one cycle ago, then we need to decay the
+ * historical count by the proper number of additional cycles in which no IO was
+ * performed.
+ *
+ * Return 0 if a previous cycle ended and we computed a new historical count.
+ * If we're still within an active cycle, there is nothing to do; return the
+ * time delta into the current cycle.
+ */
+static hrtime_t
+compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+ hrtime_t delta;
+ int gen_cnt;
+
+ /*
+ * Check if it's time to compute a new zone count.
+ * If we're still collecting data for the current cycle, return the delta.
+ */
+ delta = unow - cp->cycle_start;
+ if (delta < zfs_zone_cycle_time)
+ return (delta);
+
+ /* A previous cycle is past, compute the new zone count. */
+
+ /*
+ * Figure out how many generations we have to decay the historical
+ * count, since multiple cycles may have elapsed since our last IO.
+ * We depend on int rounding here.
+ */
+ gen_cnt = (int)(delta / zfs_zone_cycle_time);
+
+ /* If more than 5 cycles since the last IO, reset the count. */
+ if (gen_cnt > 5) {
+ cp->zone_avg_cnt = 0;
+ } else {
+ /* Update the count. */
+ int i;
+
+ /*
+ * If the zone did more than 1 IO, just use its current count
+ * as the historical value, otherwise decay the historical
+ * count and factor that into the new historical count. We
+ * pick a threshold > 1 so that we don't lose track of IO due
+ * to int rounding.
+ */
+ if (cp->cycle_cnt > 1)
+ cp->zone_avg_cnt = cp->cycle_cnt;
+ else
+ cp->zone_avg_cnt = cp->cycle_cnt +
+ (cp->zone_avg_cnt / 2);
+
+ /*
+ * If more than one generation has elapsed since the last
+ * update, decay the values further.
+ */
+ for (i = 1; i < gen_cnt; i++)
+ cp->zone_avg_cnt = cp->zone_avg_cnt / 2;
+ }
+
+ /* A new cycle begins. */
+ cp->cycle_start = unow;
+ cp->cycle_cnt = 0;
+
+ return (0);
+}
+
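The generation decay in compute_historical_zone_cnt() condenses to a few lines; a minimal user-land sketch of the same logic (a hypothetical helper, not in the patch):

    #include <stdio.h>

    /*
     * Halve the historical count once per elapsed idle cycle beyond the
     * first; reset it entirely after more than 5 idle cycles.
     */
    static unsigned int
    decay_count(unsigned int avg, unsigned int cur, int gen_cnt)
    {
        if (gen_cnt > 5)
            return (0);
        avg = (cur > 1) ? cur : cur + avg / 2;
        while (--gen_cnt > 0)
            avg /= 2;
        return (avg);
    }

    int
    main(void)
    {
        /* 3 idle cycles: blend 40 down to 20, then halve twice to 5. */
        printf("%u\n", decay_count(40, 0, 3));
        return (0);
    }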
+/*
+ * Add IO op data to the zone.
+ */
+static void
+add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
+{
+ switch (op) {
+ case ZFS_ZONE_IOP_READ:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops);
+ zonep->zone_rd_ops.cycle_cnt++;
+ break;
+ case ZFS_ZONE_IOP_WRITE:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops);
+ zonep->zone_wr_ops.cycle_cnt++;
+ break;
+ case ZFS_ZONE_IOP_LOGICAL_WRITE:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops);
+ zonep->zone_lwr_ops.cycle_cnt++;
+ break;
+ }
+}
+
+/*
+ * Use a decaying average to keep track of the overall system latency.
+ *
+ * We want to have the recent activity heavily weighted, but if the
+ * activity decreases or stops, then the average should quickly decay
+ * down to the new value.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last
+ * update was made. If it was more than one cycle ago, then we need to decay
+ * the average by the proper number of additional cycles in which no IO was
+ * performed.
+ *
+ * Return true if we actually computed a new system average.
+ * If we're still within an active cycle there is nothing to do, return false.
+ */
+static int
+compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+ hrtime_t delta;
+ int gen_cnt;
+
+ /*
+ * Check if it's time to compute a new average.
+ * If we're still collecting data for the current cycle, return false.
+ */
+ delta = unow - cp->cycle_start;
+ if (delta < zfs_zone_sys_avg_cycle)
+ return (0);
+
+ /* A previous cycle is past, compute a new system average. */
+
+ /*
+ * Figure out how many generations we have to decay, since multiple
+ * cycles may have elapsed since our last IO.
+ * We count on int rounding here.
+ */
+ gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle);
+
+ /* If more than 5 cycles since the last IO, reset the average. */
+ if (gen_cnt > 5) {
+ cp->sys_avg_lat = 0;
+ } else {
+ /* Update the average. */
+ int i;
+
+ cp->sys_avg_lat =
+ (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);
+
+ /*
+ * If more than one generation has elapsed since the last
+ * update, decay the values further.
+ */
+ for (i = 1; i < gen_cnt; i++)
+ cp->sys_avg_lat = cp->sys_avg_lat / 2;
+ }
+
+ /* A new cycle begins. */
+ cp->cycle_start = unow;
+ cp->cycle_cnt = 0;
+ cp->cycle_lat = 0;
+
+ return (1);
+}
+
+static void
+add_sys_iop(hrtime_t unow, int op, int lat)
+{
+ switch (op) {
+ case ZFS_ZONE_IOP_READ:
+ (void) compute_new_sys_avg(unow, &rd_lat);
+ rd_lat.cycle_cnt++;
+ rd_lat.cycle_lat += lat;
+ break;
+ case ZFS_ZONE_IOP_WRITE:
+ (void) compute_new_sys_avg(unow, &wr_lat);
+ wr_lat.cycle_cnt++;
+ wr_lat.cycle_lat += lat;
+ break;
+ }
+}
+
+/*
+ * Get the zone IO counts.
+ */
+static uint_t
+calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+ hrtime_t delta;
+ uint_t cnt;
+
+ if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) {
+ /*
+ * A new cycle has just begun, so there is no current-cycle
+ * data yet; use the freshly computed historical count.
+ */
+ cnt = cp->zone_avg_cnt;
+ } else {
+ /*
+ * If we're less than half way through the cycle then use
+ * the current count plus half the historical count, otherwise
+ * just use the current count.
+ */
+ if (delta < (zfs_zone_cycle_time / 2))
+ cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2);
+ else
+ cnt = cp->cycle_cnt;
+ }
+
+ return (cnt);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static uint_t
+calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+ if (compute_new_sys_avg(unow, cp)) {
+ /*
+ * A new cycle has just begun, so there is no current-cycle
+ * data yet; use the freshly computed historical average.
+ */
+ return (cp->sys_avg_lat);
+ } else {
+ /*
+ * We're within a cycle; weight the current activity higher
+ * compared to the historical data and use that.
+ */
+ extern void __dtrace_probe_zfs__zone__calc__wt__avg(uintptr_t,
+ uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__calc__wt__avg(
+ (uintptr_t)cp->sys_avg_lat,
+ (uintptr_t)cp->cycle_lat,
+ (uintptr_t)cp->cycle_cnt);
+
+ return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
+ (1 + (cp->cycle_cnt * 8)));
+ }
+}
+
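The in-cycle branch above weights current activity 8x against the historical average. Worked through with hypothetical numbers:

    #include <stdio.h>

    int
    main(void)
    {
        /* Historical average 1000 us; two in-cycle ops totalling 400 us. */
        unsigned long long sys_avg_lat = 1000, cycle_lat = 400, cycle_cnt = 2;

        /* (1000 + 400*8) / (1 + 2*8) = 4200 / 17 = 247 us */
        printf("weighted avg = %llu us\n",
            (sys_avg_lat + cycle_lat * 8) / (1 + cycle_cnt * 8));
        return (0);
    }

Two recent ops averaging 200 us pull the reported figure from 1000 us down to 247 us, so the average tracks a drop in latency quickly.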
+/*
+ * Account for the current IOP on the zone and for the system as a whole.
+ * The latency parameter is in usecs.
+ */
+static void
+add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat)
+{
+ /* Add op to zone */
+ add_zone_iop(zonep, unow, op);
+
+ /* Track system latency */
+ if (op != ZFS_ZONE_IOP_LOGICAL_WRITE)
+ add_sys_iop(unow, op, lat);
+}
+
+/*
+ * Calculate and return the total number of read ops, write ops and logical
+ * write ops for the given zone. If the zone has issued operations of any type
+ * return a non-zero value, otherwise return 0.
+ */
+static int
+get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops,
+ uint_t *lwops)
+{
+ *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops);
+ *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops);
+ *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops);
+
+ extern void __dtrace_probe_zfs__zone__io__cnt(uintptr_t,
+ uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__io__cnt((uintptr_t)zonep->zone_id,
+ (uintptr_t)(*rops), (uintptr_t)*wops, (uintptr_t)*lwops);
+
+ return (*rops | *wops | *lwops);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static void
+get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
+{
+ *rlat = calc_avg_lat(unow, &rd_lat);
+ *wlat = calc_avg_lat(unow, &wr_lat);
+
+ /*
+ * In an attempt to improve the accuracy of the throttling algorithm,
+ * assume that IO operations can't have zero latency. Instead, assume
+ * a reasonable lower bound for each operation type. If the actual
+ * observed latencies are non-zero, use those latency values instead.
+ */
+ if (*rlat == 0)
+ *rlat = 1000;
+ if (*wlat == 0)
+ *wlat = 1000;
+
+ extern void __dtrace_probe_zfs__zone__sys__avg__lat(uintptr_t,
+ uintptr_t);
+
+ __dtrace_probe_zfs__zone__sys__avg__lat((uintptr_t)(*rlat),
+ (uintptr_t)*wlat);
+}
+
+/*
+ * Find disk utilization for each zone and average utilization for all active
+ * zones.
+ */
+static int
+zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
+{
+ zoneio_stats_t *sp = arg;
+ uint_t rops, wops, lwops;
+
+ if (zonep->zone_id == GLOBAL_ZONEID ||
+ get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) {
+ zonep->zone_io_util = 0;
+ return (0);
+ }
+
+ zonep->zone_io_util = (rops * sp->zi_avgrlat) +
+ (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat);
+ sp->zi_totutil += zonep->zone_io_util;
+
+ if (zonep->zone_io_util > 0) {
+ sp->zi_active++;
+ sp->zi_totpri += zonep->zone_zfs_io_pri;
+ }
+
+ /*
+ * sdt:::zfs-zone-utilization
+ *
+ * arg0: zone ID
+ * arg1: read operations observed during time window
+ * arg2: physical write operations observed during time window
+ * arg3: logical write ops observed during time window
+ * arg4: calculated utilization given read and write ops
+ * arg5: I/O priority assigned to this zone
+ */
+ extern void __dtrace_probe_zfs__zone__utilization(
+ uint_t, uint_t, uint_t, uint_t, uint_t, uint_t);
+
+ __dtrace_probe_zfs__zone__utilization((uint_t)(zonep->zone_id),
+ (uint_t)rops, (uint_t)wops, (uint_t)lwops,
+ (uint_t)zonep->zone_io_util, (uint_t)zonep->zone_zfs_io_pri);
+
+ return (0);
+}
+
+static void
+zfs_zone_delay_inc(zone_t *zonep)
+{
+ if (zonep->zone_io_delay < zfs_zone_delay_ceiling)
+ zonep->zone_io_delay += zfs_zone_delay_step;
+}
+
+static void
+zfs_zone_delay_dec(zone_t *zonep)
+{
+ if (zonep->zone_io_delay > 0)
+ zonep->zone_io_delay -= zfs_zone_delay_step;
+}
+
+/*
+ * For all zones "far enough" away from the average utilization, increase that
+ * zone's delay. Otherwise, reduce its delay.
+ */
+static int
+zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
+{
+ zoneio_stats_t *sp = arg;
+ uint16_t delay = zonep->zone_io_delay;
+ uint_t fairutil = 0;
+
+ zonep->zone_io_util_above_avg = B_FALSE;
+
+ /*
+ * Given the calculated total utilization for all zones, calculate the
+ * fair share of I/O for this zone.
+ */
+ if (zfs_zone_priority_enable && sp->zi_totpri > 0) {
+ fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) /
+ sp->zi_totpri;
+ } else if (sp->zi_active > 0) {
+ fairutil = sp->zi_totutil / sp->zi_active;
+ }
+
+ /*
+ * Adjust the zone's delay. If the delay becomes too high, avoid
+ * increasing it beyond the ceiling value.
+ */
+ if (zonep->zone_io_util > fairutil &&
+ sp->zi_diskutil > zfs_zone_util_threshold) {
+ zonep->zone_io_util_above_avg = B_TRUE;
+
+ if (sp->zi_active > 1)
+ zfs_zone_delay_inc(zonep);
+ } else if (zonep->zone_io_util < fairutil || sp->zi_active <= 1) {
+ zfs_zone_delay_dec(zonep);
+ }
+
+ /*
+ * sdt:::zfs-zone-throttle
+ *
+ * arg0: zone ID
+ * arg1: old delay for this zone
+ * arg2: new delay for this zone
+ * arg3: calculated fair I/O utilization
+ * arg4: actual I/O utilization
+ */
+ extern void __dtrace_probe_zfs__zone__throttle(
+ uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__throttle(
+ (uintptr_t)zonep->zone_id, (uintptr_t)delay,
+ (uintptr_t)zonep->zone_io_delay, (uintptr_t)fairutil,
+ (uintptr_t)zonep->zone_io_util);
+
+ return (0);
+}
+
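When priorities are enabled, each zone's fair share is its fraction of the summed priorities applied to the total utilization. A worked sketch with hypothetical values:

    #include <stdio.h>

    int
    main(void)
    {
        /* Total utilization and summed priorities across active zones. */
        unsigned long long totutil = 90000, totpri = 300;
        unsigned long long pri[] = { 100, 50, 150 };

        for (int i = 0; i < 3; i++)
            printf("zone %d fair share = %llu\n", i,
                totutil * pri[i] / totpri);
        return (0);
    }

The shares come out 30000, 15000 and 45000; a zone consuming more than its share while the disk is above zfs_zone_util_threshold gets its delay increased, everyone else gets theirs decreased.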
+/*
+ * Examine the utilization between different zones, and adjust the delay for
+ * each zone appropriately.
+ */
+static void
+zfs_zone_wait_adjust(hrtime_t unow)
+{
+ zoneio_stats_t stats;
+
+ (void) bzero(&stats, sizeof (stats));
+
+ stats.zi_now = unow;
+ get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat);
+
+ if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit)
+ stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit;
+ else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat)
+ stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit;
+
+ if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0)
+ return;
+
+ /*
+ * Calculate disk utilization for the most recent period.
+ */
+ if (zfs_disk_last_rtime == 0 || unow - zfs_zone_last_checked <= 0) {
+ stats.zi_diskutil = 0;
+ } else {
+ stats.zi_diskutil =
+ ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
+ ((unow - zfs_zone_last_checked) * 1000);
+ }
+ zfs_disk_last_rtime = zfs_disk_rtime;
+
+ /*
+ * sdt:::zfs-zone-stats
+ *
+ * Statistics observed over the last period:
+ *
+ * arg0: average system read latency
+ * arg1: average system write latency
+ * arg2: number of active zones
+ * arg3: total I/O 'utilization' for all zones
+ * arg4: total I/O priority of all active zones
+ * arg5: calculated disk utilization
+ */
+ extern void __dtrace_probe_zfs__zone__stats(
+ uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__stats((uintptr_t)(stats.zi_avgrlat),
+ (uintptr_t)(stats.zi_avgwlat),
+ (uintptr_t)(stats.zi_active),
+ (uintptr_t)(stats.zi_totutil),
+ (uintptr_t)(stats.zi_totpri),
+ (uintptr_t)(stats.zi_diskutil));
+
+ (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats);
+}
+
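The zi_diskutil expression above divides a nanosecond busy-time delta by the microsecond wall-clock window; the multiplication by 1000 aligns the units so the result is a percentage, analogous to iostat's %b. Worked through with hypothetical numbers:

    #include <stdio.h>

    int
    main(void)
    {
        unsigned long long rtime_delta = 1600000000ULL; /* 1.6 s busy (ns) */
        unsigned long long window = 2000000ULL;         /* 2 s window (us) */

        /* (1.6e9 * 100) / (2e6 * 1000) = 80 */
        printf("disk util = %llu%%\n",
            (rtime_delta * 100) / (window * 1000));
        return (0);
    }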
+/*
+ * Callback used to calculate a zone's IO schedule priority.
+ *
+ * We scan the zones looking for ones with ops in the queue. Out of those,
+ * we pick the one that calculates to the highest schedule priority.
+ */
+static int
+get_sched_pri_cb(zone_t *zonep, void *arg)
+{
+ int pri;
+ zone_q_bump_t *qbp = arg;
+
+ extern void __dtrace_probe_zfs__zone__enqueued(uintptr_t, uintptr_t);
+ __dtrace_probe_zfs__zone__enqueued((uintptr_t)(zonep->zone_id),
+ (uintptr_t)(zonep->zone_zfs_queued));
+
+ if (zonep->zone_zfs_queued == 0) {
+ zonep->zone_zfs_weight = 0;
+ return (0);
+ }
+
+ /*
+ * On each pass, increment the zone's weight. We use this as input
+ * to the calculation to prevent starvation. The value is reset
+ * each time we issue an IO for this zone so zones which haven't
+ * done any IO over several iterations will see their weight max
+ * out.
+ */
+ if (zonep->zone_zfs_weight < 20)
+ zonep->zone_zfs_weight++;
+
+ /*
+ * This zone's IO priority is the inverse of the number of IOs
+ * the zone has enqueued * zone's configured priority * weight.
+ * The queue depth has already been scaled by 10 to avoid problems
+ * with int rounding.
+ *
+ * This means that zones with fewer IOs in the queue will get
+ * preference unless other zones' assigned priorities pull them
+ * ahead. The weight is factored in to help ensure that zones
+ * which haven't done IO in a while aren't getting starved.
+ */
+ pri = (qbp->zq_qdepth / zonep->zone_zfs_queued) *
+ zonep->zone_zfs_io_pri * zonep->zone_zfs_weight;
+
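+ /*
+ * To illustrate with hypothetical numbers: at a scaled queue depth
+ * of 500 (50 IOs), a zone with 2 queued IOs, priority 100 and
+ * weight 1 scores (500 / 2) * 100 * 1 == 25000, while a zone with
+ * 20 queued IOs at the same priority must accumulate a weight of
+ * 10 before it scores the same: (500 / 20) * 100 * 10 == 25000.
+ */
+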
+ /*
+ * If this zone has a higher priority than what we found so far,
+ * schedule it next.
+ */
+ if (pri > qbp->zq_priority) {
+ qbp->zq_zoneid = zonep->zone_id;
+ qbp->zq_priority = pri;
+ qbp->zq_wt = zonep->zone_zfs_weight;
+ }
+ return (0);
+}
+
+/*
+ * See if we need to bump a zone's zio to the head of the queue.
+ *
+ * For single-threaded synchronous workloads a zone cannot get more than
+ * 1 op into the queue at a time unless the zone is running multiple workloads
+ * in parallel. This can cause an imbalance in performance if there are zones
+ * with many parallel workloads (and ops in the queue) vs. other zones which
+ * are doing simple single-threaded workloads, such as interactive tasks in the
+ * shell. These zones can get backed up behind a deep queue and their IO
+ * performance will appear to be very poor as a result, making
+ * interactive use of the zone feel sluggish.
+ *
+ * The scheduling algorithm kicks in once we start to get a deeper queue.
+ * Once that occurs, we look at all of the zones to see which one calculates
+ * to the highest priority. We bump that zone's first zio to the head of the
+ * queue.
+ *
+ * We use a counter on the zone so that we can quickly find how many ops each
+ * zone has in the queue without having to search the entire queue itself.
+ * This scales better since the number of zones is expected to be on the
+ * order of 10-100 whereas the queue depth can be in the range of 50-2000.
+ * In addition, since the zio's in the queue only have the zoneid, we would
+ * have to look up the zone for each zio enqueued and that means the overhead
+ * for scanning the queue each time would be much higher.
+ *
+ * In all cases, we fall back to simply pulling the next op off the queue
+ * if something should go wrong.
+ */
+static zio_t *
+get_next_zio(vdev_queue_t *vq, int qdepth)
+{
+ zone_q_bump_t qbump;
+ zio_t *zp = NULL, *zphead;
+ int cnt = 0;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ /* To avoid problems with int rounding, scale the queue depth by 10 */
+ qbump.zq_qdepth = qdepth * 10;
+ qbump.zq_priority = 0;
+ qbump.zq_zoneid = 0;
+ (void) zone_walk(get_sched_pri_cb, &qbump);
+
+ zphead = avl_first(&vq->vq_deadline_tree);
+
+ /*
+ * If the scheduler picked a zone, find that zone's first zio in the
+ * queue.
+ */
+ if (qbump.zq_zoneid != 0) {
+ for (zp = avl_first(&vq->vq_deadline_tree); zp != NULL;
+ zp = avl_walk(&vq->vq_deadline_tree, zp, AVL_AFTER)) {
+ if (zp->io_zoneid == qbump.zq_zoneid)
+ break;
+ cnt++;
+ }
+ }
+
+ if (zp == NULL) {
+ zp = zphead;
+ } else if (zp != zphead) {
+ /*
+ * Only fire the probe if we actually picked a different zio
+ * than the one already at the head of the queue.
+ */
+ extern void __dtrace_probe_zfs__zone__sched__bump(uintptr_t,
+ uintptr_t, uintptr_t, uintptr_t);
+ __dtrace_probe_zfs__zone__sched__bump(
+ (uintptr_t)(zp->io_zoneid), (uintptr_t)(cnt),
+ (uintptr_t)(qbump.zq_priority), (uintptr_t)(qbump.zq_wt));
+ }
+
+ return (zp);
+}
+
+/*
+ * Add our zone ID to the zio so we can keep track of which zones are doing
+ * what, even when the current thread processing the zio is not associated
+ * with the zone (e.g. the kernel taskq which pushes out TX groups).
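+ * Note that child zios inherit io_zoneid from their parent in
+ * zio_create() (see the zio.c change below), so only parentless zios
+ * are initialized here.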
+ */
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+ zone_t *zonep = curzone;
+
+ zp->io_zoneid = zonep->zone_id;
+}
+
+/*
+ * Track IO operations per zone. Called from dmu_tx_count_write for write ops
+ * and dmu_read_uio for read ops. For each operation, increment that zone's
+ * counter based on the type of operation.
+ *
+ * There are three basic ways that we can see write ops:
+ * 1) An application does write syscalls. Those ops go into a TXG which
+ * we'll count here. Sometime later a kernel taskq thread (we'll see the
+ * vdev IO as zone 0) will perform some number of physical writes to commit
+ * the TXG to disk. Those writes are not associated with the zone which
+ * made the write syscalls and the number of operations is not correlated
+ * between the taskq and the zone.
+ * 2) An application opens a file with O_SYNC. Each write will result in
+ * an operation which we'll see here plus a low-level vdev write from
+ * that zone.
+ * 3) An application does write syscalls followed by an fsync(). We'll
+ * count the writes going into a TXG here. We'll also see some number
+ * (usually much smaller, maybe only 1) of low-level vdev writes from this
+ * zone when the fsync is performed, plus some other low-level vdev writes
+ * from the taskq in zone 0 (are these metadata writes?).
+ *
+ * In addition to the above, there are misc. system-level writes, such as
+ * writing out dirty pages to swap, or sync(2) calls, which will be handled
+ * by the global zone and which we count but don't generally worry about.
+ *
+ * Because of the above, we can see writes twice because this is called
+ * at a high level by a zone thread, but we also will count the phys. writes
+ * that are performed at a low level via zfs_zone_zio_start.
+ *
+ * Without this, it can look like a non-global zone never writes (case 1).
+ * Depending on when the TXG is flushed, the counts may be in the same sample
+ * bucket or in a different one.
+ *
+ * Tracking read operations is simpler due to their synchronous semantics. The
+ * zfs_read function -- called as a result of a read(2) syscall -- will always
+ * retrieve the data to be read through dmu_read_uio.
+ */
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+ zone_t *zonep = curzone;
+ hrtime_t unow;
+ uint16_t wait;
+
+ unow = GET_USEC_TIME;
+
+ /*
+ * Only bump the counters for logical operations here. The counters for
+ * tracking physical IO operations are handled in zfs_zone_zio_done.
+ */
+ if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) {
+ mutex_enter(&zonep->zone_stg_io_lock);
+ add_iop(zonep, unow, type, 0);
+ mutex_exit(&zonep->zone_stg_io_lock);
+ }
+
+ if (!zfs_zone_delay_enable)
+ return;
+
+ /*
+ * XXX There's a potential race here in that more than one thread may
+ * update the zone delays concurrently. The worst outcome is corruption
+ * of our data to track each zone's IO, so the algorithm may make
+ * incorrect throttling decisions until the data is refreshed.
+ */
+ if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) {
+ zfs_zone_wait_adjust(unow);
+ zfs_zone_last_checked = unow;
+ }
+
+ if ((wait = zonep->zone_io_delay) > 0) {
+ /*
+ * If this is a write and we're doing above normal TxG
+ * flushing, then throttle for longer than normal.
+ */
+ if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
+ (txg_cnt > 1 || txg_flush_rate > 1))
+ wait *= zfs_zone_txg_throttle_scale;
+
+ /*
+ * sdt:::zfs-zone-wait
+ *
+ * arg0: zone ID
+ * arg1: type of IO operation
+ * arg2: time to delay (in us)
+ */
+ extern void __dtrace_probe_zfs__zone__wait(
+ uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__wait((uintptr_t)(zonep->zone_id),
+ (uintptr_t)type, (uintptr_t)wait);
+
+ drv_usecwait(wait);
+
+ if (zonep->zone_vfs_stats != NULL) {
+ atomic_inc_64(&zonep->zone_vfs_stats->
+ zv_delay_cnt.value.ui64);
+ atomic_add_64(&zonep->zone_vfs_stats->
+ zv_delay_time.value.ui64, wait);
+ }
+ }
+}
+
+/*
+ * XXX Ignore the pool pointer parameter for now.
+ *
+ * Keep track to see if the TxG flush rate is running above the expected rate.
+ * If so, this implies that we are filling TxG's at a high rate due to a heavy
+ * write workload. We use this as input into the zone throttle.
+ *
+ * This function is called every 5 seconds (zfs_txg_timeout) under a normal
+ * write load. In this case, the flush rate is going to be 1. When there
+ * is a heavy write load, TxG's fill up fast and the sync thread will write
+ * the TxG more frequently (perhaps once a second). In this case the rate
+ * will be > 1. The flush rate is a lagging indicator since it can be up
+ * to 5 seconds old. We use the txg_cnt to keep track of the rate in the
+ * current 5 second interval and txg_flush_rate to keep track of the previous
+ * 5 second interval. In that way we don't have a period (1 or more seconds)
+ * where the txg_cnt == 0 and we cut back on throttling even though the rate
+ * is still high.
+ */
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+ uint_t now;
+
+ txg_cnt++;
+ now = (uint_t)(gethrtime() / NANOSEC);
+ if ((now - txg_last_check) >= zfs_txg_timeout) {
+ txg_flush_rate = txg_cnt / 2;
+ txg_cnt = 0;
+ txg_last_check = now;
+ }
+}
+
+int
+zfs_zone_txg_delay()
+{
+ zone_t *zonep = curzone;
+ int delay = 1;
+
+ if (zonep->zone_io_util_above_avg)
+ delay = zfs_zone_txg_delay_ticks;
+
+ extern void __dtrace_probe_zfs__zone__txg__delay(uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__txg__delay((uintptr_t)(zonep->zone_id),
+ (uintptr_t)delay);
+
+ return (delay);
+}
+
+/*
+ * Called from zio_vdev_io_start when an IO hits the end of the zio pipeline
+ * and is issued.
+ * Keep track of start time for latency calculation in zfs_zone_zio_done.
+ */
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+ zone_t *zonep;
+
+ /*
+ * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for
+ * an actual I/O operation. Ignore those operations for the purposes of
+ * throttling and scheduling.
+ */
+ if (zp->io_type == ZIO_TYPE_IOCTL)
+ return;
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ mutex_enter(&zonep->zone_zfs_lock);
+ if (zp->io_type == ZIO_TYPE_READ)
+ kstat_runq_enter(&zonep->zone_zfs_rwstats);
+ zonep->zone_zfs_weight = 0;
+ mutex_exit(&zonep->zone_zfs_lock);
+
+ mutex_enter(&zfs_disk_lock);
+ zp->io_dispatched = gethrtime();
+
+ if (zfs_disk_rcnt++ != 0)
+ zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate);
+ zfs_disk_rlastupdate = zp->io_dispatched;
+ mutex_exit(&zfs_disk_lock);
+
+ zone_rele(zonep);
+}
+
+/*
+ * Called from vdev_queue_io_done when an IO completes.
+ * Increment our counter for zone ops.
+ * Calculate the IO latency avg. for this zone.
+ */
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+ zone_t *zonep;
+ hrtime_t now, unow, udelta;
+
+ if (zp->io_type == ZIO_TYPE_IOCTL)
+ return;
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ now = gethrtime();
+ unow = NANO_TO_MICRO(now);
+ udelta = unow - NANO_TO_MICRO(zp->io_dispatched);
+
+ mutex_enter(&zonep->zone_zfs_lock);
+
+ /*
+ * To calculate the wsvc_t average, keep a cumulative sum of all the
+ * wait time before each I/O was dispatched. Since most writes are
+ * asynchronous, only track the wait time for read I/Os.
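+ * The accumulated wait time is in nanoseconds, since io_dispatched and
+ * io_start are both captured with gethrtime().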
+ */
+ if (zp->io_type == ZIO_TYPE_READ) {
+ zonep->zone_zfs_rwstats.reads++;
+ zonep->zone_zfs_rwstats.nread += zp->io_size;
+
+ zonep->zone_zfs_stats->zz_waittime.value.ui64 +=
+ zp->io_dispatched - zp->io_start;
+
+ kstat_runq_exit(&zonep->zone_zfs_rwstats);
+ } else {
+ zonep->zone_zfs_rwstats.writes++;
+ zonep->zone_zfs_rwstats.nwritten += zp->io_size;
+ }
+
+ mutex_exit(&zonep->zone_zfs_lock);
+
+ mutex_enter(&zfs_disk_lock);
+ zfs_disk_rcnt--;
+ zfs_disk_rtime += (now - zfs_disk_rlastupdate);
+ zfs_disk_rlastupdate = now;
+ mutex_exit(&zfs_disk_lock);
+
+ if (zfs_zone_delay_enable) {
+ mutex_enter(&zonep->zone_stg_io_lock);
+ add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ?
+ ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta);
+ mutex_exit(&zonep->zone_stg_io_lock);
+ }
+
+ zone_rele(zonep);
+
+ /*
+ * sdt:::zfs-zone-latency
+ *
+ * arg0: zone ID
+ * arg1: type of I/O operation
+ * arg2: I/O latency (in us)
+ */
+ extern void __dtrace_probe_zfs__zone__latency(
+ uintptr_t, uintptr_t, uintptr_t);
+
+ __dtrace_probe_zfs__zone__latency((uintptr_t)(zp->io_zoneid),
+ (uintptr_t)(zp->io_type), (uintptr_t)(udelta));
+}
+
+void
+zfs_zone_zio_dequeue(zio_t *zp)
+{
+ zone_t *zonep;
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ mutex_enter(&zonep->zone_stg_io_lock);
+ ASSERT(zonep->zone_zfs_queued > 0);
+ if (zonep->zone_zfs_queued == 0)
+ cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0");
+ else
+ zonep->zone_zfs_queued--;
+ mutex_exit(&zonep->zone_stg_io_lock);
+ zone_rele(zonep);
+}
+
+void
+zfs_zone_zio_enqueue(zio_t *zp)
+{
+ zone_t *zonep;
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ mutex_enter(&zonep->zone_stg_io_lock);
+ zonep->zone_zfs_queued++;
+ mutex_exit(&zonep->zone_stg_io_lock);
+ zone_rele(zonep);
+}
+
+/*
+ * Called from vdev_queue_io_to_issue. This function is where zio's are found
+ * at the head of the queue (by avl_first), then pulled off (by
+ * vdev_queue_io_remove) and issued. We do our scheduling here to find the
+ * next zio to issue.
+ *
+ * The vq->vq_lock mutex is held when we're executing this function so we
+ * can safely access the "last zone" variable on the queue.
+ */
+zio_t *
+zfs_zone_schedule(vdev_queue_t *vq)
+{
+ int cnt;
+ zoneid_t last_zone;
+ zio_t *zp;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ cnt = avl_numnodes(&vq->vq_deadline_tree);
+ last_zone = vq->vq_last_zone_id;
+
+ /*
+ * If there are only a few ops in the queue then just issue the head.
+ * If there are more than a few ops already queued up, then use
+ * scheduling to get the next zio.
+ */
+ if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh)
+ zp = avl_first(&vq->vq_deadline_tree);
+ else
+ zp = get_next_zio(vq, cnt);
+
+ vq->vq_last_zone_id = zp->io_zoneid;
+
+ /*
+ * Probe with 3 args; the number of IOs in the queue, the zone that
+ * was last scheduled off this queue, and the zone that was associated
+ * with the next IO that is scheduled.
+ */
+ extern void __dtrace_probe_zfs__zone__sched(uintptr_t, uintptr_t,
+ uintptr_t);
+
+ __dtrace_probe_zfs__zone__sched((uintptr_t)(cnt),
+ (uintptr_t)(last_zone), (uintptr_t)(zp->io_zoneid));
+
+ return (zp);
+}
+
+#endif
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 64e9acbae1..89c88bc181 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -36,6 +36,7 @@
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
+#include <sys/zfs_zone.h>
/*
* ==========================================================================
@@ -501,6 +502,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
bzero(zio, sizeof (zio_t));
+ zio->io_start = gethrtime();
+
mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
@@ -552,11 +555,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_bookmark = *zb;
if (pio != NULL) {
+ zio->io_zoneid = pio->io_zoneid;
if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
zio->io_gang_leader = pio->io_gang_leader;
zio_add_child(pio, zio);
+ } else {
+ zfs_zone_zio_init(zio);
}
return (zio);
@@ -894,6 +900,8 @@ zio_read_bp_init(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
+ zio->io_start = gethrtime();
+
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL &&
!(zio->io_flags & ZIO_FLAG_RAW)) {
@@ -2279,6 +2287,9 @@ zio_vdev_io_start(zio_t *zio)
ASSERT(zio->io_error == 0);
ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ zio->io_start = gethrtime();
+
if (vd == NULL) {
if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c
index c8fe20f2eb..c7dd90c45d 100644
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -77,6 +78,8 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
{{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"},
{{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256_mac"},
+ {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "noparity"},
};
enum zio_checksum
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index df9a16bccb..4dc63888fd 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -76,9 +76,11 @@
#include <sys/zfs_rlock.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/dumphdr.h>
#include <sys/zil_impl.h>
+#include <sys/sdt.h>
#include "zfs_namecheck.h"
@@ -1059,27 +1061,28 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
}
static int
-zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
- boolean_t doread, boolean_t isdump)
+zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
+ uint64_t size, boolean_t doread, boolean_t isdump)
{
vdev_disk_t *dvd;
int c;
int numerrors = 0;
- for (c = 0; c < vd->vdev_children; c++) {
- ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
- vd->vdev_ops == &vdev_replacing_ops ||
- vd->vdev_ops == &vdev_spare_ops);
- int err = zvol_dumpio_vdev(vd->vdev_child[c],
- addr, offset, size, doread, isdump);
- if (err != 0) {
- numerrors++;
- } else if (doread) {
- break;
+ if (vd->vdev_ops == &vdev_mirror_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops) {
+ for (c = 0; c < vd->vdev_children; c++) {
+ int err = zvol_dumpio_vdev(vd->vdev_child[c],
+ addr, offset, origoffset, size, doread, isdump);
+ if (err != 0) {
+ numerrors++;
+ } else if (doread) {
+ break;
+ }
}
}
- if (!vd->vdev_ops->vdev_op_leaf)
+ if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
return (numerrors < vd->vdev_children ? 0 : EIO);
if (doread && !vdev_readable(vd))
@@ -1087,19 +1090,27 @@ zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
else if (!doread && !vdev_writeable(vd))
return (EIO);
- dvd = vd->vdev_tsd;
- ASSERT3P(dvd, !=, NULL);
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ return (vdev_raidz_physio(vd,
+ addr, size, offset, origoffset, doread));
+ }
+
offset += VDEV_LABEL_START_SIZE;
if (ddi_in_panic() || isdump) {
ASSERT(!doread);
if (doread)
return (EIO);
+ dvd = vd->vdev_tsd;
+ ASSERT3P(dvd, !=, NULL);
return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
lbtodb(size)));
} else {
- return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
- doread ? B_READ : B_WRITE));
+ dvd = vd->vdev_tsd;
+ ASSERT3P(dvd, !=, NULL);
+
+ return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
+ offset, doread ? B_READ : B_WRITE));
}
}
@@ -1131,7 +1142,8 @@ zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
offset += DVA_GET_OFFSET(&ze->ze_dva);
- error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);
+ error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
+ size, doread, isdump);
if (!ddi_in_panic())
spa_config_exit(spa, SCL_STATE, FTAG);
@@ -1322,6 +1334,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
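+	/*
+	 * These static probes are observable as sdt:::zvol-uio-start and
+	 * sdt:::zvol-uio-done; the int argument distinguishes reads (0)
+	 * from writes (1), and the -done probe also reports the error.
+	 */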
+ DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0);
+
rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
RL_READER);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
@@ -1340,6 +1354,10 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
}
}
zfs_range_unlock(rl);
+
+ DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int,
+ error);
+
return (error);
}
@@ -1369,6 +1387,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1);
+
sync = !(zv->zv_flags & ZVOL_WCE) ||
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
@@ -1399,6 +1419,10 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
zfs_range_unlock(rl);
if (sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+ DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int,
+ error);
+
return (error);
}
@@ -1852,7 +1876,7 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
ZIO_COMPRESS_OFF) == 0);
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
- ZIO_CHECKSUM_OFF) == 0);
+ ZIO_CHECKSUM_NOPARITY) == 0);
if (version >= SPA_VERSION_DEDUP) {
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_DEDUP),
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index 42adb4c451..bd50364310 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 1990 Mentat Inc.
*/
@@ -2195,6 +2196,8 @@ struct ip_xmit_attr_s {
*/
ixa_notify_t ixa_notify; /* Registered upcall notify function */
void *ixa_notify_cookie; /* ULP cookie for ixa_notify */
+
+ uint_t ixa_tcpcleanup; /* Used by conn_ixa_cleanup */
};
/*
@@ -2266,6 +2269,14 @@ struct ip_xmit_attr_s {
#define IXA_FREE_TSL 0x00000002 /* ixa_tsl needs to be rele */
/*
+ * Trivial state machine used to synchronize IXA cleanup for TCP connections.
+ * See conn_ixa_cleanup().
+ */
+#define IXATC_IDLE 0x00000000
+#define IXATC_INPROGRESS 0x00000001
+#define IXATC_COMPLETE 0x00000002
+
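+/*
+ * A sketch of the lifecycle (see tcp_ixa_cleanup_getmblk(),
+ * tcp_ixa_cleanup() and tcp_ixa_cleanup_wait_and_finish() in ip_attr.c):
+ *
+ * IDLE -> INPROGRESS a thread claims the shared cleanup mblk
+ * INPROGRESS -> COMPLETE the cleanup has run inside the squeue
+ * COMPLETE -> IDLE the waiter observes the completed cleanup
+ */
+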
+/*
* Simplistic way to set the ixa_xmit_hint for locally generated traffic
* and forwarded traffic. The shift amount are based on the size of the
* structs to discard the low order bits which don't have much if any variation
@@ -3030,6 +3041,7 @@ extern vmem_t *ip_minor_arena_la;
#define ips_ip_strict_src_multihoming ips_propinfo_tbl[80].prop_cur_uval
#define ips_ipv6_strict_src_multihoming ips_propinfo_tbl[81].prop_cur_uval
#define ips_ipv6_drop_inbound_icmpv6 ips_propinfo_tbl[82].prop_cur_bval
+#define ips_ip_dce_reclaim_threshold ips_propinfo_tbl[83].prop_cur_uval
extern int dohwcksum; /* use h/w cksum if supported by the h/w */
#ifdef ZC_TEST
diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c
index 3197858f8e..e040af14ba 100644
--- a/usr/src/uts/common/inet/ip/ip_attr.c
+++ b/usr/src/uts/common/inet/ip/ip_attr.c
@@ -909,6 +909,11 @@ ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa)
*/
if (ixa->ixa_free_flags & IXA_FREE_CRED)
crhold(ixa->ixa_cred);
+
+ /*
+ * There is no cleanup in progress on this new copy.
+ */
+ ixa->ixa_tcpcleanup = IXATC_IDLE;
}
/*
@@ -1176,6 +1181,59 @@ ixa_cleanup_stale(ip_xmit_attr_t *ixa)
}
}
+static mblk_t *
+tcp_ixa_cleanup_getmblk(conn_t *connp)
+{
+ tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
+ int need_retry;
+ mblk_t *mp;
+
+ mutex_enter(&tcps->tcps_ixa_cleanup_lock);
+
+ /*
+ * It's possible that someone else came in and started cleaning up
+ * another connection between the time we verified this one is not being
+ * cleaned up and the time we actually get the shared mblk. If that's
+ * the case, we've dropped the lock, and some other thread may have
+ * cleaned up this connection again, and is still waiting for
+ * notification of that cleanup's completion. Therefore we need to
+ * recheck.
+ */
+ do {
+ need_retry = 0;
+ while (connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE) {
+ cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
+ &tcps->tcps_ixa_cleanup_lock);
+ }
+
+ while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) {
+ /*
+ * Multiple concurrent cleanups; need to have the last
+ * one run since it could be an unplumb.
+ */
+ need_retry = 1;
+ cv_wait(&tcps->tcps_ixa_cleanup_ready_cv,
+ &tcps->tcps_ixa_cleanup_lock);
+ }
+ } while (need_retry);
+
+ /*
+ * We now have the lock and the mblk; now make sure that no one else can
+ * try to clean up this connection or enqueue it for cleanup, clear the
+ * mblk pointer for this stack, drop the lock, and return the mblk.
+ */
+ ASSERT(MUTEX_HELD(&tcps->tcps_ixa_cleanup_lock));
+ ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_IDLE);
+ ASSERT(tcps->tcps_ixa_cleanup_mp == mp);
+ ASSERT(mp != NULL);
+
+ connp->conn_ixa->ixa_tcpcleanup = IXATC_INPROGRESS;
+ tcps->tcps_ixa_cleanup_mp = NULL;
+ mutex_exit(&tcps->tcps_ixa_cleanup_lock);
+
+ return (mp);
+}
+
/*
* Used to run ixa_cleanup_stale inside the tcp squeue.
* When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp
@@ -1195,11 +1253,39 @@ tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2,
mutex_enter(&tcps->tcps_ixa_cleanup_lock);
ASSERT(tcps->tcps_ixa_cleanup_mp == NULL);
+ connp->conn_ixa->ixa_tcpcleanup = IXATC_COMPLETE;
tcps->tcps_ixa_cleanup_mp = mp;
- cv_signal(&tcps->tcps_ixa_cleanup_cv);
+ cv_signal(&tcps->tcps_ixa_cleanup_ready_cv);
+ /*
+ * It is possible for any number of threads to be waiting for cleanup of
+ * different connections. Absent a per-connection (or per-IXA) CV, we
+ * need to wake them all up even though only one can be waiting on this
+ * particular cleanup.
+ */
+ cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
mutex_exit(&tcps->tcps_ixa_cleanup_lock);
}
+static void
+tcp_ixa_cleanup_wait_and_finish(conn_t *connp)
+{
+ tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
+
+ mutex_enter(&tcps->tcps_ixa_cleanup_lock);
+
+ ASSERT(connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE);
+
+ while (connp->conn_ixa->ixa_tcpcleanup == IXATC_INPROGRESS) {
+ cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
+ &tcps->tcps_ixa_cleanup_lock);
+ }
+
+ ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_COMPLETE);
+ connp->conn_ixa->ixa_tcpcleanup = IXATC_IDLE;
+ cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
+
+ mutex_exit(&tcps->tcps_ixa_cleanup_lock);
+}
/*
* ipcl_walk() function to help release any IRE, NCE, or DCEs that
@@ -1214,21 +1300,8 @@ conn_ixa_cleanup(conn_t *connp, void *arg)
if (IPCL_IS_TCP(connp)) {
mblk_t *mp;
- tcp_stack_t *tcps;
-
- tcps = connp->conn_netstack->netstack_tcp;
- mutex_enter(&tcps->tcps_ixa_cleanup_lock);
- while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) {
- /*
- * Multiple concurrent cleanups; need to have the last
- * one run since it could be an unplumb.
- */
- cv_wait(&tcps->tcps_ixa_cleanup_cv,
- &tcps->tcps_ixa_cleanup_lock);
- }
- tcps->tcps_ixa_cleanup_mp = NULL;
- mutex_exit(&tcps->tcps_ixa_cleanup_lock);
+ mp = tcp_ixa_cleanup_getmblk(connp);
if (connp->conn_sqp->sq_run == curthread) {
/* Already on squeue */
@@ -1237,15 +1310,8 @@ conn_ixa_cleanup(conn_t *connp, void *arg)
CONN_INC_REF(connp);
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup,
connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP);
-
- /* Wait until tcp_ixa_cleanup has run */
- mutex_enter(&tcps->tcps_ixa_cleanup_lock);
- while (tcps->tcps_ixa_cleanup_mp == NULL) {
- cv_wait(&tcps->tcps_ixa_cleanup_cv,
- &tcps->tcps_ixa_cleanup_lock);
- }
- mutex_exit(&tcps->tcps_ixa_cleanup_lock);
}
+ tcp_ixa_cleanup_wait_and_finish(connp);
} else if (IPCL_IS_SCTP(connp)) {
sctp_t *sctp;
sctp_faddr_t *fp;
diff --git a/usr/src/uts/common/inet/ip/ip_dce.c b/usr/src/uts/common/inet/ip/ip_dce.c
index 215bc4675f..502ee8a735 100644
--- a/usr/src/uts/common/inet/ip/ip_dce.c
+++ b/usr/src/uts/common/inet/ip/ip_dce.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -32,6 +33,7 @@
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>
+#include <sys/callb.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
@@ -102,7 +104,19 @@ static void dce_delete_locked(dcb_t *, dce_t *);
static void dce_make_condemned(dce_t *);
static kmem_cache_t *dce_cache;
+static kthread_t *dce_reclaim_thread;
+static kmutex_t dce_reclaim_lock;
+static kcondvar_t dce_reclaim_cv;
+static int dce_reclaim_shutdown;
+/* Global so it can be tuned in /etc/system. This must be a power of two. */
+uint_t ip_dce_hash_size = 1024;
+
+/* The time in seconds between executions of the IP DCE reclaim worker. */
+uint_t ip_dce_reclaim_interval = 60;
+
+/* The factor of the DCE threshold at which to start hard reclaims */
+uint_t ip_dce_reclaim_threshold_hard = 2;
/* Operates on a uint64_t */
#define RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))
@@ -117,6 +131,11 @@ dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
uint_t fraction_pmtu = fraction*4;
uint_t hash;
dce_t *dce, *nextdce;
+ hrtime_t seed = gethrtime();
+ uint_t retained = 0;
+ uint_t max = ipst->ips_ip_dce_reclaim_threshold;
+
+ max *= ip_dce_reclaim_threshold_hard;
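+
+ /*
+ * With the default _dce_reclaim_threshold of 32 and the default hard
+ * factor of 2, for example, once 64 entries in this bucket have been
+ * retained, every remaining entry is reclaimed unconditionally.
+ */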
rw_enter(&dcb->dcb_lock, RW_WRITER);
for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
@@ -132,13 +151,21 @@ dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
} else {
mutex_exit(&dce->dce_lock);
}
- hash = RANDOM_HASH((uint64_t)(uintptr_t)dce);
- if (dce->dce_flags & DCEF_PMTU) {
- if (hash % fraction_pmtu != 0)
- continue;
- } else {
- if (hash % fraction != 0)
- continue;
+
+ if (max == 0 || retained < max) {
+ hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed));
+
+ if (dce->dce_flags & DCEF_PMTU) {
+ if (hash % fraction_pmtu != 0) {
+ retained++;
+ continue;
+ }
+ } else {
+ if (hash % fraction != 0) {
+ retained++;
+ continue;
+ }
+ }
}
IP_STAT(ipst, ip_dce_reclaim_deleted);
@@ -175,17 +202,19 @@ ip_dce_reclaim_stack(ip_stack_t *ipst)
}
/*
- * Called by the memory allocator subsystem directly, when the system
- * is running low on memory.
+ * Called by dce_reclaim_worker() below, and no one else. Typically this will
+ * mean that the number of entries in the hash buckets has exceeded a tunable
+ * threshold.
*/
-/* ARGSUSED */
-void
-ip_dce_reclaim(void *args)
+static void
+ip_dce_reclaim(void)
{
netstack_handle_t nh;
netstack_t *ns;
ip_stack_t *ipst;
+ ASSERT(curthread == dce_reclaim_thread);
+
netstack_next_init(&nh);
while ((ns = netstack_next(&nh)) != NULL) {
/*
@@ -196,26 +225,75 @@ ip_dce_reclaim(void *args)
netstack_rele(ns);
continue;
}
- ip_dce_reclaim_stack(ipst);
+ if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0)
+ ip_dce_reclaim_stack(ipst);
netstack_rele(ns);
}
netstack_next_fini(&nh);
}
+/* ARGSUSED */
+static void
+dce_reclaim_worker(void *arg)
+{
+ callb_cpr_t cprinfo;
+
+ CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr,
+ "dce_reclaim_worker");
+
+ mutex_enter(&dce_reclaim_lock);
+ while (!dce_reclaim_shutdown) {
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ (void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock,
+ ddi_get_lbolt() + ip_dce_reclaim_interval * hz);
+ CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock);
+
+ if (dce_reclaim_shutdown)
+ break;
+
+ mutex_exit(&dce_reclaim_lock);
+ ip_dce_reclaim();
+ mutex_enter(&dce_reclaim_lock);
+ }
+
+ ASSERT(MUTEX_HELD(&dce_reclaim_lock));
+ dce_reclaim_thread = NULL;
+ dce_reclaim_shutdown = 0;
+ cv_broadcast(&dce_reclaim_cv);
+ CALLB_CPR_EXIT(&cprinfo); /* drops the lock */
+
+ thread_exit();
+}
+
void
dce_g_init(void)
{
dce_cache = kmem_cache_create("dce_cache",
- sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0);
+ sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL);
+
+ dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker,
+ NULL, 0, &p0, TS_RUN, minclsyspri);
}
void
dce_g_destroy(void)
{
+ mutex_enter(&dce_reclaim_lock);
+ dce_reclaim_shutdown = 1;
+ cv_signal(&dce_reclaim_cv);
+ while (dce_reclaim_thread != NULL)
+ cv_wait(&dce_reclaim_cv, &dce_reclaim_lock);
+ mutex_exit(&dce_reclaim_lock);
+
+ cv_destroy(&dce_reclaim_cv);
+ mutex_destroy(&dce_reclaim_lock);
+
kmem_cache_destroy(dce_cache);
}
-
/*
* Allocate a default DCE and a hash table for per-IP address DCEs
*/
@@ -234,7 +312,7 @@ dce_stack_init(ip_stack_t *ipst)
ipst->ips_dce_default->dce_ipst = ipst;
/* This must be a power of two since we are using IRE_ADDR_HASH macro */
- ipst->ips_dce_hashsize = 256;
+ ipst->ips_dce_hashsize = ip_dce_hash_size;
ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
sizeof (dcb_t), KM_SLEEP);
ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
@@ -414,6 +492,12 @@ dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
dcb = &ipst->ips_dce_hash_v4[hash];
+ /*
+ * Assuming that we get fairly even distribution across all of the
+ * buckets, once one bucket is overly full, prune the whole cache.
+ */
+ if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
+ atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
rw_enter(&dcb->dcb_lock, RW_WRITER);
for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
if (dce->dce_v4addr == dst) {
@@ -447,6 +531,7 @@ dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
dce->dce_ptpn = &dcb->dcb_dce;
dcb->dcb_dce = dce;
dce->dce_bucket = dcb;
+ atomic_add_32(&dcb->dcb_cnt, 1);
dce_refhold(dce); /* For the caller */
rw_exit(&dcb->dcb_lock);
@@ -476,6 +561,12 @@ dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
dcb = &ipst->ips_dce_hash_v6[hash];
+ /*
+ * Assuming that we get fairly even distribution across all of the
+ * buckets, once one bucket is overly full, prune the whole cache.
+ */
+ if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
+ atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
rw_enter(&dcb->dcb_lock, RW_WRITER);
for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
diff --git a/usr/src/uts/common/inet/ip/ip_tunables.c b/usr/src/uts/common/inet/ip/ip_tunables.c
index 516d6c1a21..1e249b493e 100644
--- a/usr/src/uts/common/inet/ip/ip_tunables.c
+++ b/usr/src/uts/common/inet/ip/ip_tunables.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -908,6 +909,11 @@ mod_prop_info_t ip_propinfo_tbl[] = {
#else
{ "", 0, NULL, NULL, {0}, {0} },
#endif
+
+ { "_dce_reclaim_threshold", MOD_PROTO_IP,
+ mod_set_uint32, mod_get_uint32,
+ {1, 100000, 32}, {32} },
+
{ "mtu", MOD_PROTO_IPV4, NULL, ip_get_mtu, {0}, {0} },
{ "mtu", MOD_PROTO_IPV6, NULL, ip_get_mtu, {0}, {0} },
diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c
index 47972a8c1a..96a0457678 100644
--- a/usr/src/uts/common/inet/ip/ipsecesp.c
+++ b/usr/src/uts/common/inet/ip/ipsecesp.c
@@ -234,8 +234,7 @@ esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid)
{
espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat",
"net", KSTAT_TYPE_NAMED,
- sizeof (esp_kstats_t) / sizeof (kstat_named_t),
- KSTAT_FLAG_PERSISTENT, stackid);
+ sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid);
if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL)
return (B_FALSE);
diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h
index a564376cfb..706752b236 100644
--- a/usr/src/uts/common/inet/ip_stack.h
+++ b/usr/src/uts/common/inet/ip_stack.h
@@ -269,6 +269,7 @@ struct ip_stack {
uint_t ips_dce_hashsize;
struct dcb_s *ips_dce_hash_v4;
struct dcb_s *ips_dce_hash_v6;
+ uint_t ips_dce_reclaim_needed;
/* pending binds */
mblk_t *ips_ip6_asp_pending_ops;
diff --git a/usr/src/uts/common/inet/ipd/ipd.c b/usr/src/uts/common/inet/ipd/ipd.c
new file mode 100644
index 0000000000..a6a09b043e
--- /dev/null
+++ b/usr/src/uts/common/inet/ipd/ipd.c
@@ -0,0 +1,1226 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * ipd: Internet packet disturber
+ *
+ * The purpose of ipd is to simulate congested and lossy networks when they
+ * don't actually exist. The features of these congested and lossy networks are
+ * events that end up leading to retransmits and thus kicking us out of the
+ * TCP/IP fastpath. Since normally this would require us to have an actually
+ * congested network, which can be problematic, we instead simulate this
+ * behavior.
+ *
+ * 1. ipd's operations and restrictions
+ *
+ * ipd currently has facilities to cause IP traffic to be:
+ *
+ * - Corrupted with some probability.
+ * - Delayed for a set number of microseconds.
+ * - Dropped with some probability.
+ *
+ * Each of these features is enabled on a per-zone basis. The current
+ * implementation restricts this specifically to exclusive stack zones.
+ * Enabling ipd on a given zone causes pfhooks to be installed for that zone's
+ * netstack. Because of the nature of ipd, it currently only supports exclusive
+ * stack zones and as a further restriction, it only allows the global zone
+ * administrative access. ipd can be enabled for the global zone, but doing so
+ * will cause all shared-stack zones to also be affected.
+ *
+ * 2. General architecture and Locking
+ *
+ * ipd consists of a few components. There is a per-netstack data structure that
+ * is created and destroyed with the creation and destruction of each exclusive
+ * stack zone. Each of these netstacks is stored in a global list which is
+ * accessed for control of ipd via ioctls. The following diagram touches on the
+ * data structures that are used throughout ipd.
+ *
+ * ADMINISTRATIVE DATA PATH
+ *
+ * +--------+ +------+ +------+
+ * | ipdadm | | ip | | nics |
+ * +--------+ +------+ +------+
+ * | ^ | |
+ * | | ioctl(2) | |
+ * V | V V
+ * +----------+ +-------------------------+
+ * | /dev/ipd | | pfhooks packet callback | == ipd_hook()
+ * +----------+ +-------------------------+
+ * | |
+ * | |
+ * V |
+ * +----------------+ |
+ * | list_t ipd_nsl |------+ |
+ * +----------------+ | |
+ * | |
+ * V per netstack V
+ * +----------------------------+
+ * | ipd_nestack_t |
+ * +----------------------------+
+ *
+ * ipd has two different entry points, one is administrative, the other is the
+ * data path. The administrative path is accessed by a userland component called
+ * ipdadm(1M). It communicates to the kernel component via ioctls to /dev/ipd.
+ * If the administrative path enables a specific zone, then the data path will
+ * become active for that zone. Any packet that leaves that zone's IP stack or
+ * is going to enter it, comes through the callback specified in the hook_t(9S)
+ * structure. This will cause each packet to go through ipd_hook().
+ *
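+ * As a purely hypothetical example of the administrative path, an
+ * operator in the global zone might run something like
+ * "ipdadm -z myzone delay 1000" to delay that zone's packets by
+ * 1000us; the authoritative syntax is described in ipdadm(1M).
+ *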
+ * While the locking inside of ipd should be straightforward, unfortunately, the
+ * pfhooks subsystem necessarily complicates this a little bit. There are
+ * currently three different sets of locks in ipd.
+ *
+ * - Global lock N on the netstack list.
+ * - Global lock A on the active count.
+ * - Per-netstack data structure lock Z.
+ *
+ * # Locking rules
+ *
+ * L.1a N must always be acquired first and released last
+ *
+ * If you need to acquire the netstack list lock, either for reading or writing,
+ * then N must be acquired first and before any other locks. It may not be
+ * dropped before any other lock.
+ *
+ * L.1b N must only be acquired from the administrative path and zone creation,
+ * shutdown, and destruct callbacks.
+ *
+ * The data path, e.g. receiving the per-packet callbacks, should never be
+ * grabbing the list lock. If it is, then the architecture here needs to be
+ * reconsidered.
+ *
+ * L.2 Z cannot be held across calls to the pfhooks subsystem if packet hooks
+ * are active.
+ *
+ * The way the pfhooks subsystem is designed is that a reference count is
+ * present on the hook_t while it is active. As long as that reference count is
+ * non-zero, a call to net_hook_unregister will block until it is lowered.
+ * Because the callbacks want the same lock for the netstack that is held by the
+ * administrative path calling into net_hook_unregister, we deadlock.
+ *
+ * ioctl from ipdadm remove hook_t cb (from nic) hook_t cb (from IP)
+ * ----------------------- -------------------- -------------------
+ * | | |
+ * | bump hook_t refcount |
+ * mutex_enter(ipd_nsl_lock); enter ipd_hook() bump hook_t refcount
+ * mutex acquired mutex_enter(ins->ipdn_lock); |
+ * | mutex acquired enter ipd_hook()
+ * mutex_enter(ins->ipdn_lock); | mutex_enter(ins->ipdn_lock);
+ * | | |
+ * | | |
+ * | mutex_exit(ins->ipdn_lock); |
+ * | | |
+ * mutex acquired leave ipd_hook() |
+ * | decrement hook_t refcount |
+ * | | |
+ * ipd_teardown_hooks() | |
+ * net_hook_unregister() | |
+ * cv_wait() if refcount | |
+ * | | |
+ * ---------------------------------------------------------------------------
+ *
+ * At this point, we can see that the second hook callback still doesn't have
+ * the mutex, but it has bumped the hook_t refcount. However, it will never
+ * acquire the mutex that it needs to finish its operation and decrement the
+ * refcount.
+ *
+ * Obviously, deadlocking is not acceptable, thus the following corollary to the
+ * second locking rule:
+ *
+ * L.2 Corollary: If Z is being released across a call to the pfhooks subsystem,
+ * N must be held.
+ *
+ * There is currently only one path where we have to worry about this. That is
+ * when we are removing a hook, but the zone is not being shutdown, then hooks
+ * are currently active. The only place that this currently happens is in
+ * ipd_check_hooks().
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/stat.h>
+#include <sys/cmn_err.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/modctl.h>
+#include <sys/kstat.h>
+#include <sys/neti.h>
+#include <sys/list.h>
+#include <sys/ksynch.h>
+#include <sys/sysmacros.h>
+#include <sys/policy.h>
+#include <sys/atomic.h>
+#include <sys/model.h>
+#include <sys/strsun.h>
+
+#include <sys/netstack.h>
+#include <sys/hook.h>
+#include <sys/hook_event.h>
+
+#include <sys/ipd.h>
+
+#define IPDN_STATUS_DISABLED 0x1
+#define IPDN_STATUS_ENABLED 0x2
+#define IPDN_STATUS_CONDEMNED 0x4
+
+/*
+ * These flags are used to determine whether or not the hooks are registered.
+ */
+#define IPDN_HOOK_NONE 0x0
+#define IPDN_HOOK_V4IN 0x1
+#define IPDN_HOOK_V4OUT 0x2
+#define IPDN_HOOK_V6IN 0x4
+#define IPDN_HOOK_V6OUT 0x8
+#define IPDN_HOOK_ALL 0xf
+
+/*
+ * Per-netstack kstats.
+ */
+typedef struct ipd_nskstat {
+ kstat_named_t ink_ndrops;
+ kstat_named_t ink_ncorrupts;
+ kstat_named_t ink_ndelays;
+} ipd_nskstat_t;
+
+/*
+ * Different parts of this structure have different locking semantics. The list
+ * node is not normally referenced; if it is, one has to hold the ipd_nsl_lock.
+ * The following members are read only: ipdn_netid and ipdn_zoneid. The members
+ * of the kstat structure are always accessible in the data path, but the
+ * counters must be bumped with atomic operations. The ipdn_lock protects every
+ * other aspect of this structure. Please see the big theory statement on the
+ * requirements for lock ordering.
+ */
+typedef struct ipd_netstack {
+ list_node_t ipdn_link; /* link on ipd_nsl */
+ netid_t ipdn_netid; /* netstack id */
+ zoneid_t ipdn_zoneid; /* zone id */
+ kstat_t *ipdn_kstat; /* kstat_t ptr */
+ ipd_nskstat_t ipdn_ksdata; /* kstat data */
+ kmutex_t ipdn_lock; /* protects following members */
+ int ipdn_status; /* status flags */
+ net_handle_t ipdn_v4hdl; /* IPv4 net handle */
+ net_handle_t ipdn_v6hdl; /* IPv6 net handle */
+ int ipdn_hooked; /* are hooks registered */
+ hook_t *ipdn_v4in; /* IPv4 traffic in hook */
+ hook_t *ipdn_v4out; /* IPv4 traffic out hook */
+ hook_t *ipdn_v6in; /* IPv6 traffic in hook */
+ hook_t *ipdn_v6out; /* IPv6 traffic out hook */
+ int ipdn_enabled; /* which perturbs are on */
+ int ipdn_corrupt; /* corrupt percentage */
+ int ipdn_drop; /* drop percentage */
+ uint_t ipdn_delay; /* delay us */
+ long ipdn_rand; /* random seed */
+} ipd_netstack_t;
+
+/*
+ * ipd internal variables
+ */
+static dev_info_t *ipd_devi; /* device info */
+static net_instance_t *ipd_neti; /* net_instance for hooks */
+static unsigned int ipd_max_delay = IPD_MAX_DELAY; /* max delay in us */
+static kmutex_t ipd_nsl_lock; /* lock for the netstack list */
+static list_t ipd_nsl; /* list of netstacks */
+static kmutex_t ipd_nactive_lock; /* lock for nactive */
+static unsigned int ipd_nactive; /* number of active netstacks */
+static int ipd_nactive_fudge = 4; /* amount to fudge by in list */
+
+/*
+ * Note that this random number implementation is based upon the old BSD 4.1
+ * rand. It's good enough for us!
+ */
+static int
+ipd_nextrand(ipd_netstack_t *ins)
+{
+ ins->ipdn_rand = ins->ipdn_rand * 1103515245L + 12345;
+ return (ins->ipdn_rand & 0x7fffffff);
+}
+
+static void
+ipd_ksbump(kstat_named_t *nkp)
+{
+ atomic_inc_64(&nkp->value.ui64);
+}
+
+/*
+ * This is where all the magic actually happens. The way that this works is we
+ * grab the ins lock to basically get a copy of all the data that we need to do
+ * our job and then let it go to minimize contention. In terms of actual work on
+ * the packet we do them in the following order:
+ *
+ * - drop
+ * - delay
+ * - corrupt
+ */
+/*ARGSUSED*/
+static int
+ipd_hook(hook_event_token_t event, hook_data_t data, void *arg)
+{
+ unsigned char *crp;
+ int dwait, corrupt, drop, rand, off, status;
+ mblk_t *mbp;
+ ipd_netstack_t *ins = arg;
+ hook_pkt_event_t *pkt = (hook_pkt_event_t *)data;
+
+ mutex_enter(&ins->ipdn_lock);
+ status = ins->ipdn_status;
+ dwait = ins->ipdn_delay;
+ corrupt = ins->ipdn_corrupt;
+ drop = ins->ipdn_drop;
+ rand = ipd_nextrand(ins);
+ mutex_exit(&ins->ipdn_lock);
+
+ /*
+ * This probably cannot happen, but we'll do an extra guard just in
+ * case.
+ */
+ if (status & IPDN_STATUS_CONDEMNED)
+ return (0);
+
+ if (drop != 0 && rand % 100 < drop) {
+ freemsg(*pkt->hpe_mp);
+ *pkt->hpe_mp = NULL;
+ pkt->hpe_mb = NULL;
+ pkt->hpe_hdr = NULL;
+ ipd_ksbump(&ins->ipdn_ksdata.ink_ndrops);
+
+ return (1);
+ }
+
+ if (dwait != 0) {
+ if (dwait < TICK_TO_USEC(1))
+ drv_usecwait(dwait);
+ else
+ delay(drv_usectohz(dwait));
+ ipd_ksbump(&ins->ipdn_ksdata.ink_ndelays);
+ }
+
+ if (corrupt != 0 && rand % 100 < corrupt) {
+ /*
+ * Since we're corrupting the mblk, just corrupt everything in
+ * the chain. While we could corrupt the entire packet, that's a
+ * little strong. Instead we're going to just change one of the
+ * bytes in each mblock.
+ */
+ mbp = *pkt->hpe_mp;
+ while (mbp != NULL) {
+ if (mbp->b_wptr == mbp->b_rptr) {
+ mbp = mbp->b_cont;
+ continue;
+ }
+
+ /*
+ * While pfhooks probably won't send us anything else,
+ * let's just be extra careful. The stack probably isn't
+ * as resilient to corruption of control messages.
+ */
+ if (DB_TYPE(mbp) != M_DATA) {
+ mbp = mbp->b_cont;
+ continue;
+ }
+
+ off = rand % ((uintptr_t)mbp->b_wptr -
+ (uintptr_t)mbp->b_rptr);
+ crp = mbp->b_rptr + off;
+ off = rand % 8;
+ *crp = *crp ^ (1 << off);
+
+ mbp = mbp->b_cont;
+ }
+ ipd_ksbump(&ins->ipdn_ksdata.ink_ncorrupts);
+ }
+
+ return (0);
+}
+
+/*
+ * Sets up and registers all the proper hooks needed for the netstack to capture
+ * packets. Callers are assumed to already be holding the ipd_netstack_t's lock.
+ * If there is a failure in setting something up, it is the responsibility of
+ * this function to clean it up. Once this function has been called, it should
+ * not be called until a corresponding call to tear down the hooks has been
+ * done.
+ */
+static int
+ipd_setup_hooks(ipd_netstack_t *ins)
+{
+ ASSERT(MUTEX_HELD(&ins->ipdn_lock));
+ ins->ipdn_v4hdl = net_protocol_lookup(ins->ipdn_netid, NHF_INET);
+ if (ins->ipdn_v4hdl == NULL)
+ goto cleanup;
+
+ ins->ipdn_v6hdl = net_protocol_lookup(ins->ipdn_netid, NHF_INET6);
+ if (ins->ipdn_v6hdl == NULL)
+ goto cleanup;
+
+ ins->ipdn_v4in = hook_alloc(HOOK_VERSION);
+ if (ins->ipdn_v4in == NULL)
+ goto cleanup;
+
+ ins->ipdn_v4in->h_flags = 0;
+ ins->ipdn_v4in->h_hint = HH_NONE;
+ ins->ipdn_v4in->h_hintvalue = 0;
+ ins->ipdn_v4in->h_func = ipd_hook;
+ ins->ipdn_v4in->h_arg = ins;
+ ins->ipdn_v4in->h_name = "ipd IPv4 in";
+
+ if (net_hook_register(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
+ ins->ipdn_v4in) != 0)
+ goto cleanup;
+ ins->ipdn_hooked |= IPDN_HOOK_V4IN;
+
+ ins->ipdn_v4out = hook_alloc(HOOK_VERSION);
+ if (ins->ipdn_v4out == NULL)
+ goto cleanup;
+ ins->ipdn_v4out->h_flags = 0;
+ ins->ipdn_v4out->h_hint = HH_NONE;
+ ins->ipdn_v4out->h_hintvalue = 0;
+ ins->ipdn_v4out->h_func = ipd_hook;
+ ins->ipdn_v4out->h_arg = ins;
+ ins->ipdn_v4out->h_name = "ipd IPv4 out";
+
+ if (net_hook_register(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
+ ins->ipdn_v4out) != 0)
+ goto cleanup;
+ ins->ipdn_hooked |= IPDN_HOOK_V4OUT;
+
+ ins->ipdn_v6in = hook_alloc(HOOK_VERSION);
+ if (ins->ipdn_v6in == NULL)
+ goto cleanup;
+ ins->ipdn_v6in->h_flags = 0;
+ ins->ipdn_v6in->h_hint = HH_NONE;
+ ins->ipdn_v6in->h_hintvalue = 0;
+ ins->ipdn_v6in->h_func = ipd_hook;
+ ins->ipdn_v6in->h_arg = ins;
+ ins->ipdn_v6in->h_name = "ipd IPv6 in";
+
+ if (net_hook_register(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
+ ins->ipdn_v6in) != 0)
+ goto cleanup;
+ ins->ipdn_hooked |= IPDN_HOOK_V6IN;
+
+ ins->ipdn_v6out = hook_alloc(HOOK_VERSION);
+ if (ins->ipdn_v6out == NULL)
+ goto cleanup;
+ ins->ipdn_v6out->h_flags = 0;
+ ins->ipdn_v6out->h_hint = HH_NONE;
+ ins->ipdn_v6out->h_hintvalue = 0;
+ ins->ipdn_v6out->h_func = ipd_hook;
+ ins->ipdn_v6out->h_arg = ins;
+ ins->ipdn_v6out->h_name = "ipd IPv6 out";
+
+ if (net_hook_register(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
+ ins->ipdn_v6out) != 0)
+ goto cleanup;
+ ins->ipdn_hooked |= IPDN_HOOK_V6OUT;
+ mutex_enter(&ipd_nactive_lock);
+ ipd_nactive++;
+ mutex_exit(&ipd_nactive_lock);
+
+ return (0);
+
+cleanup:
+ if (ins->ipdn_hooked & IPDN_HOOK_V6OUT)
+ (void) net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
+ ins->ipdn_v6out);
+
+ if (ins->ipdn_hooked & IPDN_HOOK_V6IN)
+ (void) net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
+ ins->ipdn_v6in);
+
+ if (ins->ipdn_hooked & IPDN_HOOK_V4OUT)
+ (void) net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
+ ins->ipdn_v4out);
+
+ if (ins->ipdn_hooked & IPDN_HOOK_V4IN)
+ (void) net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
+ ins->ipdn_v4in);
+
+ ins->ipdn_hooked = IPDN_HOOK_NONE;
+
+ if (ins->ipdn_v6out != NULL)
+ hook_free(ins->ipdn_v6out);
+
+ if (ins->ipdn_v6in != NULL)
+ hook_free(ins->ipdn_v6in);
+
+ if (ins->ipdn_v4out != NULL)
+ hook_free(ins->ipdn_v4out);
+
+ if (ins->ipdn_v4in != NULL)
+ hook_free(ins->ipdn_v4in);
+
+ if (ins->ipdn_v6hdl != NULL)
+ (void) net_protocol_release(ins->ipdn_v6hdl);
+
+ if (ins->ipdn_v4hdl != NULL)
+ (void) net_protocol_release(ins->ipdn_v4hdl);
+
+ return (1);
+}
+
+static void
+ipd_teardown_hooks(ipd_netstack_t *ins)
+{
+ ASSERT(ins->ipdn_hooked == IPDN_HOOK_ALL);
+ VERIFY(net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
+ ins->ipdn_v6out) == 0);
+ VERIFY(net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
+ ins->ipdn_v6in) == 0);
+ VERIFY(net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
+ ins->ipdn_v4out) == 0);
+ VERIFY(net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
+ ins->ipdn_v4in) == 0);
+
+ ins->ipdn_hooked = IPDN_HOOK_NONE;
+
+ hook_free(ins->ipdn_v6out);
+ hook_free(ins->ipdn_v6in);
+ hook_free(ins->ipdn_v4out);
+ hook_free(ins->ipdn_v4in);
+
+ VERIFY(net_protocol_release(ins->ipdn_v6hdl) == 0);
+ VERIFY(net_protocol_release(ins->ipdn_v4hdl) == 0);
+
+ mutex_enter(&ipd_nactive_lock);
+ ipd_nactive--;
+ mutex_exit(&ipd_nactive_lock);
+}
+
+static int
+ipd_check_hooks(ipd_netstack_t *ins, int type, boolean_t enable)
+{
+ int olden, rval;
+ olden = ins->ipdn_enabled;
+
+ if (enable)
+ ins->ipdn_enabled |= type;
+ else
+ ins->ipdn_enabled &= ~type;
+
+ /*
+ * If no hooks were previously enabled and one is now needed, set them up.
+ */
+ if (olden == 0 && ins->ipdn_enabled != 0) {
+ rval = ipd_setup_hooks(ins);
+ if (rval != 0) {
+ ins->ipdn_enabled &= ~type;
+ ASSERT(ins->ipdn_enabled == 0);
+ return (rval);
+ }
+
+ return (0);
+ }
+
+ if (olden != 0 && ins->ipdn_enabled == 0) {
+ ASSERT(olden != 0);
+
+ /*
+ * We have to drop the lock here, lest we cause a deadlock.
+ * Unfortunately, there may be hooks that are running and are
+ * actively in flight and we have to call the unregister
+ * function. Due to the hooks framework, if there is an inflight
+ * hook (most likely right now), and we are holding the
+ * netstack's lock, those hooks will never return. This is
+ * unfortunate.
+ *
+ * Because we only come into this path holding the list lock, we
+ * know that the only way that someone else can come in and get to
+ * this structure is via the hook callbacks which are going to
+ * only be doing reads. They'll also see that everything has
+ * been disabled and return. So while this is unfortunate, it
+ * should be relatively safe.
+ */
+ mutex_exit(&ins->ipdn_lock);
+ ipd_teardown_hooks(ins);
+ mutex_enter(&ins->ipdn_lock);
+ return (0);
+ }
+
+ /*
+ * Otherwise, nothing should have changed here.
+ */
+ ASSERT((olden == 0) == (ins->ipdn_enabled == 0));
+ return (0);
+}
+
+static int
+ipd_toggle_corrupt(ipd_netstack_t *ins, int percent)
+{
+ int rval;
+
+ ASSERT(MUTEX_HELD(&ins->ipdn_lock));
+
+ if (percent < 0 || percent > 100)
+ return (ERANGE);
+
+ /*
+ * If we've been asked to set the value to a value that we already have,
+ * great, then we're done.
+ */
+ if (percent == ins->ipdn_corrupt)
+ return (0);
+
+ ins->ipdn_corrupt = percent;
+ rval = ipd_check_hooks(ins, IPD_CORRUPT, percent != 0);
+
+ /*
+ * If ipd_check_hooks failed, that must mean that we failed to set up
+ * the hooks, so we are going to effectively zero out and fail the
+ * request to enable corruption.
+ */
+ if (rval != 0)
+ ins->ipdn_corrupt = 0;
+
+ return (rval);
+}
+
+static int
+ipd_toggle_delay(ipd_netstack_t *ins, uint32_t delay)
+{
+ int rval;
+
+ ASSERT(MUTEX_HELD(&ins->ipdn_lock));
+
+ if (delay > ipd_max_delay)
+ return (ERANGE);
+
+ /*
+ * If we've been asked to set the value to a value that we already have,
+ * great, then we're done.
+ */
+ if (delay == ins->ipdn_delay)
+ return (0);
+
+ ins->ipdn_delay = delay;
+ rval = ipd_check_hooks(ins, IPD_DELAY, delay != 0);
+
+ /*
+ * If ipd_check_hooks failed, that must mean that we failed to set up
+ * the hooks, so we are going to effectively zero out and fail the
+ * request to enable the delay.
+ */
+ if (rval != 0)
+ ins->ipdn_delay = 0;
+
+ return (rval);
+}
+
+static int
+ipd_toggle_drop(ipd_netstack_t *ins, int percent)
+{
+ int rval;
+
+ ASSERT(MUTEX_HELD(&ins->ipdn_lock));
+
+ if (percent < 0 || percent > 100)
+ return (ERANGE);
+
+ /*
+ * If we've been asked to set the value to a value that we already have,
+ * great, then we're done.
+ */
+ if (percent == ins->ipdn_drop)
+ return (0);
+
+ ins->ipdn_drop = percent;
+ rval = ipd_check_hooks(ins, IPD_DROP, percent != 0);
+
+ /*
+ * If ipd_check_hooks failed, that must mean that we failed to set up
+ * the hooks, so we are going to effectively zero out and fail the
+ * request to enable the drop.
+ */
+ if (rval != 0)
+ ins->ipdn_drop = 0;
+
+ return (rval);
+}
+
+static int
+ipd_ioctl_perturb(ipd_ioc_perturb_t *ipi, cred_t *cr, intptr_t cmd)
+{
+ zoneid_t zid;
+ ipd_netstack_t *ins;
+ int rval = 0;
+
+ /*
+ * If the zone that we're coming from is not the GZ, then we ignore the
+ * zoneid that was passed in and instead use that of the caller. If the
+ * caller is in the GZ, then we leave the passed-in zoneid untouched.
+ */
+ zid = crgetzoneid(cr);
+ if (zid != GLOBAL_ZONEID)
+ ipi->ipip_zoneid = zid;
+
+ if (zoneid_to_netstackid(ipi->ipip_zoneid) == GLOBAL_NETSTACKID &&
+ zid != GLOBAL_ZONEID)
+ return (EPERM);
+
+ /*
+ * We need to hold the ipd_nsl_lock throughout the entire operation,
+ * otherwise someone else could come in and remove us from the list and
+ * free us, e.g. the netstack destroy handler. By holding the lock, we
+ * stop it from being able to do anything wrong.
+ */
+ mutex_enter(&ipd_nsl_lock);
+ for (ins = list_head(&ipd_nsl); ins != NULL;
+ ins = list_next(&ipd_nsl, ins)) {
+ if (ins->ipdn_zoneid == ipi->ipip_zoneid)
+ break;
+ }
+
+ if (ins == NULL) {
+ mutex_exit(&ipd_nsl_lock);
+ return (EINVAL);
+ }
+
+ mutex_enter(&ins->ipdn_lock);
+
+ if (ins->ipdn_status & IPDN_STATUS_CONDEMNED) {
+ rval = ESHUTDOWN;
+ goto cleanup;
+ }
+
+ switch (cmd) {
+ case IPDIOC_CORRUPT:
+ rval = ipd_toggle_corrupt(ins, ipi->ipip_arg);
+ break;
+ case IPDIOC_DELAY:
+ rval = ipd_toggle_delay(ins, ipi->ipip_arg);
+ break;
+ case IPDIOC_DROP:
+ rval = ipd_toggle_drop(ins, ipi->ipip_arg);
+ break;
+ }
+
+cleanup:
+ mutex_exit(&ins->ipdn_lock);
+ mutex_exit(&ipd_nsl_lock);
+ return (rval);
+}
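+
+/*
+ * A hypothetical userland sketch (not part of this change) of how the
+ * perturbation ioctls above are driven, assuming the ipd_ioc_perturb_t
+ * definition is exported to userland and that the minor node created in
+ * ipd_attach() surfaces as /dev/ipd:
+ *
+ *	ipd_ioc_perturb_t p;
+ *	int fd = open("/dev/ipd", O_RDWR);
+ *
+ *	p.ipip_zoneid = getzoneid();	(ignored unless we're in the GZ)
+ *	p.ipip_arg = 25;		(corrupt 25% of packets)
+ *	if (fd == -1 || ioctl(fd, IPDIOC_CORRUPT, &p) != 0)
+ *		err(1, "IPDIOC_CORRUPT");
+ */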
+
+static int
+ipd_ioctl_remove(ipd_ioc_perturb_t *ipi, cred_t *cr)
+{
+ zoneid_t zid;
+ ipd_netstack_t *ins;
+ int rval = 0;
+
+ /*
+ * See ipd_ioctl_perturb for the rationale here.
+ */
+ zid = crgetzoneid(cr);
+ if (zid != GLOBAL_ZONEID)
+ ipi->ipip_zoneid = zid;
+
+ if (zoneid_to_netstackid(ipi->ipip_zoneid) == GLOBAL_NETSTACKID &&
+ zid != GLOBAL_ZONEID)
+ return (EPERM);
+
+ mutex_enter(&ipd_nsl_lock);
+ for (ins = list_head(&ipd_nsl); ins != NULL;
+ ins = list_next(&ipd_nsl, ins)) {
+ if (ins->ipdn_zoneid == ipi->ipip_zoneid)
+ break;
+ }
+
+ if (ins == NULL) {
+ mutex_exit(&ipd_nsl_lock);
+ return (EINVAL);
+ }
+
+ mutex_enter(&ins->ipdn_lock);
+
+ /*
+ * If this is condemned, that means it's very shortly going to be torn
+ * down. In that case, there's no reason to actually do anything here,
+ * as it will all be done rather shortly in the destroy function.
+ * Furthermore, because condemned corresponds with it having hit
+ * shutdown, we know that no more packets can be received by this
+ * netstack. All this translates to a no-op.
+ */
+ if (ins->ipdn_status & IPDN_STATUS_CONDEMNED) {
+ rval = 0;
+ goto cleanup;
+ }
+
+ rval = EINVAL;
+ /*
+ * Go through and disable the requested pieces. We can safely ignore the
+ * return value of ipd_check_hooks because the removal case should never
+ * fail; we verify that in the hook teardown path.
+ */
+ if (ipi->ipip_arg & IPD_CORRUPT) {
+ ins->ipdn_corrupt = 0;
+ (void) ipd_check_hooks(ins, IPD_CORRUPT, B_FALSE);
+ rval = 0;
+ }
+
+ if (ipi->ipip_arg & IPD_DELAY) {
+ ins->ipdn_delay = 0;
+ (void) ipd_check_hooks(ins, IPD_DELAY, B_FALSE);
+ rval = 0;
+ }
+
+ if (ipi->ipip_arg & IPD_DROP) {
+ ins->ipdn_drop = 0;
+ (void) ipd_check_hooks(ins, IPD_DROP, B_FALSE);
+ rval = 0;
+ }
+
+cleanup:
+ mutex_exit(&ins->ipdn_lock);
+ mutex_exit(&ipd_nsl_lock);
+ return (rval);
+}
+
+static int
+ipd_ioctl_info(ipd_ioc_info_t *ipi, cred_t *cr)
+{
+ zoneid_t zid;
+ ipd_netstack_t *ins;
+
+ /*
+ * See ipd_ioctl_perturb for the rationale here.
+ */
+ zid = crgetzoneid(cr);
+ if (zid != GLOBAL_ZONEID)
+ ipi->ipii_zoneid = zid;
+
+ if (zoneid_to_netstackid(ipi->ipii_zoneid) == GLOBAL_NETSTACKID &&
+ zid != GLOBAL_ZONEID)
+ return (EPERM);
+
+ mutex_enter(&ipd_nsl_lock);
+ for (ins = list_head(&ipd_nsl); ins != NULL;
+ ins = list_next(&ipd_nsl, ins)) {
+ if (ins->ipdn_zoneid == ipi->ipii_zoneid)
+ break;
+ }
+
+ if (ins == NULL) {
+ mutex_exit(&ipd_nsl_lock);
+ return (EINVAL);
+ }
+
+ mutex_enter(&ins->ipdn_lock);
+ ipi->ipii_corrupt = ins->ipdn_corrupt;
+ ipi->ipii_delay = ins->ipdn_delay;
+ ipi->ipii_drop = ins->ipdn_drop;
+ mutex_exit(&ins->ipdn_lock);
+ mutex_exit(&ipd_nsl_lock);
+
+ return (0);
+}
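+
+/*
+ * A matching read-back sketch (hypothetical, same assumptions as the
+ * IPDIOC_CORRUPT example above):
+ *
+ *	ipd_ioc_info_t ii;
+ *	ii.ipii_zoneid = getzoneid();
+ *	if (ioctl(fd, IPDIOC_INFO, &ii) == 0)
+ *		(void) printf("corrupt=%d delay=%u drop=%d\n",
+ *		    ii.ipii_corrupt, ii.ipii_delay, ii.ipii_drop);
+ */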
+
+/*
+ * The value of the ipil_nzones argument controls how this function works.
+ * When called with a value of zero, we treat that as the caller asking how
+ * many entries it should allocate memory for. If the caller is in the global
+ * zone, we answer with the number of currently active netstacks plus a fudge
+ * factor; otherwise the answer is always one.
+ *
+ * In the non-zero case, we give the caller that number of zone ids. While
+ * this isn't quite ideal, as the caller may miss a zone that appears between
+ * its two calls, this generally won't be an issue, as it involves a rather
+ * tight race condition in the current ipdadm implementation.
+ */
+static int
+ipd_ioctl_list(intptr_t arg, cred_t *cr)
+{
+ zoneid_t zid;
+ zoneid_t *zoneids;
+ ipd_netstack_t *ins;
+ uint_t nzoneids, rzids, cur;
+ int rval = 0;
+ STRUCT_DECL(ipd_ioc_list, h);
+
+ STRUCT_INIT(h, get_udatamodel());
+ if (ddi_copyin((void *)arg, STRUCT_BUF(h),
+ STRUCT_SIZE(h), 0) != 0)
+ return (EFAULT);
+
+ zid = crgetzoneid(cr);
+
+ rzids = STRUCT_FGET(h, ipil_nzones);
+ if (rzids == 0) {
+ if (zid == GLOBAL_ZONEID) {
+ mutex_enter(&ipd_nactive_lock);
+ rzids = ipd_nactive + ipd_nactive_fudge;
+ mutex_exit(&ipd_nactive_lock);
+ } else {
+ rzids = 1;
+ }
+ STRUCT_FSET(h, ipil_nzones, rzids);
+ if (ddi_copyout(STRUCT_BUF(h), (void *)arg,
+ STRUCT_SIZE(h), 0) != 0)
+ return (EFAULT);
+
+ return (0);
+ }
+
+ mutex_enter(&ipd_nsl_lock);
+ if (zid == GLOBAL_ZONEID) {
+ nzoneids = ipd_nactive;
+ } else {
+ nzoneids = 1;
+ }
+
+ zoneids = kmem_alloc(sizeof (zoneid_t) * nzoneids, KM_SLEEP);
+ cur = 0;
+ for (ins = list_head(&ipd_nsl); ins != NULL;
+ ins = list_next(&ipd_nsl, ins)) {
+ if (ins->ipdn_enabled == 0)
+ continue;
+
+ if (zid == GLOBAL_ZONEID || zid == ins->ipdn_zoneid) {
+ zoneids[cur++] = ins->ipdn_zoneid;
+ }
+
+ if (zid != GLOBAL_ZONEID && zid == ins->ipdn_zoneid)
+ break;
+ }
+ ASSERT(cur == nzoneids);
+ mutex_exit(&ipd_nsl_lock);
+
+ STRUCT_FSET(h, ipil_nzones, nzoneids);
+ if (nzoneids < rzids)
+ rzids = nzoneids;
+ /* Never copy out more entries than the caller allocated room for. */
+ if (ddi_copyout(zoneids, STRUCT_FGETP(h, ipil_list),
+     rzids * sizeof (zoneid_t), NULL) != 0)
+ rval = EFAULT;
+
+ kmem_free(zoneids, sizeof (zoneid_t) * nzoneids);
+ if (ddi_copyout(STRUCT_BUF(h), (void *)arg, STRUCT_SIZE(h), 0) != 0)
+ return (EFAULT);
+
+ return (rval);
+}
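+
+/*
+ * The sizing handshake above is normally driven in two passes; a
+ * hypothetical consumer sketch (the ipd_ioc_list_t userland type name is
+ * an assumption, its fields per the STRUCT_* accessors above):
+ *
+ *	ipd_ioc_list_t il;
+ *	il.ipil_nzones = 0;
+ *	(void) ioctl(fd, IPDIOC_LIST, &il);	(first pass: size hint)
+ *	il.ipil_list = malloc(il.ipil_nzones * sizeof (zoneid_t));
+ *	(void) ioctl(fd, IPDIOC_LIST, &il);	(second pass: zone ids)
+ */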
+
+static void *
+ipd_nin_create(const netid_t id)
+{
+ ipd_netstack_t *ins;
+ ipd_nskstat_t *ink;
+
+ ins = kmem_zalloc(sizeof (ipd_netstack_t), KM_SLEEP);
+ ins->ipdn_status = IPDN_STATUS_DISABLED;
+ ins->ipdn_netid = id;
+ ins->ipdn_zoneid = netstackid_to_zoneid(id);
+ ins->ipdn_rand = gethrtime();
+ mutex_init(&ins->ipdn_lock, NULL, MUTEX_DRIVER, NULL);
+
+ ins->ipdn_kstat = net_kstat_create(id, "ipd", ins->ipdn_zoneid,
+ "ipd", "net", KSTAT_TYPE_NAMED,
+ sizeof (ipd_nskstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (ins->ipdn_kstat != NULL) {
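+ /*
+ * The kstat is created in the zone that owns the netstack;
+ * additionally make it readable from the global zone so the
+ * GZ can observe every zone's perturbation counters.
+ */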
+ if (ins->ipdn_zoneid != GLOBAL_ZONEID)
+ kstat_zone_add(ins->ipdn_kstat, GLOBAL_ZONEID);
+
+ ink = &ins->ipdn_ksdata;
+ ins->ipdn_kstat->ks_data = ink;
+ kstat_named_init(&ink->ink_ncorrupts, "corrupts",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&ink->ink_ndrops, "drops", KSTAT_DATA_UINT64);
+ kstat_named_init(&ink->ink_ndelays, "delays",
+ KSTAT_DATA_UINT64);
+ kstat_install(ins->ipdn_kstat);
+ }
+
+ mutex_enter(&ipd_nsl_lock);
+ list_insert_tail(&ipd_nsl, ins);
+ mutex_exit(&ipd_nsl_lock);
+
+ return (ins);
+}
+
+static void
+ipd_nin_shutdown(const netid_t id, void *arg)
+{
+ ipd_netstack_t *ins = arg;
+
+ VERIFY(id == ins->ipdn_netid);
+ mutex_enter(&ins->ipdn_lock);
+ ASSERT(ins->ipdn_status == IPDN_STATUS_DISABLED ||
+ ins->ipdn_status == IPDN_STATUS_ENABLED);
+ ins->ipdn_status |= IPDN_STATUS_CONDEMNED;
+ if (ins->ipdn_kstat != NULL)
+ net_kstat_delete(id, ins->ipdn_kstat);
+ mutex_exit(&ins->ipdn_lock);
+}
+
+/*ARGSUSED*/
+static void
+ipd_nin_destroy(const netid_t id, void *arg)
+{
+ ipd_netstack_t *ins = arg;
+
+ /*
+ * At this point none of the hooks should be able to fire because the
+ * zone has been shutdown and we are in the process of destroying it.
+ * Thus it should not be possible for someone else to come in and grab
+ * our ipd_netstack_t for this zone. Because of that, we know that we
+ * are the only ones who could be running here.
+ */
+ mutex_enter(&ipd_nsl_lock);
+ list_remove(&ipd_nsl, ins);
+ mutex_exit(&ipd_nsl_lock);
+
+ if (ins->ipdn_hooked)
+ ipd_teardown_hooks(ins);
+ mutex_destroy(&ins->ipdn_lock);
+ kmem_free(ins, sizeof (ipd_netstack_t));
+}
+
+/*ARGSUSED*/
+static int
+ipd_open(dev_t *devp, int flag, int otype, cred_t *credp)
+{
+ if (flag & FEXCL || flag & FNDELAY)
+ return (EINVAL);
+
+ if (otype != OTYP_CHR)
+ return (EINVAL);
+
+ if (!(flag & FREAD && flag & FWRITE))
+ return (EINVAL);
+
+ if (secpolicy_ip_config(credp, B_FALSE) != 0)
+ return (EPERM);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+ipd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
+{
+ int rval;
+ ipd_ioc_perturb_t ipip;
+ ipd_ioc_info_t ipii;
+
+ switch (cmd) {
+ case IPDIOC_CORRUPT:
+ case IPDIOC_DELAY:
+ case IPDIOC_DROP:
+ if (ddi_copyin((void *)arg, &ipip, sizeof (ipd_ioc_perturb_t),
+ 0) != 0)
+ return (EFAULT);
+ rval = ipd_ioctl_perturb(&ipip, cr, cmd);
+ return (rval);
+ case IPDIOC_REMOVE:
+ if (ddi_copyin((void *)arg, &ipip, sizeof (ipd_ioc_perturb_t),
+ 0) != 0)
+ return (EFAULT);
+ rval = ipd_ioctl_remove(&ipip, cr);
+ return (rval);
+ case IPDIOC_LIST:
+ /*
+ * Because the list ioctl doesn't have a fixed-size struct due
+ * to needing to pass around a pointer, we instead delegate the
+ * copyin logic to the list code.
+ */
+ return (ipd_ioctl_list(arg, cr));
+ case IPDIOC_INFO:
+ if (ddi_copyin((void *)arg, &ipii, sizeof (ipd_ioc_info_t),
+ 0) != 0)
+ return (EFAULT);
+ rval = ipd_ioctl_info(&ipii, cr);
+ if (rval != 0)
+ return (rval);
+ if (ddi_copyout(&ipii, (void *)arg, sizeof (ipd_ioc_info_t),
+ 0) != 0)
+ return (EFAULT);
+ return (0);
+ default:
+ break;
+ }
+ return (ENOTTY);
+}
+
+/*ARGSUSED*/
+static int
+ipd_close(dev_t dev, int flag, int otype, cred_t *credp)
+{
+ return (0);
+}
+
+static int
+ipd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ minor_t instance;
+
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ if (ipd_devi != NULL)
+ return (DDI_FAILURE);
+
+ instance = ddi_get_instance(dip);
+ if (ddi_create_minor_node(dip, "ipd", S_IFCHR, instance,
+ DDI_PSEUDO, 0) == DDI_FAILURE)
+ return (DDI_FAILURE);
+
+ ipd_neti = net_instance_alloc(NETINFO_VERSION);
+ if (ipd_neti == NULL) {
+ ddi_remove_minor_node(dip, NULL);
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Note that these global structures MUST be initialized before we call
+ * net_instance_register, as registering immediately triggers the
+ * ipd_nin_create callbacks.
+ */
+ list_create(&ipd_nsl, sizeof (ipd_netstack_t),
+ offsetof(ipd_netstack_t, ipdn_link));
+ mutex_init(&ipd_nsl_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&ipd_nactive_lock, NULL, MUTEX_DRIVER, NULL);
+
+ /* Note, net_instance_alloc sets the version. */
+ ipd_neti->nin_name = "ipd";
+ ipd_neti->nin_create = ipd_nin_create;
+ ipd_neti->nin_destroy = ipd_nin_destroy;
+ ipd_neti->nin_shutdown = ipd_nin_shutdown;
+ if (net_instance_register(ipd_neti) == DDI_FAILURE) {
+ net_instance_free(ipd_neti);
+ ddi_remove_minor_node(dip, NULL);
+ return (DDI_FAILURE);
+ }
+
+ ddi_report_dev(dip);
+ ipd_devi = dip;
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+ipd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+ int error;
+
+ switch (infocmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *result = ipd_devi;
+ error = DDI_SUCCESS;
+ break;
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)(uintptr_t)getminor((dev_t)arg);
+ error = DDI_SUCCESS;
+ break;
+ default:
+ error = DDI_FAILURE;
+ break;
+ }
+
+ return (error);
+}
+
+static int
+ipd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ ASSERT(dip == ipd_devi);
+ ddi_remove_minor_node(dip, NULL);
+ ipd_devi = NULL;
+
+ if (ipd_neti != NULL) {
+ VERIFY(net_instance_unregister(ipd_neti) == 0);
+ net_instance_free(ipd_neti);
+ }
+
+ mutex_destroy(&ipd_nsl_lock);
+ mutex_destroy(&ipd_nactive_lock);
+ list_destroy(&ipd_nsl);
+
+ return (DDI_SUCCESS);
+}
+
+static struct cb_ops ipd_cb_ops = {
+ ipd_open, /* open */
+ ipd_close, /* close */
+ nodev, /* strategy */
+ nodev, /* print */
+ nodev, /* dump */
+ nodev, /* read */
+ nodev, /* write */
+ ipd_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ nochpoll, /* poll */
+ ddi_prop_op, /* cb_prop_op */
+ NULL, /* streamtab */
+ D_NEW | D_MP, /* Driver compatibility flag */
+ CB_REV, /* rev */
+ nodev, /* aread */
+ nodev /* awrite */
+};
+
+static struct dev_ops ipd_ops = {
+ DEVO_REV, /* devo_rev */
+ 0, /* refcnt */
+ ipd_getinfo, /* get_dev_info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ ipd_attach, /* attach */
+ ipd_detach, /* detach */
+ nodev, /* reset */
+ &ipd_cb_ops, /* driver operations */
+ NULL, /* bus operations */
+ nodev, /* dev power */
+ ddi_quiesce_not_needed /* quiesce */
+};
+
+static struct modldrv modldrv = {
+ &mod_driverops,
+ "Internet packet disturber",
+ &ipd_ops
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ { (void *)&modldrv, NULL }
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ return (mod_remove(&modlinkage));
+}
diff --git a/usr/src/uts/common/inet/ipd/ipd.conf b/usr/src/uts/common/inet/ipd/ipd.conf
new file mode 100644
index 0000000000..83b9b685f4
--- /dev/null
+++ b/usr/src/uts/common/inet/ipd/ipd.conf
@@ -0,0 +1,27 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2012 Joyent, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+name="ipd" parent="pseudo" instance=0;
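+#
+# A single pseudo instance suffices: ipd_attach() refuses a second attach and
+# hangs all per-netstack state off this one control node.
+#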
diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
index 98cda0b7cc..75bac21ae4 100644
--- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
+++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
@@ -141,11 +141,13 @@ ipf_stack_t *ifs;
#define UNDO_HOOK(_f, _b, _e, _h) \
do { \
+ int tmp; \
if (ifs->_f != NULL) { \
if (ifs->_b) { \
- ifs->_b = (net_hook_unregister(ifs->_f, \
- _e, ifs->_h) != 0); \
- if (!ifs->_b) { \
+ tmp = net_hook_unregister(ifs->_f, \
+ _e, ifs->_h); \
+ ifs->_b = (tmp != 0 && tmp != ENXIO); \
+ if (!ifs->_b && ifs->_h != NULL) { \
hook_free(ifs->_h); \
ifs->_h = NULL; \
} \
diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c
index 6d0bf70b2a..2e08dc359b 100644
--- a/usr/src/uts/common/inet/squeue.c
+++ b/usr/src/uts/common/inet/squeue.c
@@ -23,6 +23,10 @@
*/
/*
+ * Copyright 2012 Joyent, Inc. All rights reserved.
+ */
+
+/*
* Squeues: General purpose serialization mechanism
* ------------------------------------------------
*
@@ -120,6 +124,8 @@
#include <sys/sdt.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
+#include <sys/stack.h>
+#include <sys/archsystm.h>
#include <inet/ipclassifier.h>
#include <inet/udp_impl.h>
@@ -142,6 +148,9 @@ int squeue_workerwait_ms = 0;
static int squeue_drain_ns = 0;
static int squeue_workerwait_tick = 0;
+uintptr_t squeue_drain_stack_needed = 10240;
+uint_t squeue_drain_stack_toodeep;
+
#define MAX_BYTES_TO_PICKUP 150000
#define ENQUEUE_CHAIN(sqp, mp, tail, cnt) { \
@@ -546,6 +555,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
ASSERT(MUTEX_HELD(&sqp->sq_lock));
ASSERT(sqp->sq_first != NULL);
now = gethrtime();
+ sqp->sq_run = curthread;
sqp->sq_drain(sqp, SQS_ENTER, now + squeue_drain_ns);
/*
@@ -711,6 +721,20 @@ squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire)
boolean_t sq_poll_capable;
ip_recv_attr_t *ira, iras;
+ /*
+ * Before doing any work, check our stack depth; if we're not a
+ * worker thread for this squeue and we're beginning to get tight
+ * on stack, kick the worker, bump a counter and return.
+ */
+ if (proc_type != SQS_WORKER && STACK_BIAS + (uintptr_t)getfp() -
+ (uintptr_t)curthread->t_stkbase < squeue_drain_stack_needed) {
+ ASSERT(mutex_owned(&sqp->sq_lock));
+ sqp->sq_awaken = ddi_get_lbolt();
+ cv_signal(&sqp->sq_worker_cv);
+ squeue_drain_stack_toodeep++;
+ return;
+ }
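+
+ /*
+ * (squeue_drain_stack_needed is left as a global so the threshold can
+ * be tuned without a rebuild, e.g. via /etc/system with
+ * "set ip:squeue_drain_stack_needed = 0x4000", assuming squeue.c
+ * remains part of the ip module.)
+ */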
+
sq_poll_capable = (sqp->sq_state & SQS_POLL_CAPAB) != 0;
again:
ASSERT(mutex_owned(&sqp->sq_lock));
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 1bb87e5c56..f79427e766 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -3792,7 +3792,8 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns)
ASSERT(error == 0);
tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
ASSERT(tcps->tcps_ixa_cleanup_mp != NULL);
- cv_init(&tcps->tcps_ixa_cleanup_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tcps->tcps_ixa_cleanup_ready_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tcps->tcps_ixa_cleanup_done_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&tcps->tcps_ixa_cleanup_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&tcps->tcps_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -3857,7 +3858,8 @@ tcp_stack_fini(netstackid_t stackid, void *arg)
freeb(tcps->tcps_ixa_cleanup_mp);
tcps->tcps_ixa_cleanup_mp = NULL;
- cv_destroy(&tcps->tcps_ixa_cleanup_cv);
+ cv_destroy(&tcps->tcps_ixa_cleanup_ready_cv);
+ cv_destroy(&tcps->tcps_ixa_cleanup_done_cv);
mutex_destroy(&tcps->tcps_ixa_cleanup_lock);
/*
diff --git a/usr/src/uts/common/inet/tcp/tcp_stats.c b/usr/src/uts/common/inet/tcp/tcp_stats.c
index 82fc0b227c..3cc5627b27 100644
--- a/usr/src/uts/common/inet/tcp/tcp_stats.c
+++ b/usr/src/uts/common/inet/tcp/tcp_stats.c
@@ -21,12 +21,14 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, Joyent Inc. All rights reserved.
*/
#include <sys/types.h>
#include <sys/tihdr.h>
#include <sys/policy.h>
#include <sys/tsol/tnet.h>
+#include <sys/kstat.h>
#include <inet/common.h>
#include <inet/ip.h>
@@ -505,7 +507,7 @@ tcp_kstat_init(netstackid_t stackid)
{ "connTableSize6", KSTAT_DATA_INT32, 0 }
};
- ksp = kstat_create_netstack(TCP_MOD_NAME, 0, TCP_MOD_NAME, "mib2",
+ ksp = kstat_create_netstack(TCP_MOD_NAME, stackid, TCP_MOD_NAME, "mib2",
KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0, stackid);
if (ksp == NULL)
@@ -518,6 +520,13 @@ tcp_kstat_init(netstackid_t stackid)
ksp->ks_update = tcp_kstat_update;
ksp->ks_private = (void *)(uintptr_t)stackid;
+ /*
+ * If this is an exclusive netstack for a local zone, the global zone
+ * should still be able to read the kstat.
+ */
+ if (stackid != GLOBAL_NETSTACKID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
kstat_install(ksp);
return (ksp);
}
@@ -733,7 +742,7 @@ tcp_kstat2_init(netstackid_t stackid)
#endif
};
- ksp = kstat_create_netstack(TCP_MOD_NAME, 0, "tcpstat", "net",
+ ksp = kstat_create_netstack(TCP_MOD_NAME, stackid, "tcpstat", "net",
KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), 0,
stackid);
@@ -744,6 +753,13 @@ tcp_kstat2_init(netstackid_t stackid)
ksp->ks_private = (void *)(uintptr_t)stackid;
ksp->ks_update = tcp_kstat2_update;
+ /*
+ * If this is an exclusive netstack for a local zone, the global zone
+ * should still be able to read the kstat.
+ */
+ if (stackid != GLOBAL_NETSTACKID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
kstat_install(ksp);
return (ksp);
}
diff --git a/usr/src/uts/common/inet/tcp_stack.h b/usr/src/uts/common/inet/tcp_stack.h
index 2dccf6b78c..e46ebe08da 100644
--- a/usr/src/uts/common/inet/tcp_stack.h
+++ b/usr/src/uts/common/inet/tcp_stack.h
@@ -101,7 +101,8 @@ struct tcp_stack {
/* Used to synchronize access when reclaiming memory */
mblk_t *tcps_ixa_cleanup_mp;
kmutex_t tcps_ixa_cleanup_lock;
- kcondvar_t tcps_ixa_cleanup_cv;
+ kcondvar_t tcps_ixa_cleanup_ready_cv;
+ kcondvar_t tcps_ixa_cleanup_done_cv;
/* Variables for handling kmem reclaim call back. */
kmutex_t tcps_reclaim_lock;
diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c
index 40cbe86170..2152ce0baa 100644
--- a/usr/src/uts/common/io/dld/dld_drv.c
+++ b/usr/src/uts/common/io/dld/dld_drv.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, Joyent Inc. All rights reserved.
*/
/*
@@ -701,7 +702,8 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set,
err = EACCES;
goto done;
}
- err = dls_devnet_setzid(dlh, dzp->diz_zid);
+ err = dls_devnet_setzid(dlh, dzp->diz_zid,
+ dzp->diz_transient);
} else {
kprop->pr_perm_flags = MAC_PROP_PERM_RW;
(*(zoneid_t *)kprop->pr_val) = dls_devnet_getzid(dlh);
@@ -865,7 +867,7 @@ drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
return (err);
if ((err = dls_devnet_rename(dir->dir_linkid1, dir->dir_linkid2,
- dir->dir_link)) != 0)
+ dir->dir_link, dir->dir_zoneinit)) != 0)
return (err);
if (dir->dir_linkid2 == DATALINK_INVALID_LINKID)
diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c
index f90adbf27a..d35c1e4bbf 100644
--- a/usr/src/uts/common/io/dls/dls.c
+++ b/usr/src/uts/common/io/dls/dls.c
@@ -25,6 +25,10 @@
*/
/*
+ * Copyright 2011 Joyent, Inc. All rights reserved.
+ */
+
+/*
* Data-Link Services Module
*/
@@ -610,6 +614,22 @@ boolean_t
dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx,
void **ds_rx_arg, boolean_t loopback)
{
+ if (dsp->ds_promisc == 0) {
+ /*
+ * If there are active walkers of the mi_promisc_list when
+ * promiscuousness is disabled, ds_promisc will be cleared,
+ * but the DLS will remain on the mi_promisc_list until the
+ * walk is completed. If we do not recognize this case here,
+ * we won't properly execute the ds_promisc case in the common
+ * accept routine -- and we will potentially accept a packet
+ * that has originated with this DLS (which in turn can
+ * induce recursion and death by stack overflow). If
+ * ds_promisc is zero, we know that we are in this window --
+ * and we refuse to accept the packet.
+ */
+ return (B_FALSE);
+ }
+
return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE,
loopback));
}
diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c
index 049c4bd757..5fa37e0a8a 100644
--- a/usr/src/uts/common/io/dls/dls_mgmt.c
+++ b/usr/src/uts/common/io/dls/dls_mgmt.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
/*
@@ -105,12 +106,13 @@ typedef struct dls_devnet_s {
zoneid_t dd_zid; /* current zone */
boolean_t dd_prop_loaded;
taskqid_t dd_prop_taskid;
+ boolean_t dd_transient; /* link goes away when zone does */
} dls_devnet_t;
static int i_dls_devnet_create_iptun(const char *, const char *,
datalink_id_t *);
static int i_dls_devnet_destroy_iptun(datalink_id_t);
-static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t);
+static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t, boolean_t);
static int dls_devnet_unset(const char *, datalink_id_t *, boolean_t);
/*ARGSUSED*/
@@ -145,7 +147,12 @@ dls_zone_remove(datalink_id_t linkid, void *arg)
dls_devnet_t *ddp;
if (dls_devnet_hold_tmp(linkid, &ddp) == 0) {
- (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID);
+ /*
+ * Don't bother moving transient links back to the global zone
+ * since we will simply delete them in dls_devnet_unset.
+ */
+ if (!ddp->dd_transient)
+ (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE);
dls_devnet_rele_tmp(ddp);
}
return (0);
@@ -526,6 +533,7 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid)
getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID;
(void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN);
+ getlinkid.ld_zoneid = getzoneid();
if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval,
sizeof (retval))) == 0) {
@@ -740,12 +748,23 @@ dls_devnet_stat_update(kstat_t *ksp, int rw)
* Create the "link" kstats.
*/
static void
-dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid)
+dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid, zoneid_t newzoneid)
{
kstat_t *ksp;
+ char *nm;
+ char kname[MAXLINKNAMELEN];
+
+ if (zoneid != newzoneid) {
+ ASSERT(zoneid == GLOBAL_ZONEID);
+ (void) snprintf(kname, sizeof (kname), "z%d_%s", newzoneid,
+ ddp->dd_linkname);
+ nm = kname;
+ } else {
+ nm = ddp->dd_linkname;
+ }
- if (dls_stat_create("link", 0, ddp->dd_linkname, zoneid,
- dls_devnet_stat_update, ddp, &ksp) == 0) {
+ if (dls_stat_create("link", 0, nm, zoneid,
+ dls_devnet_stat_update, ddp, &ksp, newzoneid) == 0) {
ASSERT(ksp != NULL);
if (zoneid == ddp->dd_owner_zid) {
ASSERT(ddp->dd_ksp == NULL);
@@ -765,12 +784,12 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid)
{
if (zoneid == ddp->dd_owner_zid) {
if (ddp->dd_ksp != NULL) {
- kstat_delete(ddp->dd_ksp);
+ dls_stat_delete(ddp->dd_ksp);
ddp->dd_ksp = NULL;
}
} else {
if (ddp->dd_zone_ksp != NULL) {
- kstat_delete(ddp->dd_zone_ksp);
+ dls_stat_delete(ddp->dd_zone_ksp);
ddp->dd_zone_ksp = NULL;
}
}
@@ -781,15 +800,25 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid)
* and create the new set using the new name.
*/
static void
-dls_devnet_stat_rename(dls_devnet_t *ddp)
+dls_devnet_stat_rename(dls_devnet_t *ddp, boolean_t zoneinit)
{
if (ddp->dd_ksp != NULL) {
- kstat_delete(ddp->dd_ksp);
+ dls_stat_delete(ddp->dd_ksp);
ddp->dd_ksp = NULL;
}
- /* We can't rename a link while it's assigned to a non-global zone. */
+ if (zoneinit && ddp->dd_zone_ksp != NULL) {
+ dls_stat_delete(ddp->dd_zone_ksp);
+ ddp->dd_zone_ksp = NULL;
+ }
+ /*
+ * We can't rename a link while it's assigned to a non-global zone
+ * unless we're first initializing the zone while readying it.
+ */
ASSERT(ddp->dd_zone_ksp == NULL);
- dls_devnet_stat_create(ddp, ddp->dd_owner_zid);
+ dls_devnet_stat_create(ddp, ddp->dd_owner_zid,
+ (zoneinit ? ddp->dd_zid : ddp->dd_owner_zid));
+ if (zoneinit)
+ dls_devnet_stat_create(ddp, ddp->dd_zid, ddp->dd_zid);
}
/*
@@ -878,7 +907,8 @@ done:
rw_exit(&i_dls_devnet_lock);
if (err == 0) {
if (zoneid != GLOBAL_ZONEID &&
- (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE)) != 0)
+ (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE,
+ B_FALSE)) != 0)
(void) dls_devnet_unset(macname, &linkid, B_TRUE);
/*
* The kstat subsystem holds its own locks (rather perimeter)
@@ -887,7 +917,7 @@ done:
* lock hierarchy is kstat locks -> i_dls_devnet_lock.
*/
if (stat_create)
- dls_devnet_stat_create(ddp, zoneid);
+ dls_devnet_stat_create(ddp, zoneid, zoneid);
if (ddpp != NULL)
*ddpp = ddp;
}
@@ -924,17 +954,64 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
ASSERT(ddp->dd_ref != 0);
if ((ddp->dd_ref != 1) || (!wait &&
(ddp->dd_tref != 0 || ddp->dd_prop_taskid != NULL))) {
- mutex_exit(&ddp->dd_mutex);
- rw_exit(&i_dls_devnet_lock);
- return (EBUSY);
+ int zstatus = 0;
+
+ /*
+ * There are a couple of alternatives that might be going on
+ * here; a) the zone is shutting down and it has a transient
+ * link assigned, in which case we want to clean it up instead
+ * of moving it back to the global zone, or b) it's possible
+ * that we're trying to clean up an orphaned vnic that was
+ * delegated to a zone and which wasn't cleaned up properly
+ * when the zone went away. Check for either of these cases
+ * before we simply return EBUSY.
+ *
+ * zstatus indicates which situation we are dealing with:
+ * 0 - means return EBUSY
+ * 1 - means case (a), cleanup transient link
+ *	-1 - means case (b), orphaned VNIC
+ */
+ if (ddp->dd_ref > 1 && ddp->dd_zid != GLOBAL_ZONEID) {
+ zone_t *zp;
+
+ if ((zp = zone_find_by_id(ddp->dd_zid)) == NULL) {
+ zstatus = -1;
+ } else {
+ if (ddp->dd_transient) {
+ zone_status_t s = zone_status_get(zp);
+
+ if (s >= ZONE_IS_SHUTTING_DOWN)
+ zstatus = 1;
+ }
+ zone_rele(zp);
+ }
+ }
+
+ if (zstatus == 0) {
+ mutex_exit(&ddp->dd_mutex);
+ rw_exit(&i_dls_devnet_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * We want to delete the link, so reset dd_ref to 1.
+ */
+ if (zstatus == -1)
+ /* Log a warning, but continue in this case */
+ cmn_err(CE_WARN, "clear orphaned datalink: %s\n",
+ ddp->dd_linkname);
+ ddp->dd_ref = 1;
}
ddp->dd_flags |= DD_CONDEMNED;
ddp->dd_ref--;
*id = ddp->dd_linkid;
- if (ddp->dd_zid != GLOBAL_ZONEID)
- (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE);
+ if (ddp->dd_zid != GLOBAL_ZONEID) {
+ dls_devnet_stat_destroy(ddp, ddp->dd_zid);
+ (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE,
+ B_FALSE);
+ }
/*
* Remove this dls_devnet_t from the hash table.
@@ -1261,9 +1338,15 @@ dls_devnet_phydev(datalink_id_t vlanid, dev_t *devp)
*
* This case does not change the <link name, linkid> mapping, so the link's
* kstats need to be updated with using name associated the given id2.
+ *
+ * The zonename parameter is used to allow us to create a VNIC in the global
+ * zone which is assigned to a non-global zone. Since there is a race condition
+ * in the create process if two VNICs have the same name, we need to rename it
+ * after it has been assigned to the zone.
*/
int
-dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
+dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link,
+ boolean_t zoneinit)
{
dls_dev_handle_t ddh = NULL;
int err = 0;
@@ -1313,13 +1396,16 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
* is currently accessing the link kstats, or if the link is on-loan
* to a non-global zone. Then set the DD_KSTAT_CHANGING flag to
* prevent any access to the kstats while we delete and recreate
- * kstats below.
+ * kstats below. However, we skip this check if we're renaming the
+ * vnic as part of bringing it up for a zone.
*/
mutex_enter(&ddp->dd_mutex);
- if (ddp->dd_ref > 1) {
- mutex_exit(&ddp->dd_mutex);
- err = EBUSY;
- goto done;
+ if (!zoneinit) {
+ if (ddp->dd_ref > 1) {
+ mutex_exit(&ddp->dd_mutex);
+ err = EBUSY;
+ goto done;
+ }
}
ddp->dd_flags |= DD_KSTAT_CHANGING;
@@ -1333,7 +1419,15 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
/* rename mac client name and its flow if exists */
if ((err = mac_open(ddp->dd_mac, &mh)) != 0)
goto done;
- (void) mac_rename_primary(mh, link);
+ if (zoneinit) {
+ char tname[MAXLINKNAMELEN];
+
+ (void) snprintf(tname, sizeof (tname), "z%d_%s",
+ ddp->dd_zid, link);
+ (void) mac_rename_primary(mh, tname);
+ } else {
+ (void) mac_rename_primary(mh, link);
+ }
mac_close(mh);
goto done;
}
@@ -1406,7 +1500,7 @@ done:
*/
rw_exit(&i_dls_devnet_lock);
if (err == 0)
- dls_devnet_stat_rename(ddp);
+ dls_devnet_stat_rename(ddp, zoneinit);
if (clear_dd_flag) {
mutex_enter(&ddp->dd_mutex);
@@ -1421,7 +1515,8 @@ done:
}
static int
-i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop)
+i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop,
+ boolean_t transient)
{
int err;
mac_perim_handle_t mph;
@@ -1454,6 +1549,7 @@ i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop)
}
if ((err = dls_link_setzid(ddp->dd_mac, new_zoneid)) == 0) {
ddp->dd_zid = new_zoneid;
+ ddp->dd_transient = transient;
devnet_need_rebuild = B_TRUE;
}
@@ -1468,7 +1564,7 @@ done:
}
int
-dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid)
+dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid, boolean_t transient)
{
dls_devnet_t *ddp;
int err;
@@ -1490,7 +1586,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid)
refheld = B_TRUE;
}
- if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE)) != 0) {
+ if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE, transient)) != 0) {
if (refheld)
dls_devnet_rele(ddp);
return (err);
@@ -1507,7 +1603,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid)
if (old_zid != GLOBAL_ZONEID)
dls_devnet_stat_destroy(ddh, old_zid);
if (new_zid != GLOBAL_ZONEID)
- dls_devnet_stat_create(ddh, new_zid);
+ dls_devnet_stat_create(ddh, new_zid, new_zid);
return (0);
}
diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c
index 51e4be7260..82dceff278 100644
--- a/usr/src/uts/common/io/dls/dls_stat.c
+++ b/usr/src/uts/common/io/dls/dls_stat.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
/*
@@ -30,30 +31,33 @@
#include <sys/dld_impl.h>
#include <sys/mac_ether.h>
-static mac_stat_info_t i_dls_si[] = {
- { MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_MULTIRCV, "multircv", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_BRDCSTRCV, "brdcstrcv", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_MULTIXMT, "multixmt", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_BRDCSTXMT, "brdcstxmt", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_NORCVBUF, "norcvbuf", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_IERRORS, "ierrors", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_NOXMTBUF, "noxmtbuf", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_OERRORS, "oerrors", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_COLLISIONS, "collisions", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_RBYTES, "rbytes", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_IPACKETS, "ipackets", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_OBYTES, "obytes", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_OPACKETS, "opackets", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_RBYTES, "rbytes64", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_IPACKETS, "ipackets64", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_OBYTES, "obytes64", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_OPACKETS, "opackets64", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_LINK_STATE, "link_state", KSTAT_DATA_UINT32,
- (uint64_t)LINK_STATE_UNKNOWN}
-};
-
-#define STAT_INFO_COUNT (sizeof (i_dls_si) / sizeof (i_dls_si[0]))
+/*
+ * structure for link kstats
+ */
+typedef struct {
+ kstat_named_t dk_ifspeed;
+ kstat_named_t dk_multircv;
+ kstat_named_t dk_brdcstrcv;
+ kstat_named_t dk_multixmt;
+ kstat_named_t dk_brdcstxmt;
+ kstat_named_t dk_norcvbuf;
+ kstat_named_t dk_ierrors;
+ kstat_named_t dk_noxmtbuf;
+ kstat_named_t dk_oerrors;
+ kstat_named_t dk_collisions;
+ kstat_named_t dk_rbytes;
+ kstat_named_t dk_ipackets;
+ kstat_named_t dk_obytes;
+ kstat_named_t dk_opackets;
+ kstat_named_t dk_rbytes64;
+ kstat_named_t dk_ipackets64;
+ kstat_named_t dk_obytes64;
+ kstat_named_t dk_opackets64;
+ kstat_named_t dk_link_state;
+ kstat_named_t dk_link_duplex;
+ kstat_named_t dk_unknowns;
+ kstat_named_t dk_zonename;
+} dls_kstat_t;
/*
* Exported functions.
@@ -61,42 +65,54 @@ static mac_stat_info_t i_dls_si[] = {
int
dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw)
{
- kstat_named_t *knp;
- uint_t i;
- uint64_t val;
+ dls_kstat_t *dkp = ksp->ks_data;
if (rw != KSTAT_READ)
return (EACCES);
- knp = (kstat_named_t *)ksp->ks_data;
- for (i = 0; i < STAT_INFO_COUNT; i++) {
- val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat);
-
- switch (i_dls_si[i].msi_type) {
- case KSTAT_DATA_UINT64:
- knp->value.ui64 = val;
- break;
- case KSTAT_DATA_UINT32:
- knp->value.ui32 = (uint32_t)val;
- break;
- default:
- ASSERT(B_FALSE);
- }
-
- knp++;
- }
+ dkp->dk_ifspeed.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_IFSPEED);
+ dkp->dk_multircv.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_MULTIRCV);
+ dkp->dk_brdcstrcv.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_BRDCSTRCV);
+ dkp->dk_multixmt.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_MULTIXMT);
+ dkp->dk_brdcstxmt.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_BRDCSTXMT);
+ dkp->dk_norcvbuf.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_NORCVBUF);
+ dkp->dk_ierrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_IERRORS);
+ dkp->dk_noxmtbuf.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_NOXMTBUF);
+ dkp->dk_oerrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OERRORS);
+ dkp->dk_collisions.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_COLLISIONS);
+ dkp->dk_rbytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES);
+ dkp->dk_ipackets.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_IPACKETS);
+ dkp->dk_obytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES);
+ dkp->dk_opackets.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_OPACKETS);
+ dkp->dk_rbytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES);
+ dkp->dk_ipackets64.value.ui64 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_IPACKETS);
+ dkp->dk_obytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES);
+ dkp->dk_opackets64.value.ui64 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_OPACKETS);
+ dkp->dk_link_state.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_LINK_STATE);
/*
* Ethernet specific kstat "link_duplex"
*/
if (dlp->dl_mip->mi_nativemedia != DL_ETHER) {
- knp->value.ui32 = LINK_DUPLEX_UNKNOWN;
+ dkp->dk_link_duplex.value.ui32 = LINK_DUPLEX_UNKNOWN;
} else {
- val = mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX);
- knp->value.ui32 = (uint32_t)val;
+ dkp->dk_link_duplex.value.ui32 =
+ (uint32_t)mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX);
}
- knp++;
- knp->value.ui32 = dlp->dl_unknowns;
+
+ dkp->dk_unknowns.value.ui32 = dlp->dl_unknowns;
return (0);
}
@@ -104,30 +120,66 @@ dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw)
int
dls_stat_create(const char *module, int instance, const char *name,
zoneid_t zoneid, int (*update)(struct kstat *, int), void *private,
- kstat_t **kspp)
+ kstat_t **kspp, zoneid_t newzoneid)
{
kstat_t *ksp;
- kstat_named_t *knp;
- uint_t i;
+ zone_t *zone;
+ dls_kstat_t *dkp;
if ((ksp = kstat_create_zone(module, instance, name, "net",
- KSTAT_TYPE_NAMED, STAT_INFO_COUNT + 2, 0, zoneid)) == NULL) {
+ KSTAT_TYPE_NAMED, sizeof (dls_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zoneid)) == NULL) {
return (EINVAL);
}
ksp->ks_update = update;
ksp->ks_private = private;
+ dkp = ksp->ks_data = kmem_zalloc(sizeof (dls_kstat_t), KM_SLEEP);
+ if ((zone = zone_find_by_id(newzoneid)) != NULL) {
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ }
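+ /*
+ * The extra bytes cover the KSTAT_DATA_STRING "zonename" entry
+ * initialized below; string kstats are copied out with the
+ * snapshot, so their storage must be included in ks_data_size.
+ */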
- knp = (kstat_named_t *)ksp->ks_data;
- for (i = 0; i < STAT_INFO_COUNT; i++) {
- kstat_named_init(knp, i_dls_si[i].msi_name,
- i_dls_si[i].msi_type);
- knp++;
+ kstat_named_init(&dkp->dk_ifspeed, "ifspeed", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_multircv, "multircv", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_brdcstrcv, "brdcstrcv", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_multixmt, "multixmt", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_brdcstxmt, "brdcstxmt", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_norcvbuf, "norcvbuf", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_ierrors, "ierrors", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_noxmtbuf, "noxmtbuf", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_oerrors, "oerrors", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_collisions, "collisions", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_rbytes, "rbytes", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_ipackets, "ipackets", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_obytes, "obytes", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_opackets, "opackets", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_rbytes64, "rbytes64", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_ipackets64, "ipackets64", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_obytes64, "obytes64", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_opackets64, "opackets64", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_link_state, "link_state", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_link_duplex, "link_duplex",
+ KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_unknowns, "unknowns", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_zonename, "zonename", KSTAT_DATA_STRING);
+
+ if (zone != NULL) {
+ kstat_named_setstr(&dkp->dk_zonename, zone->zone_name);
+ zone_rele(zone);
}
- kstat_named_init(knp++, "link_duplex", KSTAT_DATA_UINT32);
- kstat_named_init(knp, "unknowns", KSTAT_DATA_UINT32);
kstat_install(ksp);
*kspp = ksp;
return (0);
}
+
+void
+dls_stat_delete(kstat_t *ksp)
+{
+ void *data;
+ if (ksp != NULL) {
+ data = ksp->ks_data;
+ kstat_delete(ksp);
+ kmem_free(data, sizeof (dls_kstat_t));
+ }
+}
diff --git a/usr/src/uts/common/io/fibre-channel/fca/fcoei/fcoei_eth.c b/usr/src/uts/common/io/fibre-channel/fca/fcoei/fcoei_eth.c
index 6b999192a9..249d0ffabd 100644
--- a/usr/src/uts/common/io/fibre-channel/fca/fcoei/fcoei_eth.c
+++ b/usr/src/uts/common/io/fibre-channel/fca/fcoei/fcoei_eth.c
@@ -969,7 +969,7 @@ fcoei_fill_els_fpkt_resp(fcoe_frame_t *frm, fcoei_exchange_t *xch, int size)
prli_acc->image_pair_established =
(FCOE_B2V_2(src + offset) & BIT_13) ? 1 : 0;
prli_acc->accept_response_code =
- FCOE_B2V_2(src + offset) & 0x0F00;
+ (uint16_t)(FCOE_B2V_2(src + offset) & 0x0F00);
/*
* process associator
*/
diff --git a/usr/src/uts/common/io/fibre-channel/fca/fcoei/fcoei_lv.c b/usr/src/uts/common/io/fibre-channel/fca/fcoei/fcoei_lv.c
index d1dce9b1f3..66a61e620e 100644
--- a/usr/src/uts/common/io/fibre-channel/fca/fcoei/fcoei_lv.c
+++ b/usr/src/uts/common/io/fibre-channel/fca/fcoei/fcoei_lv.c
@@ -1455,9 +1455,9 @@ fcoei_fill_els_prli_cmd(fc_packet_t *fpkt, fcoe_frame_t *frm)
* PRLI flags, only 3 bits are valid
*/
offset = 6;
- FCOE_V2B_2((fcp_spp->orig_process_assoc_valid * BIT_15) |
+ FCOE_V2B_2((uint8_t)((fcp_spp->orig_process_assoc_valid * BIT_15) |
(fcp_spp->resp_process_assoc_valid * BIT_14) |
- (fcp_spp->establish_image_pair * BIT_13), FPLD + offset);
+ (fcp_spp->establish_image_pair * BIT_13)), FPLD + offset);
/*
* process associator
diff --git a/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c b/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c
index da00160b68..2efb178ff1 100644
--- a/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c
+++ b/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c
@@ -532,8 +532,7 @@ oce_drain_rq_cq(void *arg)
if (dev->function_mode & FLEX10_MODE) {
if (cqe->u0.s.vlan_tag_present &&
cqe->u0.s.qnq) {
- oce_rx_insert_tag(mp,
- cqe->u0.s.vlan_tag);
+ oce_rx_insert_tag(mp, cqe->u0.s.vlan_tag);
}
} else if (cqe->u0.s.vlan_tag_present) {
oce_rx_insert_tag(mp, cqe->u0.s.vlan_tag);
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_82599.c b/usr/src/uts/common/io/ixgbe/ixgbe_82599.c
index d6562dd641..fbc35e018a 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_82599.c
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_82599.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2010, Intel Corporation
+ Copyright (c) 2001-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -423,6 +423,7 @@ enum ixgbe_media_type ixgbe_get_media_type_82599(struct ixgbe_hw *hw)
case IXGBE_DEV_ID_82599_KX4:
case IXGBE_DEV_ID_82599_KX4_MEZZ:
case IXGBE_DEV_ID_82599_COMBO_BACKPLANE:
+ case IXGBE_DEV_ID_82599_KR:
case IXGBE_DEV_ID_82599_BACKPLANE_FCOE:
case IXGBE_DEV_ID_82599_XAUI_LOM:
/* Default device ID is mezzanine card KX/KX4 */
@@ -430,6 +431,9 @@ enum ixgbe_media_type ixgbe_get_media_type_82599(struct ixgbe_hw *hw)
break;
case IXGBE_DEV_ID_82599_SFP:
case IXGBE_DEV_ID_82599_SFP_FCOE:
+ case IXGBE_DEV_ID_82599_SFP_EM:
+ case IXGBE_DEV_ID_82599_SFP_SF2:
+ case IXGBE_DEV_ID_82599EN_SFP:
media_type = ixgbe_media_type_fiber;
break;
case IXGBE_DEV_ID_82599_CX4:
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_api.c b/usr/src/uts/common/io/ixgbe/ixgbe_api.c
index 397655f802..31635f43fd 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_api.c
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_api.c
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2010, Intel Corporation
+ Copyright (c) 2001-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -109,9 +109,13 @@ s32 ixgbe_set_mac_type(struct ixgbe_hw *hw)
case IXGBE_DEV_ID_82599_KX4_MEZZ:
case IXGBE_DEV_ID_82599_XAUI_LOM:
case IXGBE_DEV_ID_82599_COMBO_BACKPLANE:
+ case IXGBE_DEV_ID_82599_KR:
case IXGBE_DEV_ID_82599_SFP:
case IXGBE_DEV_ID_82599_BACKPLANE_FCOE:
case IXGBE_DEV_ID_82599_SFP_FCOE:
+ case IXGBE_DEV_ID_82599_SFP_EM:
+ case IXGBE_DEV_ID_82599_SFP_SF2:
+ case IXGBE_DEV_ID_82599EN_SFP:
case IXGBE_DEV_ID_82599_CX4:
case IXGBE_DEV_ID_82599_T3_LOM:
hw->mac.type = ixgbe_mac_82599EB;
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_type.h b/usr/src/uts/common/io/ixgbe/ixgbe_type.h
index 671a59113f..1f6c88c99d 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_type.h
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_type.h
@@ -1,6 +1,6 @@
/******************************************************************************
- Copyright (c) 2001-2010, Intel Corporation
+ Copyright (c) 2001-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -41,30 +41,34 @@
#define IXGBE_INTEL_VENDOR_ID 0x8086
/* Device IDs */
-#define IXGBE_DEV_ID_82598 0x10B6
-#define IXGBE_DEV_ID_82598_BX 0x1508
-#define IXGBE_DEV_ID_82598AF_DUAL_PORT 0x10C6
-#define IXGBE_DEV_ID_82598AF_SINGLE_PORT 0x10C7
-#define IXGBE_DEV_ID_82598AT 0x10C8
-#define IXGBE_DEV_ID_82598AT2 0x150B
-#define IXGBE_DEV_ID_82598EB_SFP_LOM 0x10DB
-#define IXGBE_DEV_ID_82598EB_CX4 0x10DD
-#define IXGBE_DEV_ID_82598_CX4_DUAL_PORT 0x10EC
-#define IXGBE_DEV_ID_82598_DA_DUAL_PORT 0x10F1
+#define IXGBE_DEV_ID_82598 0x10B6
+#define IXGBE_DEV_ID_82598_BX 0x1508
+#define IXGBE_DEV_ID_82598AF_DUAL_PORT 0x10C6
+#define IXGBE_DEV_ID_82598AF_SINGLE_PORT 0x10C7
+#define IXGBE_DEV_ID_82598AT 0x10C8
+#define IXGBE_DEV_ID_82598AT2 0x150B
+#define IXGBE_DEV_ID_82598EB_SFP_LOM 0x10DB
+#define IXGBE_DEV_ID_82598EB_CX4 0x10DD
+#define IXGBE_DEV_ID_82598_CX4_DUAL_PORT 0x10EC
+#define IXGBE_DEV_ID_82598_DA_DUAL_PORT 0x10F1
#define IXGBE_DEV_ID_82598_SR_DUAL_PORT_EM 0x10E1
-#define IXGBE_DEV_ID_82598EB_XF_LR 0x10F4
-#define IXGBE_DEV_ID_82599_KX4 0x10F7
-#define IXGBE_DEV_ID_82599_KX4_MEZZ 0x1514
+#define IXGBE_DEV_ID_82598EB_XF_LR 0x10F4
+#define IXGBE_DEV_ID_82599_KX4 0x10F7
+#define IXGBE_DEV_ID_82599_KX4_MEZZ 0x1514
+#define IXGBE_DEV_ID_82599_KR 0x1517
#define IXGBE_DEV_ID_82599_COMBO_BACKPLANE 0x10F8
#define IXGBE_SUBDEV_ID_82599_KX4_KR_MEZZ 0x000C
-#define IXGBE_DEV_ID_82599_CX4 0x10F9
-#define IXGBE_DEV_ID_82599_SFP 0x10FB
-#define IXGBE_SUBDEV_ID_82599_SFP 0x11A9
+#define IXGBE_DEV_ID_82599_CX4 0x10F9
+#define IXGBE_DEV_ID_82599_SFP 0x10FB
+#define IXGBE_SUBDEV_ID_82599_SFP 0x11A9
#define IXGBE_DEV_ID_82599_BACKPLANE_FCOE 0x152A
-#define IXGBE_DEV_ID_82599_SFP_FCOE 0x1529
-#define IXGBE_DEV_ID_82599_XAUI_LOM 0x10FC
-#define IXGBE_DEV_ID_82599_T3_LOM 0x151C
-#define IXGBE_DEV_ID_82599_VF 0x10ED
+#define IXGBE_DEV_ID_82599_SFP_FCOE 0x1529
+#define IXGBE_DEV_ID_82599_SFP_EM 0x1507
+#define IXGBE_DEV_ID_82599_SFP_SF2 0x154D
+#define IXGBE_DEV_ID_82599EN_SFP 0x1557
+#define IXGBE_DEV_ID_82599_XAUI_LOM 0x10FC
+#define IXGBE_DEV_ID_82599_T3_LOM 0x151C
+#define IXGBE_DEV_ID_82599_VF 0x10ED
/* General Registers */
#define IXGBE_CTRL 0x00000
diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c
index dc1132941b..58547b7c10 100644
--- a/usr/src/uts/common/io/mac/mac_client.c
+++ b/usr/src/uts/common/io/mac/mac_client.c
@@ -4032,16 +4032,15 @@ mac_info_get(const char *name, mac_info_t *minfop)
/*
* To get the capabilities that MAC layer cares about, such as rings, factory
* mac address, vnic or not, it should directly invoke this function. If the
- * link is part of a bridge, then the only "capability" it has is the inability
- * to do zero copy.
+ * link is part of a bridge, then the link is unable to do zero copy.
*/
boolean_t
i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data)
{
mac_impl_t *mip = (mac_impl_t *)mh;
- if (mip->mi_bridge_link != NULL)
- return (cap == MAC_CAPAB_NO_ZCOPY);
+ if (mip->mi_bridge_link != NULL && cap == MAC_CAPAB_NO_ZCOPY)
+ return (B_TRUE);
else if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB)
return (mip->mi_getcapab(mip->mi_driver, cap, cap_data));
else
diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c
index 38967e5d15..06a5ac8cbf 100644
--- a/usr/src/uts/common/io/mac/mac_sched.c
+++ b/usr/src/uts/common/io/mac/mac_sched.c
@@ -530,12 +530,13 @@ enum pkt_type {
/*
* In general we do port based hashing to spread traffic over different
- * softrings. The below tunable allows to override that behavior. Setting it
- * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior
- * is also the applicable to ipv6 packets carrying multiple optional headers
+ * softrings. The tunables below allow that behavior to be overridden; setting
+ * one to B_TRUE enables fanout based on the src ipv6/ipv4 address. This
+ * behavior is also applicable to ipv6 packets carrying multiple optional headers
* and other uncommon packet types.
*/
boolean_t mac_src_ipv6_fanout = B_FALSE;
+boolean_t mac_src_ipv4_fanout = B_FALSE;
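+
+/*
+ * (Both tunables are intended to be set at boot, e.g. via /etc/system:
+ * "set mac:mac_src_ipv4_fanout = 1", assuming mac_sched.c remains part of
+ * the mac module, rather than toggled while traffic is flowing.)
+ */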
/*
* Pair of local and remote ports in the transport header
@@ -765,13 +766,14 @@ int fanout_unalligned = 0;
/*
* mac_rx_srs_long_fanout
*
- * The fanout routine for IPv6
+ * The fanout routine for IPv6 (and IPv4 when VLANs are in use).
*/
static int
mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
{
ip6_t *ip6h;
+ struct ip *ip4h;
uint8_t *whereptr;
uint_t hash;
uint16_t remlen;
@@ -839,7 +841,7 @@ mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
*/
if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h,
mp->b_wptr, &hdr_len, &nexthdr, NULL)) {
- goto src_based_fanout;
+ goto ipv6_src_based_fanout;
}
whereptr = (uint8_t *)ip6h + hdr_len;
@@ -856,7 +858,7 @@ mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
*/
if (mp->b_cont != NULL &&
whereptr + PORTS_SIZE > mp->b_wptr) {
- goto src_based_fanout;
+ goto ipv6_src_based_fanout;
}
break;
default:
@@ -890,7 +892,85 @@ mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
/* For all other protocol, do source based fanout */
default:
- goto src_based_fanout;
+ goto ipv6_src_based_fanout;
+ }
+ } else if (sap == ETHERTYPE_IP) {
+ boolean_t modifiable = B_TRUE;
+
+ ASSERT(MBLKL(mp) >= hdrsize);
+
+ ip4h = (struct ip *)(mp->b_rptr + hdrsize);
+
+ if ((unsigned char *)ip4h == mp->b_wptr) {
+ /*
+ * The first mblk_t only includes the mac header.
+ * Note that it is safe to change the mp pointer here,
+ * as the subsequent operation does not assume mp
+ * points to the start of the mac header.
+ */
+ mp = mp->b_cont;
+
+ /*
+ * Make sure ip4h holds the full base ip structure
+ * up through the destination address. It might not
+ * hold any of the options though.
+ */
+ if (mp == NULL)
+ return (-1);
+
+ if (MBLKL(mp) < IP_SIMPLE_HDR_LENGTH) {
+ modifiable = (DB_REF(mp) == 1);
+
+ if (modifiable &&
+ !pullupmsg(mp, IP_SIMPLE_HDR_LENGTH))
+ return (-1);
+ }
+
+ ip4h = (struct ip *)mp->b_rptr;
+ }
+
+ if (!modifiable || !(OK_32PTR((char *)ip4h))) {
+ /*
+ * If ip4h is not aligned, fan out to the default ring.
+ * Note that this may cause packet reordering.
+ */
+ *indx = 0;
+ *type = OTH;
+ fanout_unalligned++;
+ return (0);
+ }
+
+ /* Do src based fanout if below tunable is set to B_TRUE. */
+ if (mac_src_ipv4_fanout)
+ goto ipv4_src_based_fanout;
+
+ /* If the transport is TCP, we try to do port based fanout */
+ if (ip4h->ip_p == IPPROTO_TCP) {
+ int hdr_len;
+
+ hdr_len = ip4h->ip_hl << 2;
+ /* set whereptr to point to tcphdr */
+ whereptr = (uint8_t *)ip4h + hdr_len;
+
+ /*
+ * If ip4h does not hold the complete ip header
+ * including options, or if both ports in the TCP
+ * header are not part of this mblk, fall back to
+ * source-based fanout (the second case covers the
+ * first, so we only need one test).
+ */
+ if (mp->b_cont != NULL &&
+ whereptr + PORTS_SIZE > mp->b_wptr)
+ goto ipv4_src_based_fanout;
+
+ hash = HASH_ADDR(ip4h->ip_src.s_addr,
+ *(uint32_t *)whereptr);
+ *indx = COMPUTE_INDEX(hash,
+ mac_srs->srs_tcp_ring_count);
+ *type = OTH;
+ } else {
+ /* For all other protocols, do source based fanout */
+ goto ipv4_src_based_fanout;
}
} else {
*indx = 0;
@@ -898,11 +978,17 @@ mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
}
return (0);
-src_based_fanout:
+ipv6_src_based_fanout:
hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
*indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
*type = OTH;
return (0);
+
+ipv4_src_based_fanout:
+ hash = HASH_ADDR(ip4h->ip_src.s_addr, (uint32_t)0);
+ *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
+ *type = OTH;
+ return (0);
}
/*
diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c
index 501bca39c8..8ec8672eb9 100644
--- a/usr/src/uts/common/io/scsi/targets/sd.c
+++ b/usr/src/uts/common/io/scsi/targets/sd.c
@@ -3502,9 +3502,13 @@ sd_set_mmc_caps(sd_ssc_t *ssc)
* according to the successful response to the page
* 0x2A mode sense request.
*/
- scsi_log(SD_DEVINFO(un), sd_label, CE_WARN,
- "sd_set_mmc_caps: Mode Sense returned "
- "invalid block descriptor length\n");
+ /*
+ * The following warning occurs due to the KVM CD-ROM
+ * mishandling the multi-media commands. Ignore it.
+ * scsi_log(SD_DEVINFO(un), sd_label, CE_WARN,
+ * "sd_set_mmc_caps: Mode Sense returned "
+ * "invalid block descriptor length\n");
+ */
kmem_free(buf, BUFLEN_MODE_CDROM_CAP);
return;
}
diff --git a/usr/src/uts/common/os/bio.c b/usr/src/uts/common/os/bio.c
index 0db01f80d7..c3d04e5508 100644
--- a/usr/src/uts/common/os/bio.c
+++ b/usr/src/uts/common/os/bio.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -1320,6 +1321,9 @@ pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
cpup = CPU; /* get pointer AFTER preemption is disabled */
CPU_STATS_ADDQ(cpup, vm, pgin, 1);
CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
+
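+ /*
+ * Also charge the page-in to the current zone's cumulative VM
+ * counters, alongside the per-CPU stats above.
+ */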
+ atomic_add_64(&curzone->zone_pgpgin, btopr(len));
+
if ((flags & B_ASYNC) == 0) {
klwp_t *lwp = ttolwp(curthread);
if (lwp != NULL)
@@ -1336,13 +1340,19 @@ pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
if (pp != NULL && pp->p_vnode != NULL) {
if (IS_SWAPFSVP(pp->p_vnode)) {
CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
+ atomic_add_64(&curzone->zone_anonpgin,
+ btopr(len));
} else {
if (pp->p_vnode->v_flag & VVMEXEC) {
CPU_STATS_ADDQ(cpup, vm, execpgin,
btopr(len));
+ atomic_add_64(&curzone->zone_execpgin,
+ btopr(len));
} else {
CPU_STATS_ADDQ(cpup, vm, fspgin,
btopr(len));
+ atomic_add_64(&curzone->zone_fspgin,
+ btopr(len));
}
}
}
diff --git a/usr/src/uts/common/os/clock.c b/usr/src/uts/common/os/clock.c
index 451c9db48c..3f4dd63c82 100644
--- a/usr/src/uts/common/os/clock.c
+++ b/usr/src/uts/common/os/clock.c
@@ -23,6 +23,7 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
*/
#include <sys/param.h>
@@ -66,6 +67,7 @@
#include <sys/ddi_timer.h>
#include <sys/random.h>
#include <sys/modctl.h>
+#include <sys/zone.h>
/*
* for NTP support
@@ -1158,6 +1160,10 @@ loadavg_update()
} while ((cpupart = cpupart->cp_next) != cp_list_head);
+ /*
+ * Third pass totals up per-zone statistics.
+ */
+ zone_loadavg_update();
}
/*
diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c
index e097f355ec..7870617a26 100644
--- a/usr/src/uts/common/os/clock_highres.c
+++ b/usr/src/uts/common/os/clock_highres.c
@@ -24,7 +24,9 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright (c) 2012, Joyent Inc. All rights reserved.
+ */
#include <sys/timer.h>
#include <sys/systm.h>
@@ -112,6 +114,25 @@ clock_highres_timer_settime(itimer_t *it, int flags,
cyctime.cyt_when = ts2hrt(&when->it_value);
cyctime.cyt_interval = ts2hrt(&when->it_interval);
+ if (cyctime.cyt_when != 0 && cyctime.cyt_interval == 0 &&
+ it->it_itime.it_interval.tv_sec == 0 &&
+ it->it_itime.it_interval.tv_nsec == 0 &&
+ (cyc = *cycp) != CYCLIC_NONE) {
+ /*
+ * If our existing timer is a one-shot and our new timer is a
+ * one-shot, we'll save ourselves a world of grief and just
+ * reprogram the cyclic.
+ */
+ it->it_itime = *when;
+
+ if (!(flags & TIMER_ABSTIME))
+ cyctime.cyt_when += gethrtime();
+
+ hrt2ts(cyctime.cyt_when, &it->it_itime.it_value);
+ (void) cyclic_reprogram(cyc, cyctime.cyt_when);
+ return (0);
+ }
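+
+ /*
+ * (The reprogram path above deliberately avoids the cyclic_remove()/
+ * cyclic_add() round trip below, and the cpu_lock it requires, which
+ * matters for applications that rearm one-shot timers at high
+ * frequency.)
+ */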
+
mutex_enter(&cpu_lock);
if ((cyc = *cycp) != CYCLIC_NONE) {
cyclic_remove(cyc);
@@ -162,17 +183,14 @@ clock_highres_timer_settime(itimer_t *it, int flags,
if (cyctime.cyt_interval == 0) {
/*
- * If this is a one-shot, then we set the interval to assure
- * that the cyclic will next fire INT64_MAX nanoseconds after
- * boot (which corresponds to over 292 years -- yes, Buck Rogers
- * may have his 292-year-uptime-Solaris box malfunction). If
- * this timer is never touched, this cyclic will simply
- * consume space in the cyclic subsystem. As soon as
+ * If this is a one-shot, then we set the interval to be
+	 * infinite. If this timer is never touched, this cyclic will
+ * simply consume space in the cyclic subsystem. As soon as
* timer_settime() or timer_delete() is called, the cyclic is
* removed (so it's not possible to run the machine out
* of resources by creating one-shots).
*/
- cyctime.cyt_interval = INT64_MAX - cyctime.cyt_when;
+ cyctime.cyt_interval = CY_INFINITY;
}
it->it_itime = *when;
@@ -185,8 +203,6 @@ clock_highres_timer_settime(itimer_t *it, int flags,
if (cyctime.cyt_when != 0)
*cycp = cyc = cyclic_add(&hdlr, &cyctime);
- else
- *cycp = cyc = CYCLIC_NONE;
/*
* Now that we have the cyclic created, we need to bind it to our
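
The new fast path matters for workloads that re-arm a one-shot CLOCK_HIGHRES timer at high frequency: previously every timer_settime() paid for a cyclic_remove()/cyclic_add() pair under cpu_lock. A hypothetical user-land illustration (CLOCK_HIGHRES requires the proc_clock_highres privilege):

#include <signal.h>
#include <string.h>
#include <time.h>

int
main(void)
{
	timer_t tid;
	struct sigevent ev;
	struct itimerspec its;
	int i;

	(void) memset(&ev, 0, sizeof (ev));
	ev.sigev_notify = SIGEV_SIGNAL;
	ev.sigev_signo = SIGUSR1;
	if (timer_create(CLOCK_HIGHRES, &ev, &tid) != 0)
		return (1);

	(void) memset(&its, 0, sizeof (its));
	its.it_value.tv_nsec = 500000;	/* 500us one-shot, no interval */
	for (i = 0; i < 1000; i++) {
		/* A one-shot re-arming a one-shot: reprograms the cyclic. */
		if (timer_settime(tid, 0, &its, NULL) != 0)
			return (1);
	}
	return (0);
}
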
diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c
index a292f4e14f..ebaa6bfe41 100644
--- a/usr/src/uts/common/os/contract.c
+++ b/usr/src/uts/common/os/contract.c
@@ -497,7 +497,7 @@ contract_abandon(contract_t *ct, proc_t *p, int explicit)
contract_t *parent = &p->p_ct_process->conp_contract;
int inherit = 0;
- ASSERT(p == curproc);
+ VERIFY(p == curproc);
mutex_enter(&ct->ct_lock);
@@ -547,7 +547,7 @@ contract_abandon(contract_t *ct, proc_t *p, int explicit)
if (inherit) {
ct->ct_state = CTS_INHERITED;
- ASSERT(ct->ct_regent == parent);
+ VERIFY(ct->ct_regent == parent);
contract_process_take(parent, ct);
/*
@@ -2063,8 +2063,8 @@ cte_copy(ct_equeue_t *q, ct_equeue_t *newq)
{
ct_kevent_t *e, *first = NULL;
- ASSERT(q->ctq_listno == CTEL_CONTRACT);
- ASSERT(newq->ctq_listno == CTEL_PBUNDLE);
+ VERIFY(q->ctq_listno == CTEL_CONTRACT);
+ VERIFY(newq->ctq_listno == CTEL_PBUNDLE);
mutex_enter(&q->ctq_lock);
mutex_enter(&newq->ctq_lock);
@@ -2077,8 +2077,16 @@ cte_copy(ct_equeue_t *q, ct_equeue_t *newq)
if ((e->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
if (first == NULL)
first = e;
- list_insert_tail(&newq->ctq_events, e);
- cte_hold(e);
+ /*
+ * It is possible for adoption to race with an owner's
+ * cte_publish_all(); we must only enqueue events that
+ * have not already been enqueued.
+ */
+ if (!list_link_active((list_node_t *)
+ ((uintptr_t)e + newq->ctq_events.list_offset))) {
+ list_insert_tail(&newq->ctq_events, e);
+ cte_hold(e);
+ }
}
}
@@ -2117,7 +2125,7 @@ cte_trim(ct_equeue_t *q, contract_t *ct)
int flags, stopper;
int start = 1;
- ASSERT(MUTEX_HELD(&q->ctq_lock));
+ VERIFY(MUTEX_HELD(&q->ctq_lock));
for (e = list_head(&q->ctq_events); e != NULL; e = next) {
next = list_next(&q->ctq_events, e);
@@ -2227,13 +2235,24 @@ cte_queue_drain(ct_equeue_t *q, int ack)
* cte_publish_all.
*/
static void
-cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp)
+cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp, boolean_t mayexist)
{
ASSERT(MUTEX_HELD(&q->ctq_lock));
q->ctq_atime = *tsp;
/*
+ * If this event may already exist on this queue, check to see if it
+ * is already there and return if so.
+ */
+ if (mayexist && list_link_active((list_node_t *)((uintptr_t)e +
+ q->ctq_events.list_offset))) {
+ mutex_exit(&q->ctq_lock);
+ cte_rele(e);
+ return;
+ }
+
+ /*
* Don't publish if the event is informative and there aren't
* any listeners, or if the queue has been shut down.
*/
@@ -2247,6 +2266,8 @@ cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp)
/*
* Enqueue event
*/
+ VERIFY(!list_link_active((list_node_t *)
+ ((uintptr_t)e + q->ctq_events.list_offset)));
list_insert_tail(&q->ctq_events, e);
/*
@@ -2318,14 +2339,14 @@ cte_publish_all(contract_t *ct, ct_kevent_t *e, nvlist_t *data, nvlist_t *gdata)
ct->ct_evcnt++;
}
mutex_exit(&ct->ct_lock);
- cte_publish(&ct->ct_events, e, &ts);
+ cte_publish(&ct->ct_events, e, &ts, B_FALSE);
/*
* CTEL_BUNDLE - Next deliver to the contract type's bundle
* queue.
*/
mutex_enter(&ct->ct_type->ct_type_events.ctq_lock);
- cte_publish(&ct->ct_type->ct_type_events, e, &ts);
+ cte_publish(&ct->ct_type->ct_type_events, e, &ts, B_FALSE);
/*
* CTEL_PBUNDLE - Finally, if the contract has an owner,
@@ -2342,7 +2363,14 @@ cte_publish_all(contract_t *ct, ct_kevent_t *e, nvlist_t *data, nvlist_t *gdata)
q = ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index];
mutex_enter(&q->ctq_lock);
mutex_exit(&ct->ct_lock);
- cte_publish(q, e, &ts);
+
+ /*
+	 * It is possible for this code to race with adoption; we
+	 * publish the event noting that it may already be enqueued
+	 * because adoption beat us to it (in which case
+	 * cte_publish() does nothing).
+ */
+ cte_publish(q, e, &ts, B_TRUE);
} else {
mutex_exit(&ct->ct_lock);
cte_rele(e);
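
Both contract.c changes lean on list_link_active(9F) to make enqueueing idempotent: an unlinked list_node_t has NULL pointers, so a second insertion can be detected and skipped. The code above computes the node address by hand via ctq_events.list_offset because the queue is generic; when the embedded node member is known, the pattern reduces to the following sketch (types are hypothetical):

#include <sys/list.h>

typedef struct item {
	list_node_t it_node;
	int it_val;
} item_t;

/* Insert ip at the tail of lp unless it is already linked on a list. */
static void
enqueue_once(list_t *lp, item_t *ip)
{
	if (!list_link_active(&ip->it_node))
		list_insert_tail(lp, ip);
}
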
diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c
index 9e04f631a9..3b3935a772 100644
--- a/usr/src/uts/common/os/core.c
+++ b/usr/src/uts/common/os/core.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, Joyent Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -534,6 +535,10 @@ expand_string(const char *pat, char *fp, int size, cred_t *cr)
case 'z':
s = p->p_zone->zone_name;
break;
+ case 'Z':
+ /* This is zonepath + "/root/", except for GZ */
+ s = p->p_zone->zone_rootpath;
+ break;
case '%':
(void) strcpy((s = buf), "%");
break;
@@ -548,6 +553,9 @@ expand_string(const char *pat, char *fp, int size, cred_t *cr)
if ((size -= len) <= 0)
return (ENAMETOOLONG);
(void) strcpy(fp, s);
+ /* strip trailing "/root/" from non-GZ zonepath string */
+ if (c == 'Z' && len > 6)
+ len -= 6;
fp += len;
}
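
For example (hypothetical values): with a zonepath of /zones/foo, zone_rootpath is /zones/foo/root/, so after the six trailing characters are stripped a core pattern of %Z/cores/core.%f.%p expands to /zones/foo/cores/core.myprog.1234. In the global zone the root path is simply /, which is too short to strip and is used as-is.
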
diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c
index 1ec63249ab..20e57efaad 100644
--- a/usr/src/uts/common/os/cred.c
+++ b/usr/src/uts/common/os/cred.c
@@ -724,6 +724,14 @@ crgetzoneid(const cred_t *cr)
cr->cr_zone->zone_id);
}
+zoneid_t
+crgetzonedid(const cred_t *cr)
+{
+ return (cr->cr_zone == NULL ?
+ (cr->cr_uid == -1 ? (zoneid_t)-1 : GLOBAL_ZONEID) :
+ cr->cr_zone->zone_did);
+}
+
projid_t
crgetprojid(const cred_t *cr)
{
diff --git a/usr/src/uts/common/os/cyclic.c b/usr/src/uts/common/os/cyclic.c
index 1bb6baf445..93a318d260 100644
--- a/usr/src/uts/common/os/cyclic.c
+++ b/usr/src/uts/common/os/cyclic.c
@@ -24,6 +24,10 @@
*/
/*
+ * Copyright (c) 2012, Joyent Inc. All rights reserved.
+ */
+
+/*
* The Cyclic Subsystem
* --------------------
*
@@ -1139,7 +1143,7 @@ top:
CYC_TRACE(cpu, level, "softint-top", cyclics, pc);
while (consndx != pc->cypc_prodndx) {
- int pend, npend, opend;
+ uint32_t pend, npend, opend;
int consmasked = consndx & sizemask;
cyclic_t *cyclic = &cyclics[buf[consmasked]];
cyc_func_t handler = cyclic->cy_handler;
diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c
index b97a09454b..7c5b8323e3 100644
--- a/usr/src/uts/common/os/exit.c
+++ b/usr/src/uts/common/os/exit.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -388,10 +389,16 @@ proc_exit(int why, int what)
if (p->p_pid == z->zone_proc_initpid) {
if (z->zone_boot_err == 0 &&
zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
- zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN &&
- z->zone_restart_init == B_TRUE &&
- restart_init(what, why) == 0)
- return (0);
+ zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) {
+ if (z->zone_restart_init == B_TRUE) {
+ if (restart_init(what, why) == 0)
+ return (0);
+ } else {
+ (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL,
+ CRED());
+ }
+ }
+
/*
* Since we didn't or couldn't restart init, we clear
* the zone's init state and proceed with exit
diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c
index 83b817e866..a5f5a6f3c2 100644
--- a/usr/src/uts/common/os/kstat_fr.c
+++ b/usr/src/uts/common/os/kstat_fr.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/*
@@ -160,6 +161,7 @@ struct {
kstat_named_t avenrun_5min;
kstat_named_t avenrun_15min;
kstat_named_t boot_time;
+ kstat_named_t nsec_per_tick;
} system_misc_kstat = {
{ "ncpus", KSTAT_DATA_UINT32 },
{ "lbolt", KSTAT_DATA_UINT32 },
@@ -171,6 +173,7 @@ struct {
{ "avenrun_5min", KSTAT_DATA_UINT32 },
{ "avenrun_15min", KSTAT_DATA_UINT32 },
{ "boot_time", KSTAT_DATA_UINT32 },
+ { "nsec_per_tick", KSTAT_DATA_UINT32 },
};
struct {
@@ -803,7 +806,6 @@ system_misc_kstat_update(kstat_t *ksp, int rw)
{
int myncpus = ncpus;
int *loadavgp = &avenrun[0];
- int loadavg[LOADAVG_NSTATS];
time_t zone_boot_time;
clock_t zone_lbolt;
hrtime_t zone_hrtime;
@@ -820,17 +822,11 @@ system_misc_kstat_update(kstat_t *ksp, int rw)
*/
mutex_enter(&cpu_lock);
if (pool_pset_enabled()) {
- psetid_t mypsid = zone_pset_get(curproc->p_zone);
- int error;
-
myncpus = zone_ncpus_get(curproc->p_zone);
ASSERT(myncpus > 0);
- error = cpupart_get_loadavg(mypsid, &loadavg[0],
- LOADAVG_NSTATS);
- ASSERT(error == 0);
- loadavgp = &loadavg[0];
}
mutex_exit(&cpu_lock);
+ loadavgp = &curproc->p_zone->zone_avenrun[0];
}
if (INGLOBALZONE(curproc)) {
@@ -838,9 +834,7 @@ system_misc_kstat_update(kstat_t *ksp, int rw)
zone_lbolt = ddi_get_lbolt();
zone_nproc = nproc;
} else {
- struct timeval tvp;
- hrt2tv(curproc->p_zone->zone_zsched->p_mstart, &tvp);
- zone_boot_time = tvp.tv_sec;
+ zone_boot_time = curproc->p_zone->zone_boot_time;
zone_hrtime = gethrtime();
zone_lbolt = (clock_t)(NSEC_TO_TICK(zone_hrtime) -
@@ -861,6 +855,8 @@ system_misc_kstat_update(kstat_t *ksp, int rw)
system_misc_kstat.avenrun_15min.value.ui32 = (uint32_t)loadavgp[2];
system_misc_kstat.boot_time.value.ui32 = (uint32_t)
zone_boot_time;
+ system_misc_kstat.nsec_per_tick.value.ui32 = (uint32_t)
+ nsec_per_tick;
return (0);
}
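
Exporting nsec_per_tick lets user-land convert tick-denominated values such as the per-zone lbolt without hard-coding hz. A hypothetical consumer, using only documented kstat(3KSTAT) interfaces:

#include <kstat.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;

	if ((kc = kstat_open()) == NULL)
		return (1);
	if ((ksp = kstat_lookup(kc, "unix", 0, "system_misc")) == NULL ||
	    kstat_read(kc, ksp, NULL) == -1) {
		(void) kstat_close(kc);
		return (1);
	}
	if ((kn = kstat_data_lookup(ksp, "nsec_per_tick")) != NULL)
		(void) printf("nsec_per_tick = %u\n", kn->value.ui32);
	(void) kstat_close(kc);
	return (0);
}
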
diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c
index f5cebbf82e..63a89a2ce8 100644
--- a/usr/src/uts/common/os/logsubr.c
+++ b/usr/src/uts/common/os/logsubr.c
@@ -248,8 +248,7 @@ log_init(void)
*/
printf("\rSunOS Release %s Version %s %u-bit\n",
utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *));
- printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. "
- "All rights reserved.\n");
+ printf("Copyright (c) 2010-2012, Joyent Inc. All rights reserved.\n");
#ifdef DEBUG
printf("DEBUG enabled\n");
#endif
diff --git a/usr/src/uts/common/os/msacct.c b/usr/src/uts/common/os/msacct.c
index df975eb7ee..c10dce81ca 100644
--- a/usr/src/uts/common/os/msacct.c
+++ b/usr/src/uts/common/os/msacct.c
@@ -33,6 +33,7 @@
#include <sys/debug.h>
#include <sys/msacct.h>
#include <sys/time.h>
+#include <sys/zone.h>
/*
* Mega-theory block comment:
@@ -390,6 +391,7 @@ void
syscall_mstate(int fromms, int toms)
{
kthread_t *t = curthread;
+ zone_t *z = ttozone(t);
struct mstate *ms;
hrtime_t *mstimep;
hrtime_t curtime;
@@ -413,6 +415,10 @@ syscall_mstate(int fromms, int toms)
newtime = curtime - ms->ms_state_start;
}
*mstimep += newtime;
+ if (fromms == LMS_USER)
+ atomic_add_64(&z->zone_utime, newtime);
+ else if (fromms == LMS_SYSTEM)
+ atomic_add_64(&z->zone_stime, newtime);
t->t_mstate = toms;
ms->ms_state_start = curtime;
ms->ms_prev = fromms;
@@ -602,7 +608,10 @@ new_mstate(kthread_t *t, int new_state)
hrtime_t curtime;
hrtime_t newtime;
hrtime_t oldtime;
+ hrtime_t ztime;
+ hrtime_t origstart;
klwp_t *lwp;
+ zone_t *z;
ASSERT(new_state != LMS_WAIT_CPU);
ASSERT((unsigned)new_state < NMSTATES);
@@ -625,6 +634,7 @@ new_mstate(kthread_t *t, int new_state)
ms = &lwp->lwp_mstate;
state = t->t_mstate;
+ origstart = ms->ms_state_start;
do {
switch (state) {
case LMS_TFAULT:
@@ -637,7 +647,7 @@ new_mstate(kthread_t *t, int new_state)
mstimep = &ms->ms_acct[state];
break;
}
- newtime = curtime - ms->ms_state_start;
+ ztime = newtime = curtime - ms->ms_state_start;
if (newtime < 0) {
curtime = gethrtime_unscaled();
oldtime = *mstimep - 1; /* force CAS to fail */
@@ -648,6 +658,20 @@ new_mstate(kthread_t *t, int new_state)
t->t_mstate = new_state;
ms->ms_state_start = curtime;
} while (cas64((uint64_t *)mstimep, oldtime, newtime) != oldtime);
+
+ /*
+ * When the system boots the initial startup thread will have a
+ * ms_state_start of 0 which would add a huge system time to the global
+ * zone. We want to skip aggregating that initial bit of work.
+ */
+ if (origstart != 0) {
+ z = ttozone(t);
+ if (state == LMS_USER)
+ atomic_add_64(&z->zone_utime, ztime);
+ else if (state == LMS_SYSTEM)
+ atomic_add_64(&z->zone_stime, ztime);
+ }
+
/*
* Remember the previous running microstate.
*/
@@ -686,6 +710,8 @@ restore_mstate(kthread_t *t)
hrtime_t waitrq;
hrtime_t newtime;
hrtime_t oldtime;
+ hrtime_t waittime;
+ zone_t *z;
/*
* Don't call restore mstate of threads without lwps. (Kernel threads)
@@ -756,11 +782,15 @@ restore_mstate(kthread_t *t)
oldtime = *mstimep;
newtime += oldtime;
} while (cas64((uint64_t *)mstimep, oldtime, newtime) != oldtime);
+
/*
* Update the WAIT_CPU timer and per-cpu waitrq total.
*/
- ms->ms_acct[LMS_WAIT_CPU] += (curtime - waitrq);
- CPU->cpu_waitrq += (curtime - waitrq);
+ z = ttozone(t);
+ waittime = curtime - waitrq;
+ ms->ms_acct[LMS_WAIT_CPU] += waittime;
+ atomic_add_64(&z->zone_wtime, waittime);
+ CPU->cpu_waitrq += waittime;
ms->ms_state_start = curtime;
}
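
Note that zone_utime, zone_stime and zone_wtime all accumulate unscaled hrtime, since the microstate code uses gethrtime_unscaled() throughout; consumers must scale before treating the values as nanoseconds, as zone_misc_kstat_update() does below. A kernel-side sketch of the conversion:

#include <sys/time.h>

/* Convert an unscaled per-zone time accumulator to nanoseconds. */
static uint64_t
zone_time_ns(hrtime_t unscaled)
{
	hrtime_t t = unscaled;

	scalehrtime(&t);
	return ((uint64_t)t);
}
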
diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c
index 573ebbc367..d8f7882723 100644
--- a/usr/src/uts/common/os/policy.c
+++ b/usr/src/uts/common/os/policy.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2012, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -2563,3 +2564,12 @@ secpolicy_ppp_config(const cred_t *cr)
return (secpolicy_net_config(cr, B_FALSE));
return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL));
}
+
+int
+secpolicy_hyprlofs_control(const cred_t *cr)
+{
+ if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL))
+ return (EPERM);
+ return (0);
+}
+
diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs
index a5a918b326..53617bd0fe 100644
--- a/usr/src/uts/common/os/priv_defs
+++ b/usr/src/uts/common/os/priv_defs
@@ -176,6 +176,10 @@ privilege PRIV_GRAPHICS_MAP
Allows a process to perform privileged mappings through a
graphics device.
+privilege PRIV_HYPRLOFS_CONTROL
+
+ Allows a process to manage hyprlofs entries.
+
privilege PRIV_IPC_DAC_READ
Allows a process to read a System V IPC
diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c
index f0027d18e9..d444def5cc 100644
--- a/usr/src/uts/common/os/vmem.c
+++ b/usr/src/uts/common/os/vmem.c
@@ -1599,7 +1599,7 @@ vmem_destroy(vmem_t *vmp)
leaked = vmem_size(vmp, VMEM_ALLOC);
if (leaked != 0)
- cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s",
+ cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s",
vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ?
"identifiers" : "bytes");
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 79ccd94ae4..4b9dc9fc93 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2012, Joyent Inc. All rights reserved.
*/
/*
@@ -369,8 +370,12 @@ static char *zone_ref_subsys_names[] = {
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
+rctl_hndl_t rc_zone_phys_mem;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
+rctl_hndl_t rc_zone_cpu_baseline;
+rctl_hndl_t rc_zone_cpu_burst_time;
+rctl_hndl_t rc_zone_zfs_io_pri;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
@@ -423,8 +428,9 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
* Version 5 alters the zone_boot system call, and converts its old
* bootargs parameter to be set by the zone_setattr API instead.
* Version 6 adds the flag argument to zone_create.
+ * Version 7 adds the requested zoneid to zone_create.
*/
-static const int ZONE_SYSCALL_API_VERSION = 6;
+static const int ZONE_SYSCALL_API_VERSION = 7;
/*
* Certain filesystems (such as NFS and autofs) need to know which zone
@@ -1380,6 +1386,114 @@ static rctl_ops_t zone_cpu_cap_ops = {
/*ARGSUSED*/
static rctl_qty_t
+zone_cpu_base_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_zone_get_base(p->p_zone));
+}
+
+/*
+ * The zone cpu base is used to set the baseline CPU for the zone
+ * so we can track when the zone is bursting.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ return (cpucaps_zone_set_base(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_base_ops = {
+ rcop_no_action,
+ zone_cpu_base_get,
+ zone_cpu_base_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
+zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (cpucaps_zone_get_burst_time(p->p_zone));
+}
+
+/*
+ * The zone cpu burst time is used to set the amount of time the zone's
+ * CPUs are allowed to spend bursting above the baseline.
+ */
+/*ARGSUSED*/
+static int
+zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ return (cpucaps_zone_set_burst_time(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_burst_time_ops = {
+ rcop_no_action,
+ zone_cpu_burst_time_get,
+ zone_cpu_burst_time_set,
+ rcop_no_test
+};
+
+/*
+ * zone.zfs-io-pri resource control support (IO priority).
+ */
+/*ARGSUSED*/
+static rctl_qty_t
+zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ return (p->p_zone->zone_zfs_io_pri);
+}
+
+/*ARGSUSED*/
+static int
+zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ zone_t *zone = e->rcep_p.zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+ if (zone == NULL)
+ return (0);
+
+ /*
+	 * Set the priority to the new value.
+ */
+ zone->zone_zfs_io_pri = nv;
+ return (0);
+}
+
+static rctl_ops_t zone_zfs_io_pri_ops = {
+ rcop_no_action,
+ zone_zfs_io_pri_get,
+ zone_zfs_io_pri_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
rctl_qty_t nlwps;
@@ -1674,6 +1788,39 @@ static rctl_ops_t zone_max_swap_ops = {
/*ARGSUSED*/
static rctl_qty_t
+zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
+{
+ rctl_qty_t q;
+ zone_t *z = p->p_zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+	/* No extra locking needed: this rctl is not enforced in the kernel */
+ q = z->zone_phys_mem;
+ return (q);
+}
+
+/*ARGSUSED*/
+static int
+zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+{
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+ if (e->rcep_p.zone == NULL)
+ return (0);
+ e->rcep_p.zone->zone_phys_mem_ctl = nv;
+ return (0);
+}
+
+static rctl_ops_t zone_phys_mem_ops = {
+ rcop_no_action,
+ zone_phys_mem_usage,
+ zone_phys_mem_set,
+ rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
{
rctl_qty_t q;
@@ -1767,6 +1914,20 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
}
static int
+zone_physmem_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_kstat_t *zk = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ zk->zk_usage.value.ui64 = zone->zone_phys_mem;
+ zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
+ return (0);
+}
+
+static int
zone_nprocs_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
@@ -1795,7 +1956,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw)
}
static kstat_t *
-zone_kstat_create_common(zone_t *zone, char *name,
+zone_rctl_kstat_create_common(zone_t *zone, char *name,
int (*updatefunc) (kstat_t *, int))
{
kstat_t *ksp;
@@ -1820,26 +1981,350 @@ zone_kstat_create_common(zone_t *zone, char *name,
return (ksp);
}
+static int
+zone_vfs_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_vfs_kstat_t *zvp = ksp->ks_data;
+ kstat_io_t *kiop = &zone->zone_vfs_rwstats;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ /*
+ * Extract the VFS statistics from the kstat_io_t structure used by
+ * kstat_runq_enter() and related functions. Since the slow ops
+ * counters are updated directly by the VFS layer, there's no need to
+ * copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zvp->zv_nread.value.ui64 = kiop->nread;
+ zvp->zv_reads.value.ui64 = kiop->reads;
+ zvp->zv_rtime.value.ui64 = kiop->rtime;
+ zvp->zv_rlentime.value.ui64 = kiop->rlentime;
+ zvp->zv_nwritten.value.ui64 = kiop->nwritten;
+ zvp->zv_writes.value.ui64 = kiop->writes;
+ zvp->zv_wtime.value.ui64 = kiop->wtime;
+ zvp->zv_wlentime.value.ui64 = kiop->wlentime;
+
+ scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
+
+ return (0);
+}
+
+static kstat_t *
+zone_vfs_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_vfs_kstat_t *zvp;
+
+ if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
+ zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
+ sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_vfs_lock;
+ zone->zone_vfs_stats = zvp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
+ kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_vfs_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
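
The rtime/rlentime and wtime/wlentime pairs follow kstat_io semantics: busy time and length-weighted time for the run and wait queues, respectively. By Little's law, the average time an operation spends active between two snapshots is delta(rlentime) / delta(reads + writes). A hypothetical helper:

#include <stdint.h>

/* Average active ("run queue") time per VFS op between two snapshots. */
static uint64_t
zone_vfs_avg_active_ns(uint64_t rlent0, uint64_t ops0,
    uint64_t rlent1, uint64_t ops1)
{
	uint64_t dops = ops1 - ops0;

	return (dops == 0 ? 0 : (rlent1 - rlent0) / dops);
}
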
+
+static int
+zone_zfs_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_zfs_kstat_t *zzp = ksp->ks_data;
+ kstat_io_t *kiop = &zone->zone_zfs_rwstats;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ /*
+ * Extract the ZFS statistics from the kstat_io_t structure used by
+ * kstat_runq_enter() and related functions. Since the I/O throttle
+ * counters are updated directly by the ZFS layer, there's no need to
+ * copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zzp->zz_nread.value.ui64 = kiop->nread;
+ zzp->zz_reads.value.ui64 = kiop->reads;
+ zzp->zz_rtime.value.ui64 = kiop->rtime;
+ zzp->zz_rlentime.value.ui64 = kiop->rlentime;
+ zzp->zz_nwritten.value.ui64 = kiop->nwritten;
+ zzp->zz_writes.value.ui64 = kiop->writes;
+
+ scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
+ scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
+
+ return (0);
+}
+
+static kstat_t *
+zone_zfs_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_zfs_kstat_t *zzp;
+
+ if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
+ zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
+ sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_zfs_lock;
+ zone->zone_zfs_stats = zzp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
+ kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_zfs_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
+
+static int
+zone_mcap_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_mcap_kstat_t *zmp = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
+ zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
+ zmp->zm_swap.value.ui64 = zone->zone_max_swap;
+ zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
+ zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
+ zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
+ zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
+ zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
+ zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
+ zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
+ zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
+ zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle;
+ zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;
+
+ return (0);
+}
+
+static kstat_t *
+zone_mcap_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_mcap_kstat_t *zmp;
+
+ if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
+ zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
+ sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_mcap_lock;
+ zone->zone_mcap_stats = zmp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
+ kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
+ KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_mcap_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
+
+static int
+zone_misc_kstat_update(kstat_t *ksp, int rw)
+{
+ zone_t *zone = ksp->ks_private;
+ zone_misc_kstat_t *zmp = ksp->ks_data;
+ hrtime_t tmp;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ tmp = zone->zone_utime;
+ scalehrtime(&tmp);
+ zmp->zm_utime.value.ui64 = tmp;
+ tmp = zone->zone_stime;
+ scalehrtime(&tmp);
+ zmp->zm_stime.value.ui64 = tmp;
+ tmp = zone->zone_wtime;
+ scalehrtime(&tmp);
+ zmp->zm_wtime.value.ui64 = tmp;
+
+ zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
+ zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
+ zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
+
+ zmp->zm_run_ticks.value.ui64 = zone->zone_run_ticks;
+ zmp->zm_run_wait.value.ui64 = zone->zone_runq_cntr;
+ zmp->zm_fss_shr_pct.value.ui64 = zone->zone_fss_shr_pct;
+ zmp->zm_fss_pri_hi.value.ui64 = zone->zone_fss_pri_hi;
+ zmp->zm_fss_pri_avg.value.ui64 = zone->zone_fss_pri_avg;
+
+ return (0);
+}
+
+static kstat_t *
+zone_misc_kstat_create(zone_t *zone)
+{
+ kstat_t *ksp;
+ zone_misc_kstat_t *zmp;
+
+ if ((ksp = kstat_create_zone("zones", zone->zone_id,
+ zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
+ sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_misc_lock;
+ zone->zone_misc_stats = zmp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
+ kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
+ KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_run_ticks, "run_ticks", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_run_wait, "run_queue", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_fss_shr_pct, "fss_share_percent",
+ KSTAT_DATA_UINT32);
+ kstat_named_init(&zmp->zm_fss_pri_hi, "fss_pri_hi", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_fss_pri_avg, "fss_pri_avg",
+ KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_misc_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+}
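
The avenrun values exported here use the same fixed-point encoding as the system-wide load average, scaled by FSCALE (note the >> (16 - FSHIFT) conversion in zone_loadavg_update() below). A hypothetical reader converting them to a floating-point load:

#include <kstat.h>
#include <stdio.h>
#include <sys/param.h>	/* FSHIFT/FSCALE */

static int
print_zone_load(kstat_ctl_t *kc, int zid, char *zname)
{
	kstat_t *ksp;
	kstat_named_t *kn;

	if ((ksp = kstat_lookup(kc, "zones", zid, zname)) == NULL ||
	    kstat_read(kc, ksp, NULL) == -1)
		return (-1);
	if ((kn = kstat_data_lookup(ksp, "avenrun_1min")) == NULL)
		return (-1);
	(void) printf("%s: 1-minute load %.2f\n", zname,
	    kn->value.ui32 / (double)FSCALE);
	return (0);
}
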
+
static void
zone_kstat_create(zone_t *zone)
{
- zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
+ zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,
"lockedmem", zone_lockedmem_kstat_update);
- zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
+ zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,
"swapresv", zone_swapresv_kstat_update);
- zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
+ zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone,
+ "physicalmem", zone_physmem_kstat_update);
+ zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,
"nprocs", zone_nprocs_kstat_update);
+
+ if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
+ zone->zone_vfs_stats = kmem_zalloc(
+ sizeof (zone_vfs_kstat_t), KM_SLEEP);
+ }
+
+ if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) {
+ zone->zone_zfs_stats = kmem_zalloc(
+ sizeof (zone_zfs_kstat_t), KM_SLEEP);
+ }
+
+ if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
+ zone->zone_mcap_stats = kmem_zalloc(
+ sizeof (zone_mcap_kstat_t), KM_SLEEP);
+ }
+
+ if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
+ zone->zone_misc_stats = kmem_zalloc(
+ sizeof (zone_misc_kstat_t), KM_SLEEP);
+ }
}
static void
-zone_kstat_delete_common(kstat_t **pkstat)
+zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
{
void *data;
if (*pkstat != NULL) {
data = (*pkstat)->ks_data;
kstat_delete(*pkstat);
- kmem_free(data, sizeof (zone_kstat_t));
+ kmem_free(data, datasz);
*pkstat = NULL;
}
}
@@ -1847,9 +2332,23 @@ zone_kstat_delete_common(kstat_t **pkstat)
static void
zone_kstat_delete(zone_t *zone)
{
- zone_kstat_delete_common(&zone->zone_lockedmem_kstat);
- zone_kstat_delete_common(&zone->zone_swapresv_kstat);
- zone_kstat_delete_common(&zone->zone_nprocs_kstat);
+ zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
+ sizeof (zone_kstat_t));
+ zone_kstat_delete_common(&zone->zone_swapresv_kstat,
+ sizeof (zone_kstat_t));
+ zone_kstat_delete_common(&zone->zone_physmem_kstat,
+ sizeof (zone_kstat_t));
+ zone_kstat_delete_common(&zone->zone_nprocs_kstat,
+ sizeof (zone_kstat_t));
+
+ zone_kstat_delete_common(&zone->zone_vfs_ksp,
+ sizeof (zone_vfs_kstat_t));
+ zone_kstat_delete_common(&zone->zone_zfs_ksp,
+ sizeof (zone_zfs_kstat_t));
+ zone_kstat_delete_common(&zone->zone_mcap_ksp,
+ sizeof (zone_mcap_kstat_t));
+ zone_kstat_delete_common(&zone->zone_misc_ksp,
+ sizeof (zone_misc_kstat_t));
}
/*
@@ -1883,6 +2382,8 @@ zone_zsd_init(void)
zone0.zone_locked_mem_ctl = UINT64_MAX;
ASSERT(zone0.zone_max_swap == 0);
zone0.zone_max_swap_ctl = UINT64_MAX;
+ zone0.zone_phys_mem = 0;
+ zone0.zone_phys_mem_ctl = UINT64_MAX;
zone0.zone_max_lofi = 0;
zone0.zone_max_lofi_ctl = UINT64_MAX;
zone0.zone_shmmax = 0;
@@ -1906,7 +2407,13 @@ zone_zsd_init(void)
zone0.zone_initname = initname;
zone0.zone_lockedmem_kstat = NULL;
zone0.zone_swapresv_kstat = NULL;
+ zone0.zone_physmem_kstat = NULL;
zone0.zone_nprocs_kstat = NULL;
+ zone0.zone_zfs_io_pri = 1;
+ zone0.zone_stime = 0;
+ zone0.zone_utime = 0;
+ zone0.zone_wtime = 0;
+
list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
offsetof(zone_ref_t, zref_linkage));
list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
@@ -2013,6 +2520,21 @@ zone_init(void)
RCTL_GLOBAL_INFINITE,
MAXCAP, MAXCAP, &zone_cpu_cap_ops);
+ rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ MAXCAP, MAXCAP, &zone_cpu_base_ops);
+
+ rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ INT_MAX, INT_MAX, &zone_cpu_burst_time_ops);
+
+ rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
+ RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
+ RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
+ 1024, 1024, &zone_zfs_io_pri_ops);
+
rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
INT_MAX, INT_MAX, &zone_lwps_ops);
@@ -2054,6 +2576,20 @@ zone_init(void)
rde = rctl_dict_lookup("zone.cpu-shares");
(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
+ /*
+ * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach
+	 * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority''.
+ */
+ dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
+ bzero(dval, sizeof (rctl_val_t));
+ dval->rcv_value = 1;
+ dval->rcv_privilege = RCPRIV_PRIVILEGED;
+ dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
+ dval->rcv_action_recip_pid = -1;
+
+ rde = rctl_dict_lookup("zone.zfs-io-priority");
+ (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
+
rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
@@ -2064,6 +2600,11 @@ zone_init(void)
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
&zone_max_swap_ops);
+ rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
+ RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
+ RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
+ &zone_phys_mem_ops);
+
rc_zone_max_lofi = rctl_register("zone.max-lofi",
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
@@ -2375,14 +2916,65 @@ zone_set_initname(zone_t *zone, const char *zone_initname)
return (0);
}
+/*
+ * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
+ * to provide the physical memory capping kstats. Since physical memory
+ * capping is currently implemented in userland, that code uses the setattr
+ * entry point to increment the kstats. We increment zone_mcap_nover on
+ * every setattr call, and we add the caller-supplied value to
+ * zone_mcap_pagedout on every call.
+ */
+/*ARGSUSED*/
static int
-zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
+zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
{
- uint64_t mcap;
- int err = 0;
+ zone->zone_mcap_nover++;
+
+ return (0);
+}
+
+static int
+zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
+{
+ uint64_t pageout;
+ int err;
+
+ if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
+ zone->zone_mcap_pagedout += pageout;
- if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
- zone->zone_phys_mcap = mcap;
+ return (err);
+}
+
+/*
+ * The zone_set_page_fault_delay function is used to set the number of usecs
+ * to throttle page faults. This is normally 0 but can be set to a non-0 value
+ * by the user-land memory capping code when the zone is over its physical
+ * memory cap.
+ */
+static int
+zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
+{
+ uint32_t dusec;
+ int err;
+
+ if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0)
+ zone->zone_pg_flt_delay = dusec;
+
+ return (err);
+}
+
+/*
+ * The zone_set_rss function is used to set the zone's RSS when we do the
+ * fast, approximate calculation in user-land.
+ */
+static int
+zone_set_rss(zone_t *zone, const uint64_t *prss)
+{
+ uint64_t rss;
+ int err;
+
+ if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
+ zone->zone_phys_mem = rss;
return (err);
}
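
Together these setattr handlers form the kernel half of the user-land physical memory capper: it periodically computes each zone's RSS, reports it (and any pageout work) back to the kernel, and raises the page-fault delay when a zone is over its cap. A hypothetical user-land fragment, assuming the zone_setattr(2) wrapper and the attribute constants this patch introduces:

#include <zone.h>
#include <sys/types.h>

static int
report_mcap(zoneid_t zid, uint64_t rss, uint64_t pagedout)
{
	if (zone_setattr(zid, ZONE_ATTR_RSS, &rss, sizeof (rss)) != 0)
		return (-1);
	return (zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT, &pagedout,
	    sizeof (pagedout)));
}
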
@@ -2794,6 +3386,12 @@ getzoneid(void)
return (curproc->p_zone->zone_id);
}
+zoneid_t
+getzonedid(void)
+{
+ return (curproc->p_zone->zone_did);
+}
+
/*
* Internal versions of zone_find_by_*(). These don't zone_hold() or
* check the validity of a zone's state.
@@ -2977,6 +3575,92 @@ zone_find_by_path(const char *path)
}
/*
+ * Public interface for updating per-zone load averages. Called once per
+ * second.
+ *
+ * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
+ */
+void
+zone_loadavg_update()
+{
+ zone_t *zp;
+ zone_status_t status;
+ struct loadavg_s *lavg;
+ hrtime_t zone_total;
+ int i;
+ hrtime_t hr_avg;
+ int nrun;
+ static int64_t f[3] = { 135, 27, 9 };
+ int64_t q, r;
+
+ mutex_enter(&zonehash_lock);
+ for (zp = list_head(&zone_active); zp != NULL;
+ zp = list_next(&zone_active, zp)) {
+ mutex_enter(&zp->zone_lock);
+
+ /* Skip zones that are on the way down or not yet up */
+ status = zone_status_get(zp);
+ if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
+ /* For all practical purposes the zone doesn't exist. */
+ mutex_exit(&zp->zone_lock);
+ continue;
+ }
+
+ /*
+ * Update the 10 second moving average data in zone_loadavg.
+ */
+ lavg = &zp->zone_loadavg;
+
+ zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
+ scalehrtime(&zone_total);
+
+ /* The zone_total should always be increasing. */
+ lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
+ zone_total - lavg->lg_total : 0;
+ lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
+ /* lg_total holds the prev. 1 sec. total */
+ lavg->lg_total = zone_total;
+
+ /*
+ * To simplify the calculation, we don't calculate the load avg.
+ * until the zone has been up for at least 10 seconds and our
+ * moving average is thus full.
+ */
+ if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
+ lavg->lg_len++;
+ mutex_exit(&zp->zone_lock);
+ continue;
+ }
+
+ /* Now calculate the 1min, 5min, 15 min load avg. */
+ hr_avg = 0;
+ for (i = 0; i < S_LOADAVG_SZ; i++)
+ hr_avg += lavg->lg_loads[i];
+ hr_avg = hr_avg / S_LOADAVG_SZ;
+ nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
+
+ /* Compute load avg. See comment in calcloadavg() */
+ for (i = 0; i < 3; i++) {
+ q = (zp->zone_hp_avenrun[i] >> 16) << 7;
+ r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
+ zp->zone_hp_avenrun[i] +=
+ ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
+
+ /* avenrun[] can only hold 31 bits of load avg. */
+ if (zp->zone_hp_avenrun[i] <
+ ((uint64_t)1<<(31+16-FSHIFT)))
+ zp->zone_avenrun[i] = (int32_t)
+ (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
+ else
+ zp->zone_avenrun[i] = 0x7fffffff;
+ }
+
+ mutex_exit(&zp->zone_lock);
+ }
+ mutex_exit(&zonehash_lock);
+}
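
In effect this is the classic exponentially damped moving average, computed in fixed point: once per second, avg_i <- avg_i * k_i + nrun * (1 - k_i), where nrun is the zone's 10-second average CPU consumption expressed as an equivalent number of runnable threads (in LGRP_LOADAVG_IN_THREAD_MAX units), and the f[] multipliers with their shifts approximate the decay constants k_i for the 1-, 5- and 15-minute averages, exactly as calcloadavg() does for the system-wide figures.
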
+
+/*
* Get the number of cpus visible to this zone. The system-wide global
* 'ncpus' is returned if pools are disabled, the caller is in the
* global zone, or a NULL zone argument is passed in.
@@ -3789,7 +4473,10 @@ zsched(void *arg)
mutex_enter(&zone_status_lock);
zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
mutex_exit(&zone_status_lock);
+ } else {
+ zone->zone_boot_time = gethrestime_sec();
}
+
pool_unlock();
}
@@ -4081,7 +4768,7 @@ zone_create(const char *zone_name, const char *zone_root,
caddr_t rctlbuf, size_t rctlbufsz,
caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
int match, uint32_t doi, const bslabel_t *label,
- int flags)
+ int flags, zoneid_t zone_did)
{
struct zsched_arg zarg;
nvlist_t *rctls = NULL;
@@ -4104,6 +4791,7 @@ zone_create(const char *zone_name, const char *zone_root,
zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
zoneid = zone->zone_id = id_alloc(zoneid_space);
+ zone->zone_did = zone_did;
zone->zone_status = ZONE_IS_UNINITIALIZED;
zone->zone_pool = pool_default;
zone->zone_pool_mod = gethrtime();
@@ -4172,10 +4860,14 @@ zone_create(const char *zone_name, const char *zone_root,
zone->zone_locked_mem_ctl = UINT64_MAX;
zone->zone_max_swap = 0;
zone->zone_max_swap_ctl = UINT64_MAX;
+ zone->zone_phys_mem = 0;
+ zone->zone_phys_mem_ctl = UINT64_MAX;
zone->zone_max_lofi = 0;
zone->zone_max_lofi_ctl = UINT64_MAX;
- zone0.zone_lockedmem_kstat = NULL;
- zone0.zone_swapresv_kstat = NULL;
+ zone->zone_lockedmem_kstat = NULL;
+ zone->zone_swapresv_kstat = NULL;
+ zone->zone_physmem_kstat = NULL;
+ zone->zone_zfs_io_pri = 1;
/*
* Zsched initializes the rctls.
@@ -4474,6 +5166,7 @@ zone_boot(zoneid_t zoneid)
static int
zone_empty(zone_t *zone)
{
+ int cnt = 0;
int waitstatus;
/*
@@ -4484,7 +5177,16 @@ zone_empty(zone_t *zone)
ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
while ((waitstatus = zone_status_timedwait_sig(zone,
ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
- killall(zone->zone_id);
+ boolean_t force = B_FALSE;
+
+ /* Every 30 seconds, try harder */
+ if (cnt++ >= 30) {
+			cmn_err(CE_WARN, "attempt to force kill zone %d",
+			    zone->zone_id);
+ force = B_TRUE;
+ cnt = 0;
+ }
+ killall(zone->zone_id, force);
}
/*
* return EINTR if we were signaled
@@ -5222,14 +5924,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
error = EFAULT;
}
break;
- case ZONE_ATTR_PHYS_MCAP:
- size = sizeof (zone->zone_phys_mcap);
- if (bufsize > size)
- bufsize = size;
- if (buf != NULL &&
- copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
- error = EFAULT;
- break;
case ZONE_ATTR_SCHED_CLASS:
mutex_enter(&class_lock);
@@ -5284,6 +5978,14 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
}
kmem_free(zbuf, bufsize);
break;
+ case ZONE_ATTR_DID:
+ size = sizeof (zoneid_t);
+ if (bufsize > size)
+ bufsize = size;
+
+ if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
+ error = EFAULT;
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
size = bufsize;
@@ -5315,10 +6017,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
return (set_errno(EPERM));
/*
- * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
- * global zone.
+ * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT
+ * attributes can be set on the global zone.
*/
- if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
+ if (zoneid == GLOBAL_ZONEID &&
+ attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {
return (set_errno(EINVAL));
}
@@ -5335,7 +6038,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
* non-global zones.
*/
zone_status = zone_status_get(zone);
- if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
+ if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
+ attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
+ zone_status > ZONE_IS_READY) {
err = EINVAL;
goto done;
}
@@ -5344,6 +6049,10 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
case ZONE_ATTR_INITNAME:
err = zone_set_initname(zone, (const char *)buf);
break;
+ case ZONE_ATTR_INITNORESTART:
+ zone->zone_restart_init = B_FALSE;
+ err = 0;
+ break;
case ZONE_ATTR_BOOTARGS:
err = zone_set_bootargs(zone, (const char *)buf);
break;
@@ -5353,8 +6062,17 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
case ZONE_ATTR_FS_ALLOWED:
err = zone_set_fs_allowed(zone, (const char *)buf);
break;
- case ZONE_ATTR_PHYS_MCAP:
- err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
+ case ZONE_ATTR_PMCAP_NOVER:
+ err = zone_set_mcap_nover(zone, (const uint64_t *)buf);
+ break;
+ case ZONE_ATTR_PMCAP_PAGEOUT:
+ err = zone_set_mcap_pageout(zone, (const uint64_t *)buf);
+ break;
+ case ZONE_ATTR_PG_FLT_DELAY:
+ err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
+ break;
+ case ZONE_ATTR_RSS:
+ err = zone_set_rss(zone, (const uint64_t *)buf);
break;
case ZONE_ATTR_SCHED_CLASS:
err = zone_set_sched_class(zone, (const char *)buf);
@@ -6075,6 +6793,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
zs.doi = zs32.doi;
zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
zs.flags = zs32.flags;
+ zs.zoneid = zs32.zoneid;
#else
panic("get_udatamodel() returned bogus result\n");
#endif
@@ -6085,7 +6804,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
(caddr_t)zs.rctlbuf, zs.rctlbufsz,
(caddr_t)zs.zfsbuf, zs.zfsbufsz,
zs.extended_error, zs.match, zs.doi,
- zs.label, zs.flags));
+ zs.label, zs.flags, zs.zoneid));
case ZONE_BOOT:
return (zone_boot((zoneid_t)(uintptr_t)arg1));
case ZONE_DESTROY:
@@ -6363,7 +7082,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
* zone_ki_call_zoneadmd() will do a more thorough job of this
* later.
*/
- killall(zone->zone_id);
+ killall(zone->zone_id, B_FALSE);
/*
* Now, create the thread to contact zoneadmd and do the rest of the
* work. This thread can't be created in our zone otherwise
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index ea6c93ff59..87c3d50692 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -20,6 +20,7 @@
#
#
# Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright 2012, Joyent, Inc. All rights reserved.
#
include $(SRC)/uts/Makefile.uts
@@ -278,6 +279,7 @@ CHKHDRS= \
ipc.h \
ipc_impl.h \
ipc_rctl.h \
+ ipmi.h \
isa_defs.h \
iscsi_authclient.h \
iscsi_authclientglue.h \
@@ -856,6 +858,8 @@ FSHDRS= \
hsfs_rrip.h \
hsfs_spec.h \
hsfs_susp.h \
+ hyprlofs.h \
+ hyprlofs_info.h \
lofs_info.h \
lofs_node.h \
mntdata.h \
diff --git a/usr/src/uts/common/sys/buf.h b/usr/src/uts/common/sys/buf.h
index a9191aed7c..cb8a6012fc 100644
--- a/usr/src/uts/common/sys/buf.h
+++ b/usr/src/uts/common/sys/buf.h
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -186,6 +187,7 @@ struct biostats {
#define B_STARTED 0x2000000 /* io:::start probe called for buf */
#define B_ABRWRITE 0x4000000 /* Application based recovery active */
#define B_PAGE_NOWAIT 0x8000000 /* Skip the page if it is locked */
+#define B_INVALCURONLY 0x10000000 /* invalidate only for curproc */
/*
* There is some confusion over the meaning of B_FREE and B_INVAL and what
@@ -198,6 +200,12 @@ struct biostats {
* between the sole use of these two flags. In both cases, IO will be done
* if the page is not yet committed to storage.
*
+ * The B_INVALCURONLY flag modifies the behavior of the B_INVAL flag and is
+ * intended to be used in conjunction with B_INVAL. B_INVALCURONLY has no
+ * meaning on its own. When both B_INVALCURONLY and B_INVAL are set, then
+ * the mapping for the page is only invalidated for the current process.
+ * In this case, the page is not destroyed unless this was the final mapping.
+ *
* In order to discard pages without writing them back, (B_INVAL | B_TRUNC)
* should be used.
*
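
A hypothetical kernel caller of the new flag; B_INVALCURONLY is only meaningful when combined with B_INVAL:

#include <sys/vnode.h>
#include <sys/buf.h>

/*
 * Invalidate the current process's mapping of a file range; pages shared
 * with other processes survive unless ours was the final mapping.
 */
static int
inval_for_curproc(vnode_t *vp, offset_t off, size_t len, cred_t *cr)
{
	return (VOP_PUTPAGE(vp, off, len, B_INVAL | B_INVALCURONLY,
	    cr, NULL));
}
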
diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h
index 6063ff4380..6bc042108c 100644
--- a/usr/src/uts/common/sys/cpucaps.h
+++ b/usr/src/uts/common/sys/cpucaps.h
@@ -22,6 +22,7 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011, 2012, Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_CPUCAPS_H
@@ -84,12 +85,16 @@ extern void cpucaps_zone_remove(zone_t *);
*/
extern int cpucaps_project_set(kproject_t *, rctl_qty_t);
extern int cpucaps_zone_set(zone_t *, rctl_qty_t);
+extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t);
+extern int cpucaps_zone_set_burst_time(zone_t *, rctl_qty_t);
/*
* Get current CPU usage for a project/zone.
*/
extern rctl_qty_t cpucaps_project_get(kproject_t *);
extern rctl_qty_t cpucaps_zone_get(zone_t *);
+extern rctl_qty_t cpucaps_zone_get_base(zone_t *);
+extern rctl_qty_t cpucaps_zone_get_burst_time(zone_t *);
/*
* Scheduling class hooks into CPU caps framework.
diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h
index 95afd21827..2cd4ed644d 100644
--- a/usr/src/uts/common/sys/cpucaps_impl.h
+++ b/usr/src/uts/common/sys/cpucaps_impl.h
@@ -22,6 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011, 2012, Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_CPUCAPS_IMPL_H
@@ -66,8 +67,12 @@ typedef struct cpucap {
waitq_t cap_waitq; /* waitq for capped threads */
kstat_t *cap_kstat; /* cpucaps specific kstat */
int64_t cap_gen; /* zone cap specific */
+ hrtime_t cap_chk_value; /* effective CPU usage cap */
hrtime_t cap_value; /* scaled CPU usage cap */
hrtime_t cap_usage; /* current CPU usage */
+ hrtime_t cap_base; /* base CPU for burst */
+	u_longlong_t cap_burst_limit;	/* max secs (in ticks) for a burst */
+ u_longlong_t cap_bursting; /* # of ticks currently bursting */
disp_lock_t cap_usagelock; /* protects cap_usage above */
/*
* Per cap statistics.
@@ -75,6 +80,7 @@ typedef struct cpucap {
hrtime_t cap_maxusage; /* maximum cap usage */
u_longlong_t cap_below; /* # of ticks spend below the cap */
u_longlong_t cap_above; /* # of ticks spend above the cap */
+ u_longlong_t cap_above_base; /* # of ticks spent above the base */
} cpucap_t;
/*
diff --git a/usr/src/uts/common/sys/cred.h b/usr/src/uts/common/sys/cred.h
index 5056f9a511..914f132dc0 100644
--- a/usr/src/uts/common/sys/cred.h
+++ b/usr/src/uts/common/sys/cred.h
@@ -93,6 +93,7 @@ extern gid_t crgetgid(const cred_t *);
extern gid_t crgetrgid(const cred_t *);
extern gid_t crgetsgid(const cred_t *);
extern zoneid_t crgetzoneid(const cred_t *);
+extern zoneid_t crgetzonedid(const cred_t *);
extern projid_t crgetprojid(const cred_t *);
extern cred_t *crgetmapped(const cred_t *);
diff --git a/usr/src/uts/common/sys/ctf_api.h b/usr/src/uts/common/sys/ctf_api.h
index 17b0b7262e..bef3549e80 100644
--- a/usr/src/uts/common/sys/ctf_api.h
+++ b/usr/src/uts/common/sys/ctf_api.h
@@ -154,6 +154,7 @@ extern void ctf_close(ctf_file_t *);
extern ctf_file_t *ctf_parent_file(ctf_file_t *);
extern const char *ctf_parent_name(ctf_file_t *);
+extern const char *ctf_parent_label(ctf_file_t *);
extern int ctf_import(ctf_file_t *, ctf_file_t *);
extern int ctf_setmodel(ctf_file_t *, int);
diff --git a/usr/src/uts/common/sys/dktp/dadk.h b/usr/src/uts/common/sys/dktp/dadk.h
index f5c990e7c0..2178ad1f0d 100644
--- a/usr/src/uts/common/sys/dktp/dadk.h
+++ b/usr/src/uts/common/sys/dktp/dadk.h
@@ -65,6 +65,8 @@ struct dadk {
kstat_t *dad_errstats; /* error stats */
kmutex_t dad_cmd_mutex;
int dad_cmd_count;
+ uint32_t dad_err_cnt; /* number of recent errors */
+ hrtime_t dad_last_log; /* time of last error log */
};
#define DAD_SECSIZ dad_phyg.g_secsiz
diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h
index fb2a0749d3..303a9c7e45 100644
--- a/usr/src/uts/common/sys/dld.h
+++ b/usr/src/uts/common/sys/dld.h
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_DLD_H
@@ -191,6 +192,7 @@ typedef struct dld_ioc_rename {
datalink_id_t dir_linkid1;
datalink_id_t dir_linkid2;
char dir_link[MAXLINKNAMELEN];
+ boolean_t dir_zoneinit;
} dld_ioc_rename_t;
/*
@@ -203,6 +205,7 @@ typedef struct dld_ioc_rename {
typedef struct dld_ioc_zid {
zoneid_t diz_zid;
datalink_id_t diz_linkid;
+ boolean_t diz_transient;
} dld_ioc_zid_t;
/*
diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h
index 6bd2bbe35a..adcfe76c08 100644
--- a/usr/src/uts/common/sys/dls.h
+++ b/usr/src/uts/common/sys/dls.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_DLS_H
@@ -110,7 +111,7 @@ extern void dls_devnet_close(dls_dl_handle_t);
extern boolean_t dls_devnet_rebuild();
extern int dls_devnet_rename(datalink_id_t, datalink_id_t,
- const char *);
+ const char *, boolean_t);
extern int dls_devnet_create(mac_handle_t, datalink_id_t,
zoneid_t);
extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *,
@@ -127,7 +128,7 @@ extern uint16_t dls_devnet_vid(dls_dl_handle_t);
extern datalink_id_t dls_devnet_linkid(dls_dl_handle_t);
extern int dls_devnet_dev2linkid(dev_t, datalink_id_t *);
extern int dls_devnet_phydev(datalink_id_t, dev_t *);
-extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t);
+extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t, boolean_t);
extern zoneid_t dls_devnet_getzid(dls_dl_handle_t);
extern zoneid_t dls_devnet_getownerzid(dls_dl_handle_t);
extern boolean_t dls_devnet_islinkvisible(datalink_id_t, zoneid_t);
diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h
index 60f51c47b5..8f7af6856c 100644
--- a/usr/src/uts/common/sys/dls_impl.h
+++ b/usr/src/uts/common/sys/dls_impl.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_DLS_IMPL_H
@@ -96,7 +97,8 @@ extern void dls_create_str_kstats(dld_str_t *);
extern int dls_stat_update(kstat_t *, dls_link_t *, int);
extern int dls_stat_create(const char *, int, const char *,
zoneid_t, int (*)(struct kstat *, int), void *,
- kstat_t **);
+ kstat_t **, zoneid_t);
+extern void dls_stat_delete(kstat_t *);
extern int dls_devnet_open_by_dev(dev_t, dls_link_t **,
dls_dl_handle_t *);
diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h
index b4032c24d6..4f73d92118 100644
--- a/usr/src/uts/common/sys/dls_mgmt.h
+++ b/usr/src/uts/common/sys/dls_mgmt.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, Joyent Inc. All rights reserved.
*/
#ifndef _DLS_MGMT_H
@@ -165,6 +166,7 @@ typedef struct dlmgmt_door_getname {
typedef struct dlmgmt_door_getlinkid {
int ld_cmd;
char ld_link[MAXLINKNAMELEN];
+ zoneid_t ld_zoneid;
} dlmgmt_door_getlinkid_t;
typedef struct dlmgmt_door_getnext_s {
diff --git a/usr/src/uts/common/sys/dtrace.h b/usr/src/uts/common/sys/dtrace.h
index c15799a4e4..1411b8baac 100644
--- a/usr/src/uts/common/sys/dtrace.h
+++ b/usr/src/uts/common/sys/dtrace.h
@@ -1017,7 +1017,8 @@ typedef struct dtrace_fmtdesc {
#define DTRACEOPT_AGGSORTREV 24 /* reverse-sort aggregations */
#define DTRACEOPT_AGGSORTPOS 25 /* agg. position to sort on */
#define DTRACEOPT_AGGSORTKEYPOS 26 /* agg. key position to sort on */
-#define DTRACEOPT_MAX 27 /* number of options */
+#define DTRACEOPT_ZONE 27 /* zone in which to enable probes */
+#define DTRACEOPT_MAX 28 /* number of options */
#define DTRACEOPT_UNSET (dtrace_optval_t)-2 /* unset option */
diff --git a/usr/src/uts/common/sys/fs/hyprlofs.h b/usr/src/uts/common/sys/fs/hyprlofs.h
new file mode 100644
index 0000000000..b8c4149df2
--- /dev/null
+++ b/usr/src/uts/common/sys/fs/hyprlofs.h
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_FS_HYPRLOFS_H
+#define _SYS_FS_HYPRLOFS_H
+
+#include <sys/param.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * hyprlofs ioctl numbers.
+ */
+#define HYPRLOFS_IOC ('H' << 8)
+
+#define HYPRLOFS_ADD_ENTRIES (HYPRLOFS_IOC | 1)
+#define HYPRLOFS_RM_ENTRIES (HYPRLOFS_IOC | 2)
+#define HYPRLOFS_RM_ALL (HYPRLOFS_IOC | 3)
+#define HYPRLOFS_GET_ENTRIES (HYPRLOFS_IOC | 4)
+
+typedef struct {
+ char *hle_path;
+ uint_t hle_plen;
+ char *hle_name;
+ uint_t hle_nlen;
+} hyprlofs_entry_t;
+
+typedef struct {
+ hyprlofs_entry_t *hle_entries;
+ uint_t hle_len;
+} hyprlofs_entries_t;
+
+typedef struct {
+ char hce_path[MAXPATHLEN];
+ char hce_name[MAXPATHLEN];
+} hyprlofs_curr_entry_t;
+
+typedef struct {
+ hyprlofs_curr_entry_t *hce_entries;
+ uint_t hce_cnt;
+} hyprlofs_curr_entries_t;
+
+#ifdef _KERNEL
+typedef struct {
+ caddr32_t hle_path;
+ uint_t hle_plen;
+ caddr32_t hle_name;
+ uint_t hle_nlen;
+} hyprlofs_entry32_t;
+
+typedef struct {
+ caddr32_t hle_entries;
+ uint_t hle_len;
+} hyprlofs_entries32_t;
+
+typedef struct {
+ caddr32_t hce_entries;
+ uint_t hce_cnt;
+} hyprlofs_curr_entries32_t;
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_HYPRLOFS_H */
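
The ioctl interface above is consumed from user level by whatever manages a hyprlofs mount. A minimal sketch of adding one mapping follows; the mount point, the backing path, and the assumption that hle_plen/hle_nlen carry strlen() values are illustrative, not taken from this change:

#include <sys/types.h>
#include <sys/fs/hyprlofs.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char path[] = "/tank/data/file1";	/* hypothetical backing file */
	char name[] = "file1";			/* name within the mount */
	hyprlofs_entry_t e;
	hyprlofs_entries_t ents;
	int fd;

	e.hle_path = path;
	e.hle_plen = strlen(path);
	e.hle_name = name;
	e.hle_nlen = strlen(name);
	ents.hle_entries = &e;
	ents.hle_len = 1;

	/* The ioctl is issued against the root of a hyprlofs mount. */
	if ((fd = open("/mnt/hyprlofs", O_RDONLY)) < 0 ||
	    ioctl(fd, HYPRLOFS_ADD_ENTRIES, &ents) != 0) {
		perror("HYPRLOFS_ADD_ENTRIES");
		return (1);
	}
	(void) close(fd);
	return (0);
}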
diff --git a/usr/src/uts/common/sys/fs/hyprlofs_info.h b/usr/src/uts/common/sys/fs/hyprlofs_info.h
new file mode 100644
index 0000000000..29bdadc4e2
--- /dev/null
+++ b/usr/src/uts/common/sys/fs/hyprlofs_info.h
@@ -0,0 +1,189 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_FS_HLOFS_INFO_H
+#define _SYS_FS_HLOFS_INFO_H
+
+#include <sys/t_lock.h>
+#include <vm/seg.h>
+#include <vm/seg_vn.h>
+#include <sys/vfs_opreg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * hlnode is the file system dependent node for hyprlofs.
+ * It is modeled on the tmpfs tmpnode.
+ *
+ * hln_rwlock protects access to the directory list at hln_dir
+ * as well as synchronizing reads and writes to directory hlnodes.
+ * hln_tlock protects updates to hln_mode and hln_nlink.
+ * Acquiring hln_tlock does not require holding any other hlnode locks.
+ */
+typedef struct hlnode {
+ struct hlnode *hln_back; /* linked list of hlnodes */
+ struct hlnode *hln_forw; /* linked list of hlnodes */
+ union {
+ struct {
+ struct hldirent *un_dirlist; /* dirent list */
+ uint_t un_dirents; /* number of dirents */
+ } un_dirstruct;
+ vnode_t *un_realvp; /* real vnode */
+ } un_hlnode;
+ vnode_t *hln_vnode; /* vnode for this hlnode */
+ int hln_gen; /* pseudo gen num for hlfid */
+ int hln_looped; /* flag indicating loopback */
+ vattr_t hln_attr; /* attributes */
+ krwlock_t hln_rwlock; /* rw - serialize mods and */
+ /* directory updates */
+ kmutex_t hln_tlock; /* time, flag, and nlink lock */
+} hlnode_t;
+
+/*
+ * hyprlofs per-mount data structure.
+ * All fields are protected by hlm_contents.
+ */
+typedef struct {
+ vfs_t *hlm_vfsp; /* filesystem's vfs struct */
+ hlnode_t *hlm_rootnode; /* root hlnode */
+ char *hlm_mntpath; /* name of hyprlofs mount point */
+ dev_t hlm_dev; /* unique dev # of mounted `device' */
+ uint_t hlm_gen; /* pseudo generation number for files */
+ kmutex_t hlm_contents; /* lock for hlfsmount structure */
+} hlfsmount_t;
+
+/*
+ * hyprlofs directories are made up of a linked list of hldirent structures
+ * hanging off directory hlnodes. File names are not fixed length,
+ * but are null terminated.
+ */
+typedef struct hldirent {
+ hlnode_t *hld_hlnode; /* hlnode for this file */
+ struct hldirent *hld_next; /* next directory entry */
+ struct hldirent *hld_prev; /* prev directory entry */
+ uint_t hld_offset; /* "offset" of dir entry */
+ uint_t hld_hash; /* a hash of hld_name */
+ struct hldirent *hld_link; /* linked via the hash table */
+ hlnode_t *hld_parent; /* parent, dir we are in */
+ char *hld_name; /* must be null terminated */
+ /* max length is MAXNAMELEN */
+} hldirent_t;
+
+/*
+ * hlfid overlays the fid structure (for VFS_VGET)
+ */
+typedef struct {
+ uint16_t hlfid_len;
+ ino32_t hlfid_ino;
+ int32_t hlfid_gen;
+} hlfid_t;
+
+/*
+ * File system independent to hyprlofs conversion macros
+ */
+#define VFSTOHLM(vfsp) ((hlfsmount_t *)(vfsp)->vfs_data)
+#define VTOHLM(vp) ((hlfsmount_t *)(vp)->v_vfsp->vfs_data)
+#define VTOHLN(vp) ((hlnode_t *)(vp)->v_data)
+#define HLNTOV(tp) ((tp)->hln_vnode)
+#define REALVP(vp) ((vnode_t *)VTOHLN(vp)->hln_realvp)
+#define hlnode_hold(tp) VN_HOLD(HLNTOV(tp))
+#define hlnode_rele(tp) VN_RELE(HLNTOV(tp))
+
+#define hln_dir un_hlnode.un_dirstruct.un_dirlist
+#define hln_dirents un_hlnode.un_dirstruct.un_dirents
+#define hln_realvp un_hlnode.un_realvp
+
+/*
+ * Attributes
+ */
+#define hln_mask hln_attr.va_mask
+#define hln_type hln_attr.va_type
+#define hln_mode hln_attr.va_mode
+#define hln_uid hln_attr.va_uid
+#define hln_gid hln_attr.va_gid
+#define hln_fsid hln_attr.va_fsid
+#define hln_nodeid hln_attr.va_nodeid
+#define hln_nlink hln_attr.va_nlink
+#define hln_size hln_attr.va_size
+#define hln_atime hln_attr.va_atime
+#define hln_mtime hln_attr.va_mtime
+#define hln_ctime hln_attr.va_ctime
+#define hln_rdev hln_attr.va_rdev
+#define hln_blksize hln_attr.va_blksize
+#define hln_nblocks hln_attr.va_nblocks
+#define hln_seq hln_attr.va_seq
+
+#define HL_MUSTHAVE 1
+
+/*
+ * enums
+ */
+enum de_op { DE_CREATE, DE_MKDIR }; /* direnter ops */
+enum dr_op { DR_REMOVE, DR_RMDIR }; /* dirremove ops */
+
+/*
+ * hyprlofs_minfree is the amount (in pages) of anonymous memory that hyprlofs
+ * leaves free for the rest of the system. The default value for
+ * hyprlofs_minfree is btopr(HYPRLOFSMINFREE) but it can be patched to a
+ * different number of pages. Since hyprlofs doesn't actually use much
+ * memory, it's unlikely this ever needs to be patched.
+ */
+#define HYPRLOFSMINFREE (8 * 1024 * 1024) /* 8 Megabytes */
+
+extern size_t hyprlofs_minfree; /* Anonymous memory in pages */
+
+/*
+ * hyprlofs can allocate only a certain percentage of kernel memory,
+ * which is used for hlnodes, directories, file names, etc.
+ * This is statically set to 1/HYPRLOFSMAXFRACKMEM of physical memory.
+ * The actual number of allocatable bytes can be patched in hyprlofs_maxkmem.
+ */
+#define HYPRLOFSMAXFRACKMEM 25 /* 1/25 of physical memory */
+
+extern size_t hyprlofs_kmemspace;
+extern size_t hyprlofs_maxkmem; /* Allocatable kernel memory in bytes */
+
+extern void hyprlofs_node_init(hlfsmount_t *, hlnode_t *, vattr_t *,
+ cred_t *);
+extern int hyprlofs_dirlookup(hlnode_t *, char *, hlnode_t **, cred_t *);
+extern int hyprlofs_dirdelete(hlnode_t *, hlnode_t *, char *, enum dr_op,
+ cred_t *);
+extern void hyprlofs_dirinit(hlnode_t *, hlnode_t *);
+extern void hyprlofs_dirtrunc(hlnode_t *);
+extern void *hyprlofs_memalloc(size_t, int);
+extern void hyprlofs_memfree(void *, size_t);
+extern int hyprlofs_taccess(void *, int, cred_t *);
+extern int hyprlofs_direnter(hlfsmount_t *, hlnode_t *, char *, enum de_op,
+ vnode_t *, vattr_t *, hlnode_t **, cred_t *);
+
+extern struct vnodeops *hyprlofs_vnodeops;
+extern const struct fs_operation_def hyprlofs_vnodeops_template[];
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_HLOFS_INFO_H */
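
To make the conversion macros above concrete, a vnode operation in hyprlofs_vnops.c would translate the vnode to its hlnode and, for a looped (mapped) file, forward to the underlying real vnode. This is only a sketch of the pattern under that assumption, not the actual vnops implementation:

/* Sketch only: shows VTOHLN()/REALVP() usage, not the real vnops code. */
static int
hyprlofs_example_read(vnode_t *vp, struct uio *uiop, int ioflag,
    cred_t *cr, caller_context_t *ct)
{
	hlnode_t *hp = VTOHLN(vp);

	if (hp->hln_looped)	/* mapped file: pass through to real vnode */
		return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct));

	/* Only directories are native hlnodes; a plain read cannot apply. */
	return (EISDIR);
}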
diff --git a/usr/src/uts/common/sys/fss.h b/usr/src/uts/common/sys/fss.h
index 583586fd75..cdb47beb7f 100644
--- a/usr/src/uts/common/sys/fss.h
+++ b/usr/src/uts/common/sys/fss.h
@@ -22,6 +22,7 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_FSS_H
@@ -86,6 +87,7 @@ typedef struct fsspset {
/* on the list */
struct fssproj *fssps_list; /* list of project parts */
struct fsszone *fssps_zones; /* list of fsszone_t's in pset */
+ uint32_t fssps_gen; /* generation for zone's kstats */
} fsspset_t;
/*
@@ -103,6 +105,8 @@ typedef struct fssproj {
/* protected by fssps_displock */
uint32_t fssp_ticks; /* total of all ticks */
/* protected by fssps_displock */
+ uint32_t fssp_zone_ticks; /* unscaled total of all ticks */
+ /* protected by fssps_displock */
fssusage_t fssp_usage; /* this project's decayed usage */
fssusage_t fssp_shusage; /* normalized usage */
struct fssproj *fssp_next; /* next project on this pset */
diff --git a/usr/src/uts/common/sys/ipd.h b/usr/src/uts/common/sys/ipd.h
new file mode 100644
index 0000000000..2838ea1c4f
--- /dev/null
+++ b/usr/src/uts/common/sys/ipd.h
@@ -0,0 +1,82 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * These definitions are private to ipd and ipdadm.
+ */
+
+#ifndef _SYS_IPD_H
+#define _SYS_IPD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define IPD_DEV_PATH "/dev/ipd"
+#define IPD_MAX_DELAY 1000000 /* 1s in us */
+
+typedef struct ipd_ioc_perturb {
+ zoneid_t ipip_zoneid;
+ uint32_t ipip_arg;
+} ipd_ioc_perturb_t;
+
+typedef struct ipd_ioc_info {
+ zoneid_t ipii_zoneid;
+ uint32_t ipii_corrupt;
+ uint32_t ipii_drop;
+ uint32_t ipii_delay;
+} ipd_ioc_info_t;
+
+#ifdef _KERNEL
+
+typedef struct ipd_ioc_list32 {
+ uint_t ipil_nzones;
+ caddr32_t ipil_list;
+} ipd_ioc_list32_t;
+
+#endif /* _KERNEL */
+
+typedef struct ipd_ioc_list {
+ uint_t ipil_nzones;
+ zoneid_t *ipil_list;
+} ipd_ioc_list_t;
+
+#define IPD_CORRUPT 0x1
+#define IPD_DELAY 0x2
+#define IPD_DROP 0x4
+
+#define IPDIOC (('i' << 24) | ('p' << 16) | ('d' << 8))
+#define IPDIOC_CORRUPT (IPDIOC | 1) /* corrupt zone's packets */
+#define IPDIOC_DELAY (IPDIOC | 2) /* delay zone's packets */
+#define IPDIOC_DROP (IPDIOC | 3) /* drop zone's packets */
+#define IPDIOC_LIST (IPDIOC | 4) /* list perturbed zones */
+#define IPDIOC_INFO (IPDIOC | 5) /* get zone's settings */
+#define IPDIOC_REMOVE (IPDIOC | 6) /* remove zone's perturbations */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_IPD_H */
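
These ioctls are the contract between the ipd driver and ipdadm(1M). A hedged user-level sketch of enabling packet drops for one zone follows; the zone ID and percentage are made up, and the exact interpretation of ipip_arg is defined by the driver in ipd.c:

#include <sys/types.h>
#include <sys/ipd.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	ipd_ioc_perturb_t p;
	int fd;

	if ((fd = open(IPD_DEV_PATH, O_RDWR)) < 0) {
		perror("open " IPD_DEV_PATH);
		return (1);
	}

	p.ipip_zoneid = 5;	/* hypothetical zone ID */
	p.ipip_arg = 25;	/* drop probability, per ipd's convention */

	if (ioctl(fd, IPDIOC_DROP, &p) != 0) {
		perror("IPDIOC_DROP");
		return (1);
	}
	(void) close(fd);
	return (0);
}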
diff --git a/usr/src/uts/common/sys/ipmi.h b/usr/src/uts/common/sys/ipmi.h
new file mode 100644
index 0000000000..9dafac407d
--- /dev/null
+++ b/usr/src/uts/common/sys/ipmi.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2006 IronPort Systems Inc. <ambrisko@ironport.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/sys/ipmi.h,v 1.2 2006/09/22 22:11:29 jhb Exp $
+ */
+
+/*
+ * Copyright 2012 Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_IPMI_H_
+#define _SYS_IPMI_H_
+
+#include <sys/types.h>
+#include <sys/ioccom.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define IPMI_MAX_ADDR_SIZE 0x20
+#define IPMI_MAX_RX 1024
+#define IPMI_BMC_SLAVE_ADDR 0x20 /* Linux Default slave address */
+#define IPMI_BMC_CHANNEL 0x0f /* Linux BMC channel */
+
+#define IPMI_BMC_SMS_LUN 0x02
+
+#define IPMI_SYSTEM_INTERFACE_ADDR_TYPE 0x0c
+#define IPMI_IPMB_ADDR_TYPE 0x01
+#define IPMI_IPMB_BROADCAST_ADDR_TYPE 0x41
+
+#define IPMI_IOC_MAGIC 'i'
+#define IPMICTL_RECEIVE_MSG_TRUNC \
+ _IOWR(IPMI_IOC_MAGIC, 11, struct ipmi_recv)
+#define IPMICTL_RECEIVE_MSG \
+ _IOWR(IPMI_IOC_MAGIC, 12, struct ipmi_recv)
+#define IPMICTL_SEND_COMMAND \
+ _IOW(IPMI_IOC_MAGIC, 13, struct ipmi_req)
+#define IPMICTL_REGISTER_FOR_CMD \
+ _IOW(IPMI_IOC_MAGIC, 14, struct ipmi_cmdspec)
+#define IPMICTL_UNREGISTER_FOR_CMD \
+ _IOW(IPMI_IOC_MAGIC, 15, struct ipmi_cmdspec)
+#define IPMICTL_SET_GETS_EVENTS_CMD _IOW(IPMI_IOC_MAGIC, 16, int)
+#define IPMICTL_SET_MY_ADDRESS_CMD _IOW(IPMI_IOC_MAGIC, 17, unsigned int)
+#define IPMICTL_GET_MY_ADDRESS_CMD _IOR(IPMI_IOC_MAGIC, 18, unsigned int)
+#define IPMICTL_SET_MY_LUN_CMD _IOW(IPMI_IOC_MAGIC, 19, unsigned int)
+#define IPMICTL_GET_MY_LUN_CMD _IOR(IPMI_IOC_MAGIC, 20, unsigned int)
+
+#define IPMI_RESPONSE_RECV_TYPE 1
+#define IPMI_ASYNC_EVENT_RECV_TYPE 2
+#define IPMI_CMD_RECV_TYPE 3
+
+#define IPMI_APP_REQUEST 0x06
+#define IPMI_GET_DEVICE_ID 0x01
+#define IPMI_CLEAR_FLAGS 0x30
+#define IPMI_GET_MSG_FLAGS 0x31
+#define IPMI_MSG_AVAILABLE 0x01
+#define IPMI_MSG_BUFFER_FULL 0x02
+#define IPMI_WDT_PRE_TIMEOUT 0x08
+#define IPMI_GET_MSG 0x33
+#define IPMI_SEND_MSG 0x34
+#define IPMI_GET_CHANNEL_INFO 0x42
+#define IPMI_RESET_WDOG 0x22
+#define IPMI_SET_WDOG 0x24
+#define IPMI_GET_WDOG 0x25
+
+#define IPMI_SET_WD_TIMER_SMS_OS 0x04
+#define IPMI_SET_WD_TIMER_DONT_STOP 0x40
+#define IPMI_SET_WD_ACTION_RESET 0x01
+
+struct ipmi_msg {
+ unsigned char netfn;
+ unsigned char cmd;
+ unsigned short data_len;
+ unsigned char *data;
+};
+
+struct ipmi_req {
+ unsigned char *addr;
+ unsigned int addr_len;
+ long msgid;
+ struct ipmi_msg msg;
+};
+
+struct ipmi_recv {
+ int recv_type;
+ unsigned char *addr;
+ unsigned int addr_len;
+ long msgid;
+ struct ipmi_msg msg;
+};
+
+struct ipmi_cmdspec {
+ unsigned char netfn;
+ unsigned char cmd;
+};
+
+struct ipmi_addr {
+ int addr_type;
+ short channel;
+ unsigned char data[IPMI_MAX_ADDR_SIZE];
+};
+
+struct ipmi_system_interface_addr {
+ int addr_type;
+ short channel;
+ unsigned char lun;
+};
+
+struct ipmi_ipmb_addr {
+ int addr_type;
+ short channel;
+ unsigned char slave_addr;
+ unsigned char lun;
+};
+
+#ifdef _KERNEL
+
+#define IPMICTL_RECEIVE_MSG_TRUNC_32 \
+ _IOWR(IPMI_IOC_MAGIC, 11, struct ipmi_recv32)
+#define IPMICTL_RECEIVE_MSG_32 \
+ _IOWR(IPMI_IOC_MAGIC, 12, struct ipmi_recv32)
+#define IPMICTL_SEND_COMMAND_32 \
+ _IOW(IPMI_IOC_MAGIC, 13, struct ipmi_req32)
+
+struct ipmi_msg32 {
+ unsigned char netfn;
+ unsigned char cmd;
+ unsigned short data_len;
+ uint32_t data;
+};
+
+struct ipmi_req32 {
+ uint32_t addr;
+ unsigned int addr_len;
+ int32_t msgid;
+ struct ipmi_msg32 msg;
+};
+
+struct ipmi_recv32 {
+ int recv_type;
+ uint32_t addr;
+ unsigned int addr_len;
+ int32_t msgid;
+ struct ipmi_msg32 msg;
+};
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_IPMI_H_ */
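
The request/response structures above follow the FreeBSD/Linux ipmi(4) model. A sketch of issuing GET DEVICE ID through the system interface is below; the /dev/ipmi0 path is an assumption, and the reply would be collected separately with IPMICTL_RECEIVE_MSG_TRUNC:

#include <sys/ipmi.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct ipmi_system_interface_addr addr;
	struct ipmi_req req;
	int fd;

	if ((fd = open("/dev/ipmi0", O_RDWR)) < 0) {	/* path assumed */
		perror("open");
		return (1);
	}

	addr.addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE;
	addr.channel = IPMI_BMC_CHANNEL;
	addr.lun = 0;

	req.addr = (unsigned char *)&addr;
	req.addr_len = sizeof (addr);
	req.msgid = 1;
	req.msg.netfn = IPMI_APP_REQUEST;
	req.msg.cmd = IPMI_GET_DEVICE_ID;
	req.msg.data_len = 0;
	req.msg.data = NULL;

	if (ioctl(fd, IPMICTL_SEND_COMMAND, &req) != 0) {
		perror("IPMICTL_SEND_COMMAND");
		return (1);
	}
	(void) close(fd);
	return (0);
}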
diff --git a/usr/src/uts/common/sys/mman.h b/usr/src/uts/common/sys/mman.h
index 6c9119e56d..82344607b0 100644
--- a/usr/src/uts/common/sys/mman.h
+++ b/usr/src/uts/common/sys/mman.h
@@ -22,6 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -353,6 +354,7 @@ struct memcntl_mha32 {
#define MS_SYNC 0x4 /* wait for msync */
#define MS_ASYNC 0x1 /* return immediately */
#define MS_INVALIDATE 0x2 /* invalidate caches */
+#define MS_INVALCURPROC 0x8 /* invalidate cache for curproc only */
#if (_POSIX_C_SOURCE <= 2) && !defined(_XPG4_2) || defined(__EXTENSIONS__)
/* functions to mctl */
diff --git a/usr/src/uts/common/sys/mntent.h b/usr/src/uts/common/sys/mntent.h
index e95ef3fccc..d215d88790 100644
--- a/usr/src/uts/common/sys/mntent.h
+++ b/usr/src/uts/common/sys/mntent.h
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012, Joyent, Inc. All rights reserved.
*
* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T
* All Rights Reserved
@@ -47,6 +48,7 @@ extern "C" {
#define MNTTYPE_PCFS "pcfs" /* PC (MSDOS) file system */
#define MNTTYPE_PC MNTTYPE_PCFS /* Deprecated name; use MNTTYPE_PCFS */
#define MNTTYPE_LOFS "lofs" /* Loop back file system */
+#define MNTTYPE_HYPRLOFS "hyprlofs" /* Hyperlofs file system */
#define MNTTYPE_LO MNTTYPE_LOFS /* Deprecated name; use MNTTYPE_LOFS */
#define MNTTYPE_HSFS "hsfs" /* High Sierra (9660) file system */
#define MNTTYPE_SWAP "swap" /* Swap file system */
diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h
index bcd5ba2b4c..819c788b9e 100644
--- a/usr/src/uts/common/sys/policy.h
+++ b/usr/src/uts/common/sys/policy.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2012, Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_POLICY_H
@@ -171,6 +172,7 @@ int secpolicy_setid_setsticky_clear(vnode_t *, vattr_t *,
const vattr_t *, cred_t *);
int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t);
int secpolicy_xvm_control(const cred_t *);
+int secpolicy_hyprlofs_control(const cred_t *);
int secpolicy_basic_exec(const cred_t *, vnode_t *);
int secpolicy_basic_fork(const cred_t *);
diff --git a/usr/src/uts/common/sys/procfs.h b/usr/src/uts/common/sys/procfs.h
index b320836182..12a6925368 100644
--- a/usr/src/uts/common/sys/procfs.h
+++ b/usr/src/uts/common/sys/procfs.h
@@ -62,10 +62,6 @@ extern "C" {
#include <sys/procfs_isa.h>
#include <sys/priv.h>
-#if !defined(_LP64) && _FILE_OFFSET_BITS == 64
-#error "Cannot use procfs in the large file compilation environment"
-#endif
-
/*
* System call interfaces for /proc.
*/
diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h
index 188230d61e..c7f460e7c7 100644
--- a/usr/src/uts/common/sys/thread.h
+++ b/usr/src/uts/common/sys/thread.h
@@ -68,6 +68,8 @@ typedef struct ctxop {
void (*free_op)(void *, int); /* function which frees the context */
void *arg; /* argument to above functions, ctx pointer */
struct ctxop *next; /* next context ops */
+ hrtime_t save_ts; /* timestamp of last save */
+ hrtime_t restore_ts; /* timestamp of last restore */
} ctxop_t;
/*
diff --git a/usr/src/uts/common/sys/uadmin.h b/usr/src/uts/common/sys/uadmin.h
index c35d0a5cfb..6adeb477bb 100644
--- a/usr/src/uts/common/sys/uadmin.h
+++ b/usr/src/uts/common/sys/uadmin.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -157,7 +158,7 @@ extern kmutex_t ualock;
extern void mdboot(int, int, char *, boolean_t);
extern void mdpreboot(int, int, char *);
extern int kadmin(int, int, void *, cred_t *);
-extern void killall(zoneid_t);
+extern void killall(zoneid_t, boolean_t);
#endif
#if defined(__STDC__)
diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h
index 1aa4a8ee6d..97e3430ae2 100644
--- a/usr/src/uts/common/sys/vm_usage.h
+++ b/usr/src/uts/common/sys/vm_usage.h
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_VM_USAGE_H
@@ -79,8 +80,9 @@ extern "C" {
/* zoneid */
#define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */
/* euser */
+#define VMUSAGE_A_ZONE 0x4000 /* rss/swap for a specified zone */
-#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */
+#define VMUSAGE_MASK 0x7fff /* all valid flags for getvmusage() */
typedef struct vmusage {
id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */
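
VMUSAGE_A_ZONE lets a global-zone caller ask getvmusage(2) for a single other zone. Per the vm_getusage() changes later in this diff, the requested zone ID rides in via vmu_id of the first vmusage_t in the caller's buffer, which is why a zero-length buffer is rejected. A hedged sketch, with zone ID 5 purely hypothetical:

#include <sys/vm_usage.h>
#include <stdio.h>

int
main(void)
{
	vmusage_t res[8];
	size_t nres = 8;

	res[0].vmu_id = 5;	/* hypothetical zone ID to query */

	/* age of 10: cached results up to 10 seconds old are acceptable */
	if (getvmusage(VMUSAGE_A_ZONE, 10, res, &nres) != 0) {
		perror("getvmusage");
		return (1);
	}
	if (nres > 0)
		(void) printf("zone %d rss %llu bytes\n",
		    (int)res[0].vmu_id,
		    (unsigned long long)res[0].vmu_rss_all);
	return (0);
}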
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 3ba7bf47f4..11e0622c8d 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#ifndef _SYS_ZONE_H
@@ -94,12 +95,17 @@ extern "C" {
#define ZONE_ATTR_INITNAME 9
#define ZONE_ATTR_BOOTARGS 10
#define ZONE_ATTR_BRAND 11
-#define ZONE_ATTR_PHYS_MCAP 12
+#define ZONE_ATTR_PMCAP_NOVER 12
#define ZONE_ATTR_SCHED_CLASS 13
#define ZONE_ATTR_FLAGS 14
#define ZONE_ATTR_HOSTID 15
#define ZONE_ATTR_FS_ALLOWED 16
#define ZONE_ATTR_NETWORK 17
+#define ZONE_ATTR_DID 18
+#define ZONE_ATTR_PMCAP_PAGEOUT 19
+#define ZONE_ATTR_INITNORESTART 20
+#define ZONE_ATTR_PG_FLT_DELAY 21
+#define ZONE_ATTR_RSS 22
/* Start of the brand-specific attribute namespace */
#define ZONE_ATTR_BRAND_ATTRS 32768
@@ -180,6 +186,7 @@ typedef struct {
uint32_t doi; /* DOI for label */
caddr32_t label; /* label associated with zone */
int flags;
+ zoneid_t zoneid; /* requested zoneid */
} zone_def32;
#endif
typedef struct {
@@ -196,6 +203,7 @@ typedef struct {
uint32_t doi; /* DOI for label */
const bslabel_t *label; /* label associated with zone */
int flags;
+ zoneid_t zoneid; /* requested zoneid */
} zone_def;
/* extended error information */
@@ -240,7 +248,7 @@ typedef enum zone_cmd {
typedef struct zone_cmd_arg {
uint64_t uniqid; /* unique "generation number" */
zone_cmd_t cmd; /* requested action */
- uint32_t _pad; /* need consistent 32/64 bit alignmt */
+ uint32_t debug; /* enable brand hook debug */
char locale[MAXPATHLEN]; /* locale in which to render messages */
char bootbuf[BOOTARGS_MAX]; /* arguments passed to zone_boot() */
} zone_cmd_arg_t;
@@ -320,6 +328,7 @@ typedef struct zone_net_data {
* libraries which may be defining their own versions.
*/
#include <sys/list.h>
+#include <sys/cpuvar.h>
#define GLOBAL_ZONEUNIQID 0 /* uniqid of the global zone */
@@ -367,7 +376,7 @@ typedef struct zone_dataset {
} zone_dataset_t;
/*
- * structure for zone kstats
+ * structure for rctl zone kstats
*/
typedef struct zone_kstat {
kstat_named_t zk_zonename;
@@ -377,6 +386,73 @@ typedef struct zone_kstat {
struct cpucap;
+typedef struct {
+ hrtime_t cycle_start;
+ uint_t cycle_cnt;
+ hrtime_t zone_avg_cnt;
+} sys_zio_cntr_t;
+
+typedef struct {
+ kstat_named_t zv_zonename;
+ kstat_named_t zv_nread;
+ kstat_named_t zv_reads;
+ kstat_named_t zv_rtime;
+ kstat_named_t zv_rlentime;
+ kstat_named_t zv_nwritten;
+ kstat_named_t zv_writes;
+ kstat_named_t zv_wtime;
+ kstat_named_t zv_wlentime;
+ kstat_named_t zv_10ms_ops;
+ kstat_named_t zv_100ms_ops;
+ kstat_named_t zv_1s_ops;
+ kstat_named_t zv_10s_ops;
+ kstat_named_t zv_delay_cnt;
+ kstat_named_t zv_delay_time;
+} zone_vfs_kstat_t;
+
+typedef struct {
+ kstat_named_t zz_zonename;
+ kstat_named_t zz_nread;
+ kstat_named_t zz_reads;
+ kstat_named_t zz_rtime;
+ kstat_named_t zz_rlentime;
+ kstat_named_t zz_nwritten;
+ kstat_named_t zz_writes;
+ kstat_named_t zz_waittime;
+} zone_zfs_kstat_t;
+
+typedef struct {
+ kstat_named_t zm_zonename;
+ kstat_named_t zm_rss;
+ kstat_named_t zm_phys_cap;
+ kstat_named_t zm_swap;
+ kstat_named_t zm_swap_cap;
+ kstat_named_t zm_nover;
+ kstat_named_t zm_pagedout;
+ kstat_named_t zm_pgpgin;
+ kstat_named_t zm_anonpgin;
+ kstat_named_t zm_execpgin;
+ kstat_named_t zm_fspgin;
+ kstat_named_t zm_anon_alloc_fail;
+ kstat_named_t zm_pf_throttle;
+ kstat_named_t zm_pf_throttle_usec;
+} zone_mcap_kstat_t;
+
+typedef struct {
+ kstat_named_t zm_zonename; /* full name, kstat truncates name */
+ kstat_named_t zm_utime;
+ kstat_named_t zm_stime;
+ kstat_named_t zm_wtime;
+ kstat_named_t zm_avenrun1;
+ kstat_named_t zm_avenrun5;
+ kstat_named_t zm_avenrun15;
+ kstat_named_t zm_run_ticks;
+ kstat_named_t zm_run_wait;
+ kstat_named_t zm_fss_shr_pct;
+ kstat_named_t zm_fss_pri_hi;
+ kstat_named_t zm_fss_pri_avg;
+} zone_misc_kstat_t;
+
typedef struct zone {
/*
* zone_name is never modified once set.
@@ -416,6 +492,7 @@ typedef struct zone {
*/
list_node_t zone_linkage;
zoneid_t zone_id; /* ID of zone */
+ zoneid_t zone_did; /* persistent debug ID of zone */
uint_t zone_ref; /* count of zone_hold()s on zone */
uint_t zone_cred_ref; /* count of zone_hold_cred()s on zone */
/*
@@ -471,7 +548,7 @@ typedef struct zone {
char *zone_initname; /* fs path to 'init' */
int zone_boot_err; /* for zone_boot() if boot fails */
char *zone_bootargs; /* arguments passed via zone_boot() */
- uint64_t zone_phys_mcap; /* physical memory cap */
+ rctl_qty_t zone_phys_mem_ctl; /* current phys. memory limit */
/*
* zone_kthreads is protected by zone_status_lock.
*/
@@ -490,6 +567,9 @@ typedef struct zone {
hrtime_t zone_pool_mod; /* last pool bind modification time */
/* zone_psetid is protected by cpu_lock */
psetid_t zone_psetid; /* pset the zone is bound to */
+
+ time_t zone_boot_time; /* Similar to boot_time */
+
/*
* The following two can be read without holding any locks. They are
* updated under cpu_lock.
@@ -517,6 +597,37 @@ typedef struct zone {
list_t zone_dl_list;
netstack_t *zone_netstack;
struct cpucap *zone_cpucap; /* CPU caps data */
+
+ /*
+ * Data and counters used for ZFS fair-share disk IO.
+ */
+ rctl_qty_t zone_zfs_io_pri; /* ZFS IO priority */
+ uint_t zone_zfs_queued; /* enqueued count */
+ uint64_t zone_zfs_weight; /* used to prevent starvation */
+ uint64_t zone_io_util; /* IO utilization metric */
+ boolean_t zone_io_util_above_avg; /* IO util percent > avg. */
+ uint16_t zone_io_delay; /* IO delay on logical r/w */
+ kmutex_t zone_stg_io_lock; /* protects IO window data */
+ sys_zio_cntr_t zone_rd_ops; /* Counters for ZFS reads, */
+ sys_zio_cntr_t zone_wr_ops; /* writes and logical writes. */
+ sys_zio_cntr_t zone_lwr_ops;
+
+ /*
+ * kstats and counters for VFS ops and bytes.
+ */
+ kmutex_t zone_vfs_lock; /* protects VFS statistics */
+ kstat_t *zone_vfs_ksp;
+ kstat_io_t zone_vfs_rwstats;
+ zone_vfs_kstat_t *zone_vfs_stats;
+
+ /*
+ * kstats for ZFS I/O ops and bytes.
+ */
+ kmutex_t zone_zfs_lock; /* protects ZFS statistics */
+ kstat_t *zone_zfs_ksp;
+ kstat_io_t zone_zfs_rwstats;
+ zone_zfs_kstat_t *zone_zfs_stats;
+
/*
* Solaris Auditing per-zone audit context
*/
@@ -534,6 +645,64 @@ typedef struct zone {
rctl_qty_t zone_nprocs_ctl; /* current limit protected by */
/* zone_rctls->rcs_lock */
kstat_t *zone_nprocs_kstat;
+
+ /*
+ * kstats and counters for physical memory capping.
+ */
+ rctl_qty_t zone_phys_mem; /* current bytes of phys. mem. (RSS) */
+ kstat_t *zone_physmem_kstat;
+ uint64_t zone_mcap_nover; /* # of times over phys. cap */
+ uint64_t zone_mcap_pagedout; /* bytes of mem. paged out */
+ kmutex_t zone_mcap_lock; /* protects mcap statistics */
+ kstat_t *zone_mcap_ksp;
+ zone_mcap_kstat_t *zone_mcap_stats;
+ uint64_t zone_pgpgin; /* pages paged in */
+ uint64_t zone_anonpgin; /* anon pages paged in */
+ uint64_t zone_execpgin; /* exec pages paged in */
+ uint64_t zone_fspgin; /* fs pages paged in */
+ uint64_t zone_anon_alloc_fail; /* cnt of anon alloc fails */
+ uint64_t zone_pf_throttle; /* cnt of page flt throttles */
+ uint64_t zone_pf_throttle_usec; /* time of page flt throttles */
+
+ /* Num usecs to throttle page fault when zone is over phys. mem cap */
+ uint32_t zone_pg_flt_delay;
+
+ /*
+ * Misc. kstats and counters for zone cpu-usage aggregation.
+ * The zone_Xtime values are the sum of the micro-state accounting
+ * values for all threads that are running or have run in the zone.
+ * This is tracked in msacct.c as threads change state.
+ * The zone_stime is the sum of the LMS_SYSTEM times.
+ * The zone_utime is the sum of the LMS_USER times.
+ * The zone_wtime is the sum of the LMS_WAIT_CPU times.
+ * As with per-thread micro-state accounting values, these values are
+ * not scaled to nanosecs. The scaling is done by the
+ * zone_misc_kstat_update function when kstats are requested.
+ */
+ kmutex_t zone_misc_lock; /* protects misc statistics */
+ kstat_t *zone_misc_ksp;
+ zone_misc_kstat_t *zone_misc_stats;
+ uint64_t zone_stime; /* total system time */
+ uint64_t zone_utime; /* total user time */
+ uint64_t zone_wtime; /* total time waiting in runq */
+
+ struct loadavg_s zone_loadavg; /* loadavg for this zone */
+ uint64_t zone_hp_avenrun[3]; /* high-precision avenrun */
+ int zone_avenrun[3]; /* FSCALED avg. run queue len */
+
+ /*
+ * FSS stats updated once per second by fss_decay_usage.
+ * zone_runq_cntr is an instantaneous accumulation of the number of
+ * processes in the run queue per project and is not computed over the
+ * one second interval.
+ */
+ uint32_t zone_fss_gen; /* FSS generation cntr */
+ uint32_t zone_proc_cnt; /* FSS process cntr */
+ uint64_t zone_run_ticks; /* tot # of ticks running */
+ uint64_t zone_runq_cntr; /* tot # of procs in runq */
+ uint32_t zone_fss_shr_pct; /* fss active shr % in intvl */
+ uint64_t zone_fss_pri_hi; /* fss high pri this interval */
+ uint64_t zone_fss_pri_avg; /* fss avg pri this interval */
} zone_t;
/*
@@ -566,9 +735,11 @@ extern zone_t *zone_find_by_name(char *);
extern zone_t *zone_find_by_any_path(const char *, boolean_t);
extern zone_t *zone_find_by_path(const char *);
extern zoneid_t getzoneid(void);
+extern zoneid_t getzonedid(void);
extern zone_t *zone_find_by_id_nolock(zoneid_t);
extern int zone_datalink_walk(zoneid_t, int (*)(datalink_id_t, void *), void *);
extern int zone_check_datalink(zoneid_t *, datalink_id_t);
+extern void zone_loadavg_update();
/*
* Zone-specific data (ZSD) APIs
@@ -759,6 +930,7 @@ extern int zone_walk(int (*)(zone_t *, void *), void *);
extern rctl_hndl_t rc_zone_locked_mem;
extern rctl_hndl_t rc_zone_max_swap;
+extern rctl_hndl_t rc_zone_phys_mem;
extern rctl_hndl_t rc_zone_max_lofi;
#endif /* _KERNEL */
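
The new per-zone structures above are surfaced as named kstats, so user level reads them through libkstat (compile with -lkstat). The module name "zone_misc" and the statistic name "avenrun_1min" below are assumptions about how zone.c registers these kstats:

#include <kstat.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;

	if ((kc = kstat_open()) == NULL)
		return (1);

	/* instance 5 is a hypothetical zone ID */
	if ((ksp = kstat_lookup(kc, "zone_misc", 5, NULL)) == NULL ||
	    kstat_read(kc, ksp, NULL) == -1) {
		(void) kstat_close(kc);
		return (1);
	}

	if ((kn = kstat_data_lookup(ksp, "avenrun_1min")) != NULL)
		(void) printf("avenrun_1min: %u\n", kn->value.ui32);

	(void) kstat_close(kc);
	return (0);
}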
diff --git a/usr/src/uts/common/syscall/getloadavg.c b/usr/src/uts/common/syscall/getloadavg.c
index c669f9b8ba..0f44064e90 100644
--- a/usr/src/uts/common/syscall/getloadavg.c
+++ b/usr/src/uts/common/syscall/getloadavg.c
@@ -22,10 +22,9 @@
/*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/errno.h>
@@ -41,7 +40,6 @@ int
getloadavg(int *buf, int nelem)
{
int *loadbuf = &avenrun[0];
- int loadavg[LOADAVG_NSTATS];
int error;
if (nelem < 0)
@@ -50,15 +48,7 @@ getloadavg(int *buf, int nelem)
nelem = LOADAVG_NSTATS;
if (!INGLOBALZONE(curproc)) {
- mutex_enter(&cpu_lock);
- if (pool_pset_enabled()) {
- psetid_t psetid = zone_pset_get(curproc->p_zone);
-
- error = cpupart_get_loadavg(psetid, &loadavg[0], nelem);
- ASSERT(error == 0); /* pset isn't going anywhere */
- loadbuf = &loadavg[0];
- }
- mutex_exit(&cpu_lock);
+ loadbuf = &curproc->p_zone->zone_avenrun[0];
}
error = copyout(loadbuf, buf, nelem * sizeof (avenrun[0]));
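
With this change, getloadavg(3C) inside a non-global zone reports that zone's own zone_avenrun values (maintained by zone_loadavg_update()) rather than the processor-set averages. The caller-side view is unchanged; a minimal sketch:

#include <sys/loadavg.h>
#include <stdio.h>

int
main(void)
{
	double avg[LOADAVG_NSTATS];

	/* In a non-global zone these now come from zone_avenrun[]. */
	if (getloadavg(avg, LOADAVG_NSTATS) == -1)
		return (1);

	(void) printf("1m %.2f  5m %.2f  15m %.2f\n",
	    avg[LOADAVG_1MIN], avg[LOADAVG_5MIN], avg[LOADAVG_15MIN]);
	return (0);
}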
diff --git a/usr/src/uts/common/syscall/memcntl.c b/usr/src/uts/common/syscall/memcntl.c
index 1ab3a8b65e..63c8b64ad0 100644
--- a/usr/src/uts/common/syscall/memcntl.c
+++ b/usr/src/uts/common/syscall/memcntl.c
@@ -21,6 +21,7 @@
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -116,13 +117,17 @@ memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask)
* MS_SYNC used to be defined to be zero but is now non-zero.
* For binary compatibility we still accept zero
* (the absence of MS_ASYNC) to mean the same thing.
+ * Binary compatibility is not an issue for MS_INVALCURPROC.
*/
iarg = (uintptr_t)arg;
if ((iarg & ~MS_INVALIDATE) == 0)
iarg |= MS_SYNC;
- if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) ||
- ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) {
+ if (((iarg &
+ ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) != 0) ||
+ ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC)) ||
+ ((iarg & (MS_INVALIDATE|MS_INVALCURPROC)) ==
+ (MS_INVALIDATE|MS_INVALCURPROC))) {
error = set_errno(EINVAL);
} else {
error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0);
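
From user level the new flag is reached through msync(3C). A hedged sketch: MS_INVALCURPROC behaves like MS_INVALIDATE but invalidates only the calling process's mappings, and per the check above it cannot be combined with MS_INVALIDATE:

#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 8192;
	void *p;
	int fd;

	if ((fd = open("/tmp/example", O_RDWR | O_CREAT, 0644)) < 0 ||
	    ftruncate(fd, len) != 0)
		return (1);

	if ((p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
	    fd, 0)) == MAP_FAILED)
		return (1);

	/* Drop only this process's cached view of the file's pages. */
	if (msync(p, len, MS_ASYNC | MS_INVALCURPROC) != 0)
		perror("msync");

	(void) munmap(p, len);
	(void) close(fd);
	return (0);
}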
diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c
index 471c66ff32..e68f640045 100644
--- a/usr/src/uts/common/syscall/sysconfig.c
+++ b/usr/src/uts/common/syscall/sysconfig.c
@@ -22,6 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -158,8 +159,8 @@ sysconfig(int which)
* even though rcapd can be used on the global zone too.
*/
if (!INGLOBALZONE(curproc) &&
- curproc->p_zone->zone_phys_mcap != 0)
- return (MIN(btop(curproc->p_zone->zone_phys_mcap),
+ curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX)
+ return (MIN(btop(curproc->p_zone->zone_phys_mem_ctl),
physinstalled));
return (physinstalled);
@@ -167,26 +168,23 @@ sysconfig(int which)
case _CONFIG_AVPHYS_PAGES:
/*
* If the non-global zone has a phys. memory cap, use
- * the phys. memory cap - zone's current rss. We always
+ * the phys. memory cap - zone's rss. We always
* report the system-wide value for the global zone, even
- * though rcapd can be used on the global zone too.
+ * though memory capping can be used on the global zone too.
+ * We use the cached value for the RSS since vm_getusage()
+ * is so expensive and we don't need this value to be exact.
*/
if (!INGLOBALZONE(curproc) &&
- curproc->p_zone->zone_phys_mcap != 0) {
+ curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) {
pgcnt_t cap, rss, free;
- vmusage_t in_use;
- size_t cnt = 1;
- cap = btop(curproc->p_zone->zone_phys_mcap);
+ cap = btop(curproc->p_zone->zone_phys_mem_ctl);
if (cap > physinstalled)
return (freemem);
- if (vm_getusage(VMUSAGE_ZONE, 1, &in_use, &cnt,
- FKIOCTL) != 0)
- in_use.vmu_rss_all = 0;
- rss = btop(in_use.vmu_rss_all);
+ rss = btop(curproc->p_zone->zone_phys_mem);
/*
- * Because rcapd implements a soft cap, it is possible
+ * Because this is a soft cap, it is possible
* for rss to be temporarily over the cap.
*/
if (cap > rss)
diff --git a/usr/src/uts/common/syscall/uadmin.c b/usr/src/uts/common/syscall/uadmin.c
index 1bdfbbfd0b..dbff1b637c 100644
--- a/usr/src/uts/common/syscall/uadmin.c
+++ b/usr/src/uts/common/syscall/uadmin.c
@@ -22,6 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
#include <sys/param.h>
@@ -76,7 +77,7 @@ volatile int fastreboot_dryrun = 0;
* system with many zones.
*/
void
-killall(zoneid_t zoneid)
+killall(zoneid_t zoneid, boolean_t force)
{
proc_t *p;
@@ -106,7 +107,7 @@ killall(zoneid_t zoneid)
p->p_stat != SIDL &&
p->p_stat != SZOMB) {
mutex_enter(&p->p_lock);
- if (sigismember(&p->p_sig, SIGKILL)) {
+ if (!force && sigismember(&p->p_sig, SIGKILL)) {
mutex_exit(&p->p_lock);
p = p->p_next;
} else {
@@ -243,7 +244,7 @@ kadmin(int cmd, int fcn, void *mdep, cred_t *credp)
*/
zone_shutdown_global();
- killall(ALL_ZONES);
+ killall(ALL_ZONES, B_FALSE);
/*
* If we are calling kadmin() from a kernel context then we
* do not release these resources.
diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h
index 1d91475e38..156b810046 100644
--- a/usr/src/uts/common/vm/hat.h
+++ b/usr/src/uts/common/vm/hat.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012 Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -460,6 +461,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t);
*/
#define HAT_ADV_PGUNLOAD 0x00
#define HAT_FORCE_PGUNLOAD 0x01
+#define HAT_CURPROC_PGUNLOAD 0x02
/*
* Attributes for hat_page_*attr, hat_setstats and
diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c
index 31c293d416..5f106f6c06 100644
--- a/usr/src/uts/common/vm/seg_vn.c
+++ b/usr/src/uts/common/vm/seg_vn.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2012, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -7254,7 +7255,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
vpp = svd->vpage;
offset = svd->offset + (uintptr_t)(addr - seg->s_base);
bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) |
- ((flags & MS_INVALIDATE) ? B_INVAL : 0);
+ ((flags & MS_INVALIDATE) ? B_INVAL : 0) |
+ ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0);
if (attr) {
pageprot = attr & ~(SHARED|PRIVATE);
@@ -7279,11 +7281,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
vpp = &svd->vpage[seg_page(seg, addr)];
} else if (svd->vp && svd->amp == NULL &&
- (flags & MS_INVALIDATE) == 0) {
+ (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) {
/*
- * No attributes, no anonymous pages and MS_INVALIDATE flag
- * is not on, just use one big request.
+ * No attributes, no anonymous pages, and no MS_INVAL* flags
+ * set; just use one big request.
*/
err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
bflags, svd->cred, NULL);
@@ -7335,7 +7337,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
* might race in and lock the page after we unlock and before
* we do the PUTPAGE, then PUTPAGE simply does nothing.
*/
- if (flags & MS_INVALIDATE) {
+ if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) {
if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
page_unlock(pp);
diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c
index fdf9f7790c..f30ba7ef2e 100644
--- a/usr/src/uts/common/vm/vm_anon.c
+++ b/usr/src/uts/common/vm/vm_anon.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -792,6 +793,7 @@ anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard)
mutex_enter(&p->p_lock);
if (rctl_incr_swap(p, zone, ptob(npages)) != 0) {
mutex_exit(&p->p_lock);
+ atomic_add_64(&zone->zone_anon_alloc_fail, 1);
return (0);
}
mutex_exit(&p->p_lock);
diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c
index 01ad32e0b1..8caa257486 100644
--- a/usr/src/uts/common/vm/vm_as.c
+++ b/usr/src/uts/common/vm/vm_as.c
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2012, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -56,6 +57,7 @@
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>
+#include <sys/ddi.h>
#include <vm/hat.h>
#include <vm/xhat.h>
@@ -879,6 +881,7 @@ as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
struct seg *segsav;
int as_lock_held;
klwp_t *lwp = ttolwp(curthread);
+ zone_t *zonep = curzone;
int is_xhat = 0;
int holding_wpage = 0;
extern struct seg_ops segdev_ops;
@@ -928,6 +931,23 @@ retry:
if (as == &kas)
CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
CPU_STATS_EXIT_K();
+ if (zonep->zone_pg_flt_delay != 0) {
+ /*
+ * The zone in which this process is running
+ * is currently over it's physical memory cap.
+ * Throttle page faults to help the user-land
+ * memory capper catch up. Note that
+ * drv_usectohz() rounds up.
+ */
+ atomic_add_64(&zonep->zone_pf_throttle, 1);
+ atomic_add_64(&zonep->zone_pf_throttle_usec,
+ zonep->zone_pg_flt_delay);
+ if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1))
+ drv_usecwait(zonep->zone_pg_flt_delay);
+ else
+ delay(drv_usectohz(
+ zonep->zone_pg_flt_delay));
+ }
break;
}
}
diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c
index 7233581227..39ace0b3c2 100644
--- a/usr/src/uts/common/vm/vm_pvn.c
+++ b/usr/src/uts/common/vm/vm_pvn.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -431,7 +432,14 @@ pvn_write_done(page_t *plist, int flags)
page_io_unlock(pp);
page_unlock(pp);
}
- } else if (flags & B_INVAL) {
+ } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
+ /*
+ * If B_INVALCURONLY is set, then we handle that case
+ * in the next conditional if hat_page_is_mapped()
+ * indicates that there are no additional mappings
+ * to the page.
+ */
+
/*
* XXX - Failed writes with B_INVAL set are
* not handled appropriately.
@@ -572,8 +580,9 @@ pvn_write_done(page_t *plist, int flags)
}
/*
- * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
- * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
+ * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE,
+ * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}.
+ * B_DELWRI indicates that this page is part of a kluster
* operation and is only to be considered if it doesn't involve any
* waiting here. B_TRUNC indicates that the file is being truncated
* and so no i/o needs to be done. B_FORCE indicates that the page
@@ -627,13 +636,17 @@ pvn_getdirty(page_t *pp, int flags)
* If we want to free or invalidate the page then
* we need to unload it so that anyone who wants
* it will have to take a minor fault to get it.
+ * If we are only invalidating the page for the
+ * current process, then pass in a different flag.
* Otherwise, we're just writing the page back so we
* need to sync up the hardware and software mod bit to
* detect any future modifications. We clear the
* software mod bit when we put the page on the dirty
* list.
*/
- if (flags & (B_INVAL | B_FREE)) {
+ if (flags & B_INVALCURONLY) {
+ (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
+ } else if (flags & (B_INVAL | B_FREE)) {
(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
} else {
(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
@@ -645,7 +658,7 @@ pvn_getdirty(page_t *pp, int flags)
* list after all.
*/
page_io_unlock(pp);
- if (flags & B_INVAL) {
+ if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
/*LINTED: constant in conditional context*/
VN_DISPOSE(pp, B_INVAL, 0, kcred);
} else if (flags & B_FREE) {
@@ -657,6 +670,9 @@ pvn_getdirty(page_t *pp, int flags)
* of VOP_PUTPAGE() who prefer freeing the
* page _only_ if no one else is accessing it.
* E.g. segmap_release()
+ * We also take this path for B_INVALCURONLY and
+ * let page_release call VN_DISPOSE if no one else is
+ * using the page.
*
* The above hat_ismod() check is useless because:
* (1) we may not be holding SE_EXCL lock;
@@ -681,7 +697,7 @@ pvn_getdirty(page_t *pp, int flags)
* We'll detect the fact that they used it when the
* i/o is done and avoid freeing the page.
*/
- if (flags & B_FREE)
+ if (flags & (B_FREE | B_INVALCURONLY))
page_downgrade(pp);
diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c
index d422f8d0e8..8f425e9e4f 100644
--- a/usr/src/uts/common/vm/vm_usage.c
+++ b/usr/src/uts/common/vm/vm_usage.c
@@ -25,6 +25,10 @@
*/
/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+/*
* vm_usage
*
* This file implements the getvmusage() private system call.
@@ -114,7 +118,7 @@
* For accurate counting of map-shared and COW-shared pages.
*
* - visited private anons (refcnt > 1) for each collective.
- * (entity->vme_anon_hash)
+ * (entity->vme_anon)
* For accurate counting of COW-shared pages.
*
* The common accounting structure is the vmu_entity_t, which represents
@@ -152,6 +156,7 @@
#include <sys/vm_usage.h>
#include <sys/zone.h>
#include <sys/sunddi.h>
+#include <sys/sysmacros.h>
#include <sys/avl.h>
#include <vm/anon.h>
#include <vm/as.h>
@@ -199,6 +204,14 @@ typedef struct vmu_object {
} vmu_object_t;
/*
+ * Node for tree of visited COW anons.
+ */
+typedef struct vmu_anon {
+ avl_node_t vma_node;
+ uintptr_t vma_addr;
+} vmu_anon_t;
+
+/*
* Entity by which to count results.
*
* The entity structure keeps the current rss/swap counts for each entity
@@ -221,7 +234,7 @@ typedef struct vmu_entity {
struct vmu_entity *vme_next_calc;
mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */
mod_hash_t *vme_amp_hash; /* shared amps visited for entity */
- mod_hash_t *vme_anon_hash; /* COW anons visited for entity */
+ avl_tree_t vme_anon; /* COW anons visited for entity */
vmusage_t vme_result; /* identifies entity and results */
} vmu_entity_t;
@@ -324,6 +337,23 @@ bounds_cmp(const void *bnd1, const void *bnd2)
}
/*
+ * Comparison routine for our AVL tree of anon structures.
+ */
+static int
+vmu_anon_cmp(const void *lhs, const void *rhs)
+{
+ const vmu_anon_t *l = lhs, *r = rhs;
+
+ if (l->vma_addr == r->vma_addr)
+ return (0);
+
+ if (l->vma_addr < r->vma_addr)
+ return (-1);
+
+ return (1);
+}
+
+/*
* Save a bound on the free list.
*/
static void
@@ -363,13 +393,18 @@ static void
vmu_free_entity(mod_hash_val_t val)
{
vmu_entity_t *entity = (vmu_entity_t *)val;
+ vmu_anon_t *anon;
+ void *cookie = NULL;
if (entity->vme_vnode_hash != NULL)
i_mod_hash_clear_nosync(entity->vme_vnode_hash);
if (entity->vme_amp_hash != NULL)
i_mod_hash_clear_nosync(entity->vme_amp_hash);
- if (entity->vme_anon_hash != NULL)
- i_mod_hash_clear_nosync(entity->vme_anon_hash);
+
+ while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL)
+ kmem_free(anon, sizeof (vmu_anon_t));
+
+ avl_destroy(&entity->vme_anon);
entity->vme_next = vmu_data.vmu_free_entities;
vmu_data.vmu_free_entities = entity;
@@ -485,10 +520,10 @@ vmu_alloc_entity(id_t id, int type, id_t zoneid)
"vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
sizeof (struct anon_map));
- if (entity->vme_anon_hash == NULL)
- entity->vme_anon_hash = mod_hash_create_ptrhash(
- "vmusage anon hash", VMUSAGE_HASH_SIZE,
- mod_hash_null_valdtor, sizeof (struct anon));
+ VERIFY(avl_first(&entity->vme_anon) == NULL);
+
+ avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon),
+ offsetof(struct vmu_anon, vma_node));
entity->vme_next = vmu_data.vmu_entities;
vmu_data.vmu_entities = entity;
@@ -518,7 +553,8 @@ vmu_alloc_zone(id_t id)
zone->vmz_id = id;
- if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
+ if ((vmu_data.vmu_calc_flags &
+ (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
@@ -613,21 +649,19 @@ vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
}
static int
-vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
+vmu_find_insert_anon(vmu_entity_t *entity, void *key)
{
- int ret;
- caddr_t val;
+ vmu_anon_t anon, *ap;
- ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
- (mod_hash_val_t *)&val);
+ anon.vma_addr = (uintptr_t)key;
- if (ret == 0)
+ if (avl_find(&entity->vme_anon, &anon, NULL) != NULL)
return (0);
- ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
- (mod_hash_val_t)key, (mod_hash_hndl_t)0);
+ ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP);
+ ap->vma_addr = (uintptr_t)key;
- ASSERT(ret == 0);
+ avl_add(&entity->vme_anon, ap);
return (1);
}
@@ -937,7 +971,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
(page = page_exists(vn, off)) != NULL) {
- page_type = VMUSAGE_BOUND_INCORE;
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
+ page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
@@ -1024,7 +1061,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
if (vnode->v_pages != NULL &&
(page = page_exists(vnode, ptob(index))) != NULL) {
- page_type = VMUSAGE_BOUND_INCORE;
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
+ page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
@@ -1304,6 +1344,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
}
/*
+ * Pages on the free list aren't counted for the rss.
+ */
+ if (PP_ISFREE(page))
+ continue;
+
+ /*
* Assume anon structs with a refcnt
* of 1 are not COW shared, so there
* is no reason to track them per entity.
@@ -1320,8 +1366,7 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
* Track COW anons per entity so
* they are not double counted.
*/
- if (vmu_find_insert_anon(entity->vme_anon_hash,
- (caddr_t)ap) == 0)
+ if (vmu_find_insert_anon(entity, ap) == 0)
continue;
result->vmu_rss_all += (pgcnt << PAGESHIFT);
@@ -1461,8 +1506,9 @@ vmu_calculate_proc(proc_t *p)
entities = tmp;
}
if (vmu_data.vmu_calc_flags &
- (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
- VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
+ (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
+ VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
+ VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
VMUSAGE_ALL_EUSERS)) {
ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
@@ -1594,8 +1640,7 @@ vmu_free_extra()
mod_hash_destroy_hash(te->vme_vnode_hash);
if (te->vme_amp_hash != NULL)
mod_hash_destroy_hash(te->vme_amp_hash);
- if (te->vme_anon_hash != NULL)
- mod_hash_destroy_hash(te->vme_anon_hash);
+ VERIFY(avl_first(&te->vme_anon) == NULL);
kmem_free(te, sizeof (vmu_entity_t));
}
while (vmu_data.vmu_free_zones != NULL) {
@@ -1739,12 +1784,34 @@ vmu_cache_rele(vmu_cache_t *cache)
}
/*
+ * When new data is calculated, update the phys_mem rctl usage value in the
+ * zones.
+ */
+static void
+vmu_update_zone_rctls(vmu_cache_t *cache)
+{
+ vmusage_t *rp;
+ size_t i = 0;
+ zone_t *zp;
+
+ for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
+ if (rp->vmu_type == VMUSAGE_ZONE &&
+ rp->vmu_zoneid != ALL_ZONES) {
+ if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
+ zp->zone_phys_mem = rp->vmu_rss_all;
+ zone_rele(zp);
+ }
+ }
+ }
+}
+
+/*
* Copy out the cached results to a caller. Inspect the callers flags
* and zone to determine which cached results should be copied.
*/
static int
vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
- uint_t flags, int cpflg)
+ uint_t flags, id_t req_zone_id, int cpflg)
{
vmusage_t *result, *out_result;
vmusage_t dummy;
@@ -1763,7 +1830,7 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
/* figure out what results the caller is interested in. */
if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
types |= VMUSAGE_SYSTEM;
- if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
+ if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
types |= VMUSAGE_ZONE;
if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
VMUSAGE_COL_PROJECTS))
@@ -1826,26 +1893,33 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
continue;
}
- /* Skip "other zone" results if not requested */
- if (result->vmu_zoneid != curproc->p_zone->zone_id) {
- if (result->vmu_type == VMUSAGE_ZONE &&
- (flags & VMUSAGE_ALL_ZONES) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_PROJECTS &&
- (flags & (VMUSAGE_ALL_PROJECTS |
- VMUSAGE_COL_PROJECTS)) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_TASKS &&
- (flags & VMUSAGE_ALL_TASKS) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_RUSERS &&
- (flags & (VMUSAGE_ALL_RUSERS |
- VMUSAGE_COL_RUSERS)) == 0)
- continue;
- if (result->vmu_type == VMUSAGE_EUSERS &&
- (flags & (VMUSAGE_ALL_EUSERS |
- VMUSAGE_COL_EUSERS)) == 0)
+ if (result->vmu_type == VMUSAGE_ZONE &&
+ flags & VMUSAGE_A_ZONE) {
+ /* Skip non-requested zone results */
+ if (result->vmu_zoneid != req_zone_id)
continue;
+ } else {
+ /* Skip "other zone" results if not requested */
+ if (result->vmu_zoneid != curproc->p_zone->zone_id) {
+ if (result->vmu_type == VMUSAGE_ZONE &&
+ (flags & VMUSAGE_ALL_ZONES) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_PROJECTS &&
+ (flags & (VMUSAGE_ALL_PROJECTS |
+ VMUSAGE_COL_PROJECTS)) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_TASKS &&
+ (flags & VMUSAGE_ALL_TASKS) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_RUSERS &&
+ (flags & (VMUSAGE_ALL_RUSERS |
+ VMUSAGE_COL_RUSERS)) == 0)
+ continue;
+ if (result->vmu_type == VMUSAGE_EUSERS &&
+ (flags & (VMUSAGE_ALL_EUSERS |
+ VMUSAGE_COL_EUSERS)) == 0)
+ continue;
+ }
}
count++;
if (out_result != NULL) {
@@ -1901,10 +1975,12 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
int cacherecent = 0;
hrtime_t now;
uint_t flags_orig;
+ id_t req_zone_id;
/*
* Non-global zones cannot request system wide and/or collated
- * results, or the system result, so munge the flags accordingly.
+ * results, or the system result, or usage of another zone, so munge
+ * the flags accordingly.
*/
flags_orig = flags;
if (curproc->p_zone != global_zone) {
@@ -1924,6 +2000,10 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
flags &= ~VMUSAGE_SYSTEM;
flags |= VMUSAGE_ZONE;
}
+ if (flags & VMUSAGE_A_ZONE) {
+ flags &= ~VMUSAGE_A_ZONE;
+ flags |= VMUSAGE_ZONE;
+ }
}
/* Check for unknown flags */
@@ -1934,6 +2014,21 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
if ((flags & VMUSAGE_MASK) == 0)
return (set_errno(EINVAL));
+ /* If requesting results for a specific zone, get the zone ID */
+ if (flags & VMUSAGE_A_ZONE) {
+ size_t bufsize;
+ vmusage_t zreq;
+
+ if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
+ return (set_errno(EFAULT));
+ /* Requested zone ID is passed in buf, so 0 len not allowed */
+ if (bufsize == 0)
+ return (set_errno(EINVAL));
+ if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
+ return (set_errno(EFAULT));
+ req_zone_id = zreq.vmu_id;
+ }
+
mutex_enter(&vmu_data.vmu_lock);
now = gethrtime();
@@ -1953,7 +2048,7 @@ start:
mutex_exit(&vmu_data.vmu_lock);
ret = vmu_copyout_results(cache, buf, nres, flags_orig,
- cpflg);
+ req_zone_id, cpflg);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
if (vmu_data.vmu_pending_waiters > 0)
@@ -2009,8 +2104,11 @@ start:
mutex_exit(&vmu_data.vmu_lock);
+ /* update zone's phys. mem. rctl usage */
+ vmu_update_zone_rctls(cache);
/* copy cache */
- ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
+ ret = vmu_copyout_results(cache, buf, nres, flags_orig,
+ req_zone_id, cpflg);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
mutex_exit(&vmu_data.vmu_lock);