Diffstat (limited to 'src/pmdas/linux_proc')
-rw-r--r-- | src/pmdas/linux_proc/GNUmakefile | 89
-rwxr-xr-x | src/pmdas/linux_proc/Install | 29
-rwxr-xr-x | src/pmdas/linux_proc/Remove | 23
-rw-r--r-- | src/pmdas/linux_proc/cgroups.c | 1146
-rw-r--r-- | src/pmdas/linux_proc/cgroups.h | 74
-rw-r--r-- | src/pmdas/linux_proc/clusters.h | 48
-rw-r--r-- | src/pmdas/linux_proc/contexts.c | 238
-rw-r--r-- | src/pmdas/linux_proc/contexts.h | 57
-rw-r--r-- | src/pmdas/linux_proc/getinfo.c | 55
-rw-r--r-- | src/pmdas/linux_proc/getinfo.h | 16
-rw-r--r-- | src/pmdas/linux_proc/help | 220
-rw-r--r-- | src/pmdas/linux_proc/indom.h | 52
-rw-r--r-- | src/pmdas/linux_proc/ksym.c | 564
-rw-r--r-- | src/pmdas/linux_proc/ksym.h | 41
-rw-r--r-- | src/pmdas/linux_proc/linux_proc_migrate.conf | 55
-rw-r--r-- | src/pmdas/linux_proc/pmda.c | 1896
-rw-r--r-- | src/pmdas/linux_proc/proc_pid.c | 957
-rw-r--r-- | src/pmdas/linux_proc/proc_pid.h | 289
-rw-r--r-- | src/pmdas/linux_proc/proc_runq.c | 123
-rw-r--r-- | src/pmdas/linux_proc/proc_runq.h | 35
-rw-r--r-- | src/pmdas/linux_proc/root | 6
-rw-r--r-- | src/pmdas/linux_proc/root_proc | 181
22 files changed, 6194 insertions(+), 0 deletions(-)
diff --git a/src/pmdas/linux_proc/GNUmakefile b/src/pmdas/linux_proc/GNUmakefile new file mode 100644 index 0000000..97dc518 --- /dev/null +++ b/src/pmdas/linux_proc/GNUmakefile @@ -0,0 +1,89 @@ +# +# Copyright (c) 2000,2003,2004,2008 Silicon Graphics, Inc. All Rights Reserved. +# Copyright (c) 2007-2010 Aconex. All Rights Reserved. +# Copyright (c) 2013-2014 Red Hat. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# + +TOPDIR = ../../.. +include $(TOPDIR)/src/include/builddefs + +IAM = proc +DOMAIN = PROC +CMDTARGET = pmdaproc +LIBTARGET = pmda_proc.so +PMDAINIT = proc_init +PMDADIR = $(PCP_PMDAS_DIR)/$(IAM) +CONF_LINE = "proc 3 pipe binary $(PMDADIR)/$(CMDTARGET) -d 3" + +CFILES = pmda.c \ + cgroups.c proc_pid.c proc_runq.c ksym.c getinfo.c contexts.c + +HFILES = clusters.h indom.h \ + cgroups.h proc_pid.h proc_runq.h ksym.h getinfo.h contexts.h + +SCRIPTS = Install Remove +VERSION_SCRIPT = exports +HELPTARGETS = help.dir help.pag +LSRCFILES = help root root_proc linux_proc_migrate.conf $(SCRIPTS) +LDIRT = $(HELPTARGETS) domain.h $(VERSION_SCRIPT) + +LLDLIBS = $(PCP_PMDALIB) +LCFLAGS = $(INVISIBILITY) + +# Uncomment these flags for profiling +# LCFLAGS += -pg +# LLDFLAGS += -pg + +default: build-me + +include $(BUILDRULES) + +ifeq "$(TARGET_OS)" "linux" +build-me: domain.h $(LIBTARGET) $(CMDTARGET) $(HELPTARGETS) + @if [ `grep -c $(CONF_LINE) ../pmcd.conf` -eq 0 ]; then \ + echo $(CONF_LINE) >> ../pmcd.conf ; \ + fi + +install: default + $(INSTALL) -m 755 -d $(PMDADIR) + $(INSTALL) -m 644 domain.h help help.dir help.pag root root_proc $(PMDADIR) + $(INSTALL) -m 755 $(LIBTARGET) $(CMDTARGET) $(SCRIPTS) $(PMDADIR) + $(INSTALL) -m 644 root_proc $(PCP_VAR_DIR)/pmns/root_proc + $(INSTALL) -m 644 linux_proc_migrate.conf $(PCP_VAR_DIR)/config/pmlogrewrite/linux_proc_migrate.conf +else +build-me: +install: +endif + +default_pcp : default + +install_pcp : install + +$(HELPTARGETS) : help + $(RUN_IN_BUILD_ENV) $(TOPDIR)/src/newhelp/newhelp -n root_proc -v 2 -o help < help + +$(VERSION_SCRIPT): + $(VERSION_SCRIPT_MAKERULE) + +domain.h: ../../pmns/stdpmid + $(DOMAIN_MAKERULE) + +cgroups.o pmda.o: clusters.h +cgroups.o pmda.o: cgroups.h +cgroups.o pmda.o proc_pid.o proc_runq.o: proc_pid.h +pmda.o proc_runq.o: proc_runq.h +indom.o pmda.o: indom.h +ksym.o pmda.o: ksym.h +pmda.o: domain.h +pmda.o: getinfo.h +pmda.o: $(VERSION_SCRIPT) diff --git a/src/pmdas/linux_proc/Install b/src/pmdas/linux_proc/Install new file mode 100755 index 0000000..74fa225 --- /dev/null +++ b/src/pmdas/linux_proc/Install @@ -0,0 +1,29 @@ +#!/bin/sh +# +# Copyright (c) 2013 Red Hat Inc. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. 
+# +# Install the Linux per-process (proc) PMDA and/or PMNS +# + +. $PCP_DIR/etc/pcp.env +. $PCP_SHARE_DIR/lib/pmdaproc.sh + +iam=proc +pmda_interface=6 +daemon_opt=true +pipe_opt=true +pmns_source=root_proc + +pmdaSetup +pmdaInstall +exit 0 diff --git a/src/pmdas/linux_proc/Remove b/src/pmdas/linux_proc/Remove new file mode 100755 index 0000000..4befc73 --- /dev/null +++ b/src/pmdas/linux_proc/Remove @@ -0,0 +1,23 @@ +#!/bin/sh +# +# Copyright (c) 2013 Red Hat Inc. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# Remove the Linux per-process (proc) PMDA +# + +. $PCP_DIR/etc/pcp.env +. $PCP_SHARE_DIR/lib/pmdaproc.sh +iam=proc +pmdaSetup +pmdaRemove +exit 0 diff --git a/src/pmdas/linux_proc/cgroups.c b/src/pmdas/linux_proc/cgroups.c new file mode 100644 index 0000000..4994465 --- /dev/null +++ b/src/pmdas/linux_proc/cgroups.c @@ -0,0 +1,1146 @@ +/* + * Copyright (c) 2012-2014 Red Hat. + * Copyright (c) 2010 Aconex. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ + +#include "pmapi.h" +#include "impl.h" +#include "pmda.h" +#include "indom.h" +#include "cgroups.h" +#include "clusters.h" +#include "proc_pid.h" +#include <sys/stat.h> +#include <ctype.h> + +#define CGROUP_ROOT "cgroup.groups" /* root dynamic PMNS node */ + +/* Add namespace entries and prepare values for one cgroupfs directory entry */ +struct cgroup_subsys; +typedef int (*cgroup_prepare_t)(__pmnsTree *, const char *, + struct cgroup_subsys *, const char *, int, int, int); +static int prepare_ull(__pmnsTree *, const char *, + struct cgroup_subsys *, const char *, int, int, int); +static int prepare_string(__pmnsTree *, const char *, + struct cgroup_subsys *, const char *, int, int, int); +static int prepare_named_ull(__pmnsTree *, const char *, + struct cgroup_subsys *, const char *, int, int, int); +static int prepare_block_ull(__pmnsTree *, const char *, + struct cgroup_subsys *, const char *, int, int, int); +static int prepare_blocks_ull(__pmnsTree *, const char *, + struct cgroup_subsys *, const char *, int, int, int); + +/* + * Critical data structures for cgroup subsystem in pmdaproc ... + * Initial comment for each struct talks about lifecycle of that + * data, in terms of what pmdaproc must do with it (esp. memory + * allocation related). + */ + +typedef struct { /* contents depends on individual kernel cgroups */ + int item; /* PMID == domain:cluster:[id:item] */ + int dynamic; /* do we need an extra free (string) */ + cgroup_prepare_t prepare; /* setup metric name(s) and value(s) */ + char *suffix; /* cpus/mems/rss/... 
*/ +} cgroup_metrics_t; + +typedef struct { /* some metrics are multi-valued, but most have only one */ + int item; /* PMID == domain:cluster:[id:item] */ + int atom_count; + pmAtomValue *atoms; +} cgroup_values_t; + +typedef struct { /* contains data for each group users have created, if any */ + int id; /* PMID == domain:cluster:[id:item] */ + int refreshed; /* boolean: are values all uptodate */ + proc_pid_list_t process_list; + cgroup_values_t *metric_values; +} cgroup_group_t; + +typedef struct cgroup_subsys { /* contents covers the known kernel cgroups */ + const char *name; /* cpuset/memory/... */ + int cluster; /* PMID == domain:cluster:[id:item] */ + int group_count; /* number of groups (dynamic) */ + int metric_count; /* number of metrics (fixed) */ + time_t previous_time; /* used to avoid repeated refresh */ + cgroup_group_t *groups; /* array of groups (dynamic) */ + cgroup_metrics_t *metrics; /* array of metrics (fixed) */ +} cgroup_subsys_t; + +static cgroup_metrics_t cpusched_metrics[] = { + { .suffix = "shares", .prepare = prepare_ull }, +}; + +static cgroup_metrics_t cpuacct_metrics[] = { + { .suffix = "stat.user", .prepare = prepare_named_ull }, + { .suffix = "stat.system", .prepare = prepare_named_ull }, + { .suffix = "usage", .prepare = prepare_ull }, + { .suffix = "usage_percpu", .prepare = prepare_ull }, +}; + +static cgroup_metrics_t cpuset_metrics[] = { + { .suffix = "io_merged", .prepare = prepare_string }, + { .suffix = "sectors", .prepare = prepare_string }, +}; + +static cgroup_metrics_t memory_metrics[] = { + { .suffix = "stat.cache", .prepare = prepare_named_ull }, + { .suffix = "stat.rss", .prepare = prepare_named_ull }, + { .suffix = "stat.rss_huge", .prepare = prepare_named_ull }, + { .suffix = "stat.mapped_file", .prepare = prepare_named_ull }, + { .suffix = "stat.writeback", .prepare = prepare_named_ull }, + { .suffix = "stat.swap", .prepare = prepare_named_ull }, + { .suffix = "stat.pgpgin", .prepare = prepare_named_ull }, + { .suffix = "stat.pgpgout", .prepare = prepare_named_ull }, + { .suffix = "stat.pgfault", .prepare = prepare_named_ull }, + { .suffix = "stat.pgmajfault", .prepare = prepare_named_ull }, + { .suffix = "stat.inactive_anon", .prepare = prepare_named_ull }, + { .suffix = "stat.active_anon", .prepare = prepare_named_ull }, + { .suffix = "stat.inactive_file", .prepare = prepare_named_ull }, + { .suffix = "stat.active_file", .prepare = prepare_named_ull }, + { .suffix = "stat.unevictable", .prepare = prepare_named_ull }, + { .suffix = "stat.total_cache", .prepare = prepare_named_ull }, + { .suffix = "stat.total_rss", .prepare = prepare_named_ull }, + { .suffix = "stat.total_rss_huge", .prepare = prepare_named_ull }, + { .suffix = "stat.total_mapped_file", .prepare = prepare_named_ull }, + { .suffix = "stat.total_writeback", .prepare = prepare_named_ull }, + { .suffix = "stat.total_swap", .prepare = prepare_named_ull }, + { .suffix = "stat.total_pgpgin", .prepare = prepare_named_ull }, + { .suffix = "stat.total_pgpgout", .prepare = prepare_named_ull }, + { .suffix = "stat.total_pgfault", .prepare = prepare_named_ull }, + { .suffix = "stat.total_pgmajfault", .prepare = prepare_named_ull }, + { .suffix = "stat.total_inactive_anon", .prepare = prepare_named_ull }, + { .suffix = "stat.total_active_anon", .prepare = prepare_named_ull }, + { .suffix = "stat.total_inactive_file", .prepare = prepare_named_ull }, + { .suffix = "stat.total_active_file", .prepare = prepare_named_ull }, + { .suffix = "stat.total_unevictable", .prepare = 
prepare_named_ull }, + { .suffix = "stat.recent_rotated_anon", .prepare = prepare_named_ull }, + { .suffix = "stat.recent_rotated_file", .prepare = prepare_named_ull }, + { .suffix = "stat.recent_scanned_anon", .prepare = prepare_named_ull }, + { .suffix = "stat.recent_scanned_file", .prepare = prepare_named_ull }, +}; + +static cgroup_metrics_t netclass_metrics[] = { + { .suffix = "classid", .prepare = prepare_ull }, +}; + +static cgroup_metrics_t blkio_metrics[] = { + { .suffix = "io_merged.read", .prepare = prepare_blocks_ull }, + { .suffix = "io_merged.write", .prepare = prepare_blocks_ull }, + { .suffix = "io_merged.sync", .prepare = prepare_blocks_ull }, + { .suffix = "io_merged.async", .prepare = prepare_blocks_ull }, + { .suffix = "io_merged.total", .prepare = prepare_blocks_ull }, + { .suffix = "io_queued.read", .prepare = prepare_blocks_ull }, + { .suffix = "io_queued.write", .prepare = prepare_blocks_ull }, + { .suffix = "io_queued.sync", .prepare = prepare_blocks_ull }, + { .suffix = "io_queued.async", .prepare = prepare_blocks_ull }, + { .suffix = "io_queued.total", .prepare = prepare_blocks_ull }, + { .suffix = "io_service_bytes.read", .prepare = prepare_blocks_ull }, + { .suffix = "io_service_bytes.write", .prepare = prepare_blocks_ull }, + { .suffix = "io_service_bytes.sync", .prepare = prepare_blocks_ull }, + { .suffix = "io_service_bytes.async", .prepare = prepare_blocks_ull }, + { .suffix = "io_service_bytes.total", .prepare = prepare_blocks_ull }, + { .suffix = "io_serviced.read", .prepare = prepare_blocks_ull }, + { .suffix = "io_serviced.write", .prepare = prepare_blocks_ull }, + { .suffix = "io_serviced.sync", .prepare = prepare_blocks_ull }, + { .suffix = "io_serviced.async", .prepare = prepare_blocks_ull }, + { .suffix = "io_serviced.total", .prepare = prepare_blocks_ull }, + { .suffix = "io_service_time.read", .prepare = prepare_blocks_ull }, + { .suffix = "io_service_time.write", .prepare = prepare_blocks_ull }, + { .suffix = "io_service_time.sync", .prepare = prepare_blocks_ull }, + { .suffix = "io_service_time.async", .prepare = prepare_blocks_ull }, + { .suffix = "io_service_time.total", .prepare = prepare_blocks_ull }, + { .suffix = "io_wait_time.read", .prepare = prepare_blocks_ull }, + { .suffix = "io_wait_time.write", .prepare = prepare_blocks_ull }, + { .suffix = "io_wait_time.sync", .prepare = prepare_blocks_ull }, + { .suffix = "io_wait_time.async", .prepare = prepare_blocks_ull }, + { .suffix = "io_wait_time.total", .prepare = prepare_blocks_ull }, + { .suffix = "sectors", .prepare = prepare_block_ull }, + { .suffix = "time", .prepare = prepare_block_ull }, +}; + +static const char *block_stats_names[] = \ + { "read", "write", "sync", "async", "total" }; +#define BLKIOS (sizeof(block_stats_names)/sizeof(block_stats_names[0])) + +static cgroup_subsys_t controllers[] = { + { .name = "cpu", + .cluster = CLUSTER_CPUSCHED_GROUPS, + .metrics = cpusched_metrics, + .metric_count = sizeof(cpusched_metrics) / sizeof(cgroup_metrics_t), + }, + { .name = "cpuset", + .cluster = CLUSTER_CPUSET_GROUPS, + .metrics = cpuset_metrics, + .metric_count = sizeof(cpuset_metrics) / sizeof(cgroup_metrics_t), + }, + { .name = "cpuacct", + .cluster = CLUSTER_CPUACCT_GROUPS, + .metrics = cpuacct_metrics, + .metric_count = sizeof(cpuacct_metrics) / sizeof(cgroup_metrics_t), + }, + { .name = "memory", + .cluster = CLUSTER_MEMORY_GROUPS, + .metrics = memory_metrics, + .metric_count = sizeof(memory_metrics) / sizeof(cgroup_metrics_t), + }, + { .name = "net_cls", + .cluster = 
CLUSTER_NET_CLS_GROUPS, + .metrics = netclass_metrics, + .metric_count = sizeof(netclass_metrics) / sizeof(cgroup_metrics_t), + }, + { .name = "blkio", + .cluster = CLUSTER_BLKIO_GROUPS, + .metrics = blkio_metrics, + .metric_count = sizeof(blkio_metrics) / sizeof(cgroup_metrics_t), + }, +}; + +/* + * Data structures used by individual cgroup subsystem controllers + */ +typedef struct { + __uint32_t major; + __uint32_t minor; + int inst; + char *name; +} device_t; + +typedef struct { + device_t dev; + __uint64_t values[BLKIOS]; /* read, write, sync, async, total */ +} block_stats_t; + +typedef struct filesys { + int id; + char *device; + char *path; + char *options; +} filesys_t; + +void +refresh_cgroup_cpus(pmInDom indom) +{ + char buf[MAXPATHLEN]; + char *space; + FILE *fp; + + pmdaCacheOp(indom, PMDA_CACHE_INACTIVE); + if ((fp = proc_statsfile("/proc/stat", buf, sizeof(buf))) == NULL) + return; + while (fgets(buf, sizeof(buf), fp) != NULL) { + if (strncmp(buf, "cpu", 3) == 0 && isdigit((int)buf[3])) { + if ((space = strchr(buf, ' ')) != NULL) { + *space = '\0'; + pmdaCacheStore(indom, PMDA_CACHE_ADD, buf, NULL); + } + } + } + fclose(fp); +} + +static int +_pm_isloop(char *dname) +{ + return strncmp(dname, "loop", 4) == 0; +} + +static int +_pm_isramdisk(char *dname) +{ + return strncmp(dname, "ram", 3) == 0; +} + +/* + * For block devices we have one instance domain for dev_t + * based lookup, and another for (real) name lookup. + * The reason we need this is that the blkio cgroup stats + * are exported using the major:minor numbers, and not the + * device names - we must perform that mapping ourselves. + * In some places (value refresh) we need to lookup the blk + * name from device major/minor, in other places (instances + * refresh) we need the usual external instid:name lookup. 
+ */ +void +refresh_cgroup_devices(pmInDom diskindom) +{ + pmInDom devtindom = INDOM(DEVT_INDOM); + char buf[MAXPATHLEN]; + static time_t before; + time_t now; + FILE *fp; + + if ((now = time(NULL)) == before) + return; + before = now; + + pmdaCacheOp(devtindom, PMDA_CACHE_INACTIVE); + pmdaCacheOp(diskindom, PMDA_CACHE_INACTIVE); + + if ((fp = proc_statsfile("/proc/diskstats", buf, sizeof(buf))) == NULL) + return; + + while (fgets(buf, sizeof(buf), fp) != NULL) { + unsigned int major, minor, unused; + device_t *dev = NULL; + char namebuf[1024]; + int inst; + + if (sscanf(buf, "%u %u %s %u", &major, &minor, namebuf, &unused) != 4) + continue; + if (_pm_isloop(namebuf) || _pm_isramdisk(namebuf)) + continue; + if (pmdaCacheLookupName(diskindom, namebuf, &inst, (void **)&dev) < 0 || + dev == NULL) { + if (!(dev = (device_t *)malloc(sizeof(device_t)))) { + __pmNoMem("device", sizeof(device_t), PM_RECOV_ERR); + continue; + } + dev->major = major; + dev->minor = minor; + } + /* keeping track of all fields (major/minor/inst/name) */ + pmdaCacheStore(diskindom, PMDA_CACHE_ADD, namebuf, dev); + pmdaCacheLookupName(diskindom, namebuf, &dev->inst, NULL); + pmdaCacheLookup(diskindom, dev->inst, &dev->name, NULL); + + snprintf(buf, sizeof(buf), "%u:%u", major, minor); + pmdaCacheStore(devtindom, PMDA_CACHE_ADD, buf, (void *)dev); + + if (pmDebug & DBG_TRACE_APPL0) + fprintf(stderr, "refresh_devices: \"%s\" \"%d:%d\" inst=%d\n", + dev->name, dev->major, dev->minor, dev->inst); + } + fclose(fp); +} + +void +refresh_cgroup_subsys(pmInDom indom) +{ + char buf[4096]; + static time_t before; + time_t now; + FILE *fp; + + if ((now = time(NULL)) == before) + return; + before = now; + + if ((fp = proc_statsfile("/proc/cgroups", buf, sizeof(buf))) == NULL) + return; + + while (fgets(buf, sizeof(buf), fp) != NULL) { + unsigned int numcgroups, enabled; + char name[MAXPATHLEN]; + long hierarchy; + long *data; + int sts; + + /* skip lines starting with hash (header) */ + if (buf[0] == '#') + continue; + if (sscanf(buf, "%s %ld %u %u", &name[0], + &hierarchy, &numcgroups, &enabled) != 4) + continue; + sts = pmdaCacheLookupName(indom, name, NULL, (void **)&data); + if (sts == PMDA_CACHE_ACTIVE) { + if (*data != hierarchy) { + /* + * odd ... instance name repeated but different + * hierarchy ... 
we cannot support more than one hierarchy + * yet + */ + fprintf(stderr, "refresh_cgroup_subsys: \"%s\": entries for hierarchy %ld ignored (hierarchy %ld seen first)\n", name, hierarchy, *data); + } + continue; + } + else if (sts != PMDA_CACHE_INACTIVE) { + if ((data = (long *)malloc(sizeof(long))) == NULL) { +#if PCP_DEBUG + if (pmDebug & DBG_TRACE_APPL0) + fprintf(stderr, "refresh_cgroup_subsys: \"%s\": malloc failed\n", name); +#endif + continue; + } + *data = hierarchy; + } + pmdaCacheStore(indom, PMDA_CACHE_ADD, name, (void *)data); +#if PCP_DEBUG + if (pmDebug & DBG_TRACE_APPL0) + fprintf(stderr, "refresh_cgroup_subsys: add \"%s\" [hierarchy %ld]\n", name, hierarchy); +#endif + } + fclose(fp); +} + +void +refresh_cgroup_filesys(pmInDom indom) +{ + char buf[MAXPATHLEN]; + filesys_t *fs; + FILE *fp; + time_t now; + static time_t before; + char *path, *device, *type, *options; + int sts; + + if ((now = time(NULL)) == before) + return; + before = now; + + pmdaCacheOp(indom, PMDA_CACHE_INACTIVE); + + if ((fp = proc_statsfile("/proc/mounts", buf, sizeof(buf))) == NULL) + return; + + while (fgets(buf, sizeof(buf), fp) != NULL) { + device = strtok(buf, " "); + path = strtok(NULL, " "); + type = strtok(NULL, " "); + options = strtok(NULL, " "); + if (strcmp(type, "cgroup") != 0) + continue; + + sts = pmdaCacheLookupName(indom, path, NULL, (void **)&fs); + if (sts == PMDA_CACHE_ACTIVE) /* repeated line in /proc/mounts? */ + continue; + if (sts == PMDA_CACHE_INACTIVE) { /* re-activate an old mount */ + pmdaCacheStore(indom, PMDA_CACHE_ADD, path, fs); + if (strcmp(path, fs->path) != 0) { /* old device, new path */ + free(fs->path); + fs->path = strdup(path); + } + if (strcmp(options, fs->options) != 0) { /* old device, new opts */ + free(fs->options); + fs->options = strdup(options); + } + } + else { /* new mount */ + if ((fs = malloc(sizeof(filesys_t))) == NULL) + continue; + fs->path = strdup(path); + fs->options = strdup(options); + if (pmDebug & DBG_TRACE_APPL0) + fprintf(stderr, "refresh_filesys: add \"%s\" \"%s\"\n", + fs->path, device); + pmdaCacheStore(indom, PMDA_CACHE_ADD, path, fs); + } + } + fclose(fp); +} + +static char * +scan_filesys_options(const char *options, const char *option) +{ + static char buffer[128]; + char *s; + + strncpy(buffer, options, sizeof(buffer)); + buffer[sizeof(buffer)-1] = '\0'; + + s = strtok(buffer, ","); + while (s) { + if (strcmp(s, option) == 0) + return s; + s = strtok(NULL, ","); + } + return NULL; +} + +static int +read_values(char *buffer, int size, const char *path, const char *subsys, + const char *metric) +{ + int fd, count; + + snprintf(buffer, size, "%s/%s.%s", path, subsys, metric); + if ((fd = open(buffer, O_RDONLY)) < 0) + return -oserror(); + count = read(fd, buffer, size); + close(fd); + if (count < 0) + return -oserror(); + buffer[count-1] = '\0'; + return 0; +} + +static pmID +update_pmns(__pmnsTree *pmns, cgroup_subsys_t *subsys, const char *name, + cgroup_metrics_t *metrics, int group, int domain) +{ + char entry[MAXPATHLEN]; + pmID pmid; + + snprintf(entry, sizeof(entry), "%s.%s%s.%s", + CGROUP_ROOT, subsys->name, name, metrics->suffix); + pmid = cgroup_pmid_build(domain, subsys->cluster, group, metrics->item); + __pmAddPMNSNode(pmns, pmid, entry); + return pmid; +} + +static int +prepare_ull(__pmnsTree *pmns, const char *path, cgroup_subsys_t *subsys, + const char *name, int metric, int group, int domain) +{ + int count = 0; + unsigned long long value; + char buffer[MAXPATHLEN]; + char *endp, *p = &buffer[0]; + cgroup_group_t *groups 
= &subsys->groups[group]; + cgroup_metrics_t *metrics = &subsys->metrics[metric]; + pmAtomValue *atoms = groups->metric_values[metric].atoms; + + if (read_values(p, sizeof(buffer), path, subsys->name, metrics->suffix) < 0) + return -oserror(); + + while (p && *p) { + value = strtoull(p, &endp, 0); + if ((atoms = realloc(atoms, (count + 1) * sizeof(pmAtomValue))) == NULL) + return -oserror(); + atoms[count++].ull = value; + if (endp == '\0' || endp == p) + break; + p = endp; + while (p && isspace((int)*p)) + p++; + } + + groups->metric_values[metric].item = metric; + groups->metric_values[metric].atoms = atoms; + groups->metric_values[metric].atom_count = count; + update_pmns(pmns, subsys, name, metrics, group, domain); + return 0; +} + +static int +prepare_named_ull(__pmnsTree *pmns, const char *path, cgroup_subsys_t *subsys, + const char *name, int metric, int group, int domain) +{ + int i, count; + unsigned long long value; + char filename[64], buffer[MAXPATHLEN]; + char *offset, *p = &buffer[0]; + cgroup_group_t *groups = &subsys->groups[group]; + cgroup_metrics_t *metrics = &subsys->metrics[metric]; + + /* metric => e.g. stat.user and stat.system - split it up first */ + offset = index(metrics->suffix, '.'); + if (!offset) + return PM_ERR_CONV; + count = (offset - metrics->suffix); + strncpy(filename, metrics->suffix, count); + filename[count] = '\0'; + + if (read_values(p, sizeof(buffer), path, subsys->name, filename) < 0) + return -oserror(); + + /* buffer contains <name> <value> pairs */ + while (p && *p) { + char *endp, *field, *offset; + + if ((field = index(p, ' ')) == NULL) + return PM_ERR_CONV; + offset = field + 1; + *field = '\0'; + field = p; /* field now points to <name> */ + p = offset; + value = strtoull(p, &endp, 0); + p = endp; + while (p && isspace((int)*p)) + p++; + + for (i = 0; i < subsys->metric_count; i++) { + pmAtomValue *atoms = groups->metric_values[i].atoms; + metrics = &subsys->metrics[i]; + + if (strcmp(field, metrics->suffix + count + 1) != 0) + continue; + if ((atoms = groups->metric_values[i].atoms) == NULL) + if ((atoms = calloc(1, sizeof(pmAtomValue))) == NULL) + return -oserror(); + atoms[0].ull = value; + + groups->metric_values[i].item = i; + groups->metric_values[i].atoms = atoms; + groups->metric_values[i].atom_count = 1; + update_pmns(pmns, subsys, name, metrics, group, domain); + break; + } + } + return 0; +} + +static int +prepare_block(__pmnsTree *pmns, const char *path, cgroup_subsys_t *subsys, + const char *name, int metric, int group, int domain, + block_stats_t *stats, int value_count) +{ + pmID pmid; + char *iname; + char buf[MAXPATHLEN]; + device_t *dev; + pmAtomValue *atoms; + int count, size, inst, sts, m, i, j; + pmInDom devtindom = INDOM(DEVT_INDOM); + cgroup_group_t *groups = &subsys->groups[group]; + cgroup_metrics_t *metrics = &subsys->metrics[metric]; + + /* map major:minor to real device name via diskstats */ + dev = &stats->dev; + snprintf(buf, sizeof(buf), "%u:%u", dev->major, dev->minor); + + sts = pmdaCacheLookupName(devtindom, buf, NULL, (void **)&dev); + iname = dev->name; + inst = dev->inst; + + if (pmDebug & DBG_TRACE_APPL0) + fprintf(stderr, "prepare_block: preparing %s found=%s (%s)\n", + buf, sts == PMDA_CACHE_ACTIVE ? 
"ok" : "no", iname); + + /* batch update metric value(s) now, since we have 'em all */ + for (j = 0; j < value_count; j++) { + m = metric + j; + atoms = groups->metric_values[m].atoms; + count = groups->metric_values[m].atom_count; + + if (inst >= count) { + size = (inst + 1) * sizeof(pmAtomValue); + if ((atoms = realloc(atoms, size)) == NULL) + return -oserror(); + for (i = count; i < inst + 1; i++) + atoms[i].ull = ULLONG_MAX; + count = inst + 1; + } + /* move on-stack value into global struct, add to PMNS */ + atoms[inst].ull = stats->values[j]; + pmid = update_pmns(pmns, subsys, name, metrics + j, group, domain); + + if (pmDebug & DBG_TRACE_APPL0) + fprintf(stderr, "prepare_block: prepared " + "metric=%s inst=%s[%d] value=%llu\n", + pmIDStr(pmid), iname, inst, + (unsigned long long)atoms[inst].ull); + + groups->metric_values[m].item = m; + groups->metric_values[m].atoms = atoms; + groups->metric_values[m].atom_count = count; + } + return 0; +} + +static int +prepare_block_ull(__pmnsTree *pmns, const char *path, cgroup_subsys_t *subsys, + const char *name, int metric, int group, int domain) +{ + char buf[MAXPATHLEN]; + cgroup_metrics_t *metrics = &subsys->metrics[metric]; + block_stats_t stats; + FILE *fp; + char *p; + + if (pmDebug & DBG_TRACE_APPL0) + fprintf(stderr, "prepare_block_ull: %s metric=%d group=%d domain=%d\n", + path, metric, group, domain); + + snprintf(buf, sizeof(buf), "%s/%s.%s", path, subsys->name, metrics->suffix); + if ((fp = fopen(buf, "r")) == NULL) + return -oserror(); + + memset(&stats, 0, sizeof(stats)); + while ((fgets(buf, sizeof(buf), fp)) != NULL) { + if (sscanf(buf, "%u:%u ", &stats.dev.major, &stats.dev.minor) != 2) + continue; + for (p = buf; *p && !isspace(*p); p++) { } /* skip device number */ + for (p = buf; *p && isspace(*p); p++) { } /* skip over spaces */ + if (sscanf(p, "%llu", (unsigned long long *)&stats.values[0]) != 1) + stats.values[0] = 0; + prepare_block(pmns, path, subsys, name, + metric, group, domain, &stats, 1); + } + fclose(fp); + return 0; +} + +static int +prepare_blocks_ull(__pmnsTree *pmns, const char *path, cgroup_subsys_t *subsys, + const char *name, int metric, int group, int domain) +{ + char buf[MAXPATHLEN]; + cgroup_metrics_t *metrics = &subsys->metrics[metric]; + block_stats_t stats; + FILE *fp; + char *p; + int j; + + if (pmDebug & DBG_TRACE_APPL0) + fprintf(stderr, "prepare_blocks_ull: %s metric=%d group=%d domain=%d\n", + path, metric, group, domain); + + if (metric % BLKIOS != 0) + return 0; + + snprintf(buf, sizeof(buf), "%s/%s.%s", path, subsys->name, metrics->suffix); + buf[strlen(buf) - sizeof("read")] = '\0'; + + if (pmDebug & DBG_TRACE_APPL2) + fprintf(stderr, "prepare_blocks_ull: opening \"%s\"\n", buf); + + if ((fp = fopen(buf, "r")) == NULL) + return -oserror(); + + memset(&stats, 0, sizeof(stats)); + while ((fgets(buf, sizeof(buf), fp)) != NULL) { + if (sscanf(buf, "%u:%u ", &stats.dev.major, &stats.dev.minor) != 2) + continue; + + /* iterate over read/write/sync/async/total (reverse for async) */ + for (j = BLKIOS-1; j >= 0; j--) { + if ((p = strcasestr(buf, block_stats_names[j])) == NULL) + continue; + p += strlen(block_stats_names[j]) + 1; + if (sscanf(p, "%llu", (unsigned long long *)&stats.values[j]) != 1) + stats.values[j] = 0; + break; + } + + if (j == BLKIOS - 1) { /* Total: last one, update incore structures */ + prepare_block(pmns, path, subsys, name, + metric, group, domain, &stats, BLKIOS); + /* reset on-stack structure for next outer loop iteration */ + memset(&stats, 0, sizeof(stats)); + } + } + 
fclose(fp); + return 0; +} + +static int +prepare_string(__pmnsTree *pmns, const char *path, cgroup_subsys_t *subsys, + const char *name, int metric, int group, int domain) +{ + char buffer[MAXPATHLEN]; + cgroup_group_t *groups = &subsys->groups[group]; + cgroup_metrics_t *metrics = &subsys->metrics[metric]; + pmAtomValue *atoms = groups->metric_values[metric].atoms; + char *p = &buffer[0]; + + if (read_values(p, sizeof(buffer), path, subsys->name, metrics->suffix) < 0) + return -oserror(); + + if ((atoms = malloc(sizeof(pmAtomValue))) == NULL) + return -oserror(); + if ((atoms[0].cp = strdup(buffer)) == NULL) { + free(atoms); + return -oserror(); + } + groups->metric_values[metric].item = metric; + groups->metric_values[metric].atoms = atoms; + groups->metric_values[metric].atom_count = 1; + update_pmns(pmns, subsys, name, metrics, group, domain); + return 0; +} + +static void +translate(char *dest, const char *src, size_t size) +{ + char *p; + + if (*src != '\0') /* non-root */ + *dest = '.'; + strncpy(dest, src, size); + for (p = dest; *p; p++) { + if (*p == '/') + *p = '.'; + } +} + +static int +namespace(__pmnsTree *pmns, cgroup_subsys_t *subsys, + const char *cgrouppath, const char *cgroupname, int domain) +{ + int i, id; + size_t size; + cgroup_values_t *cvp; + char group[128]; + + translate(&group[0], cgroupname, sizeof(group)); + + /* allocate space for this group */ + size = (subsys->group_count + 1) * sizeof(cgroup_group_t); + subsys->groups = (cgroup_group_t *)realloc(subsys->groups, size); + if (subsys->groups == NULL) + return -oserror(); + + /* allocate space for all values up-front */ + size = subsys->metric_count; + cvp = (cgroup_values_t *)calloc(size, sizeof(cgroup_values_t)); + if (cvp == NULL) + return -oserror(); + + id = subsys->group_count++; + memset(&subsys->groups[id], 0, sizeof(cgroup_group_t)); + subsys->groups[id].id = id; + subsys->groups[id].metric_values = cvp; + + for (i = 0; i < size; i++) { + cgroup_metrics_t *metrics = &subsys->metrics[i]; + metrics->prepare(pmns, cgrouppath, subsys, group, i, id, domain); + } + return 1; +} + +char * +cgroup_find_subsys(pmInDom indom, void *data) +{ + static char dunno[] = "?"; + static char opts[256]; + char buffer[256]; + char *s, *out = NULL; + filesys_t *fs = (filesys_t *)data; + + memset(opts, 0, sizeof(opts)); + strncpy(buffer, fs->options, sizeof(buffer)); + + s = strtok(buffer, ","); + while (s) { + if (pmdaCacheLookupName(indom, s, NULL, NULL) == PMDA_CACHE_ACTIVE) { + if (out) { /* append option */ + strcat(out, ","); + strcat(out, s); + out += strlen(s) + 1; /* +1 => cater for comma */ + } else { /* first option */ + strcat(opts, s); + out = opts + strlen(s); + } + } + s = strtok(NULL, ","); + } + if (out) + return opts; + return dunno; +} + +/* Ensure cgroup name can be used as a PCP namespace entry, ignore it if not */ +static int +valid_pmns_name(char *name) +{ + if (!isalpha((int)name[0])) + return 0; + for (; *name != '\0'; name++) + if (!isalnum((int)*name) && *name != '_') + return 0; + return 1; +} + +static int +cgroup_scan(const char *mnt, const char *path, cgroup_subsys_t *subsys, + int domain, __pmnsTree *pmns, int root) +{ + int sts, length; + DIR *dirp; + struct stat sbuf; + struct dirent *dp; + char *cgroupname; + char cgrouppath[MAXPATHLEN]; + + if (root) { + snprintf(cgrouppath, sizeof(cgrouppath), "%s%s", proc_statspath, mnt); + length = strlen(cgrouppath); + } else { + snprintf(cgrouppath, sizeof(cgrouppath), "%s%s/%s", proc_statspath, mnt, path); + length = strlen(proc_statspath) + 
strlen(mnt) + 1; + } + + if ((dirp = opendir(cgrouppath)) == NULL) + return -oserror(); + + cgroupname = &cgrouppath[length]; + sts = namespace(pmns, subsys, cgrouppath, cgroupname, domain); + + /* + * readdir - descend into directories to find all cgroups, then + * populate namespace with <controller>[.<groupname>].<metrics> + */ + while ((dp = readdir(dirp)) != NULL) { + int lsts; + if (!valid_pmns_name(dp->d_name)) + continue; + if (path[0] == '\0') + snprintf(cgrouppath, sizeof(cgrouppath), "%s%s/%s", + proc_statspath, mnt, dp->d_name); + else + snprintf(cgrouppath, sizeof(cgrouppath), "%s%s/%s/%s", + proc_statspath, mnt, path, dp->d_name); + cgroupname = &cgrouppath[length]; + if (stat(cgrouppath, &sbuf) < 0) + continue; + if (!(S_ISDIR(sbuf.st_mode))) + continue; + + lsts = namespace(pmns, subsys, cgrouppath, cgroupname, domain); + if (lsts > 0) + sts = 1; + + /* + * also scan for any child cgroups, but cgroup_scan() may return + * an error + */ + lsts = cgroup_scan(mnt, cgroupname, subsys, domain, pmns, 0); + if (lsts > 0) + sts = 1; + } + closedir(dirp); + return sts; +} + +static void +reset_subsys_stats(cgroup_subsys_t *subsys) +{ + int g, k, a; + + for (g = 0; g < subsys->group_count; g++) { + cgroup_group_t *group = &subsys->groups[g]; + for (k = 0; k < subsys->metric_count; k++) { + pmAtomValue *atoms = group->metric_values[k].atoms; + if (subsys->metrics[k].dynamic) + for (a = 0; a < group->metric_values[k].atom_count; a++) + free(atoms[a].cp); + free(atoms); + } + free(group->metric_values); + if (group->process_list.size) + free(group->process_list.pids); + memset(group, 0, sizeof(cgroup_group_t)); + } + subsys->group_count = 0; +} + +int +refresh_cgroups(pmdaExt *pmda, __pmnsTree **pmns) +{ + int i, sts, mtab = 0; + int domain = pmda->e_domain; + filesys_t *fs; + time_t now; + static time_t before; + static __pmnsTree *beforetree; + __pmnsTree *tree = pmns ? 
*pmns : NULL; + pmInDom mounts = INDOM(CGROUP_MOUNTS_INDOM); + pmInDom devices = INDOM(DISK_INDOM); + + now = time(NULL); + if (tree) { + if (now == before) { + *pmns = beforetree; + return 0; + } + } else if (now == before) + return 0; + + refresh_cgroup_filesys(mounts); + refresh_cgroup_devices(devices); + + if (tree) + __pmFreePMNS(tree); + + if ((sts = __pmNewPMNS(&tree)) < 0) { + __pmNotifyErr(LOG_ERR, "%s: failed to create new pmns: %s\n", + pmProgname, pmErrStr(sts)); + return 0; + } + + for (i = 0; i < sizeof(controllers)/sizeof(controllers[0]); i++) { + cgroup_subsys_t *subsys = &controllers[i]; + + /* + * Fetch latest state for subsystem and groups of the given clusters, + * by walking the cgroup mounts, finding the mounts of this subsystem + * type, and descending into all of the groups (subdirs) + */ + reset_subsys_stats(subsys); + + pmdaCacheOp(mounts, PMDA_CACHE_WALK_REWIND); + while ((sts = pmdaCacheOp(mounts, PMDA_CACHE_WALK_NEXT)) != -1) { + if (!pmdaCacheLookup(mounts, sts, NULL, (void **)&fs)) + continue; + if (scan_filesys_options(fs->options, subsys->name) == NULL) + continue; + sts = cgroup_scan(fs->path, "", subsys, domain, tree, 1); + if (sts > 0) + mtab = 1; + } + } + + if (pmns) { + *pmns = tree; + beforetree = tree; + before = now; + } else + __pmFreePMNS(tree); + + return mtab; +} + +/* + * Shared fetch callback for all cgroups metrics + */ +int +cgroup_group_fetch(pmID pmid, unsigned int inst, pmAtomValue *atom) +{ + int i, j, k; + int gid, cluster, metric; + + gid = cgroup_pmid_group(pmid); + metric = cgroup_pmid_metric(pmid); + cluster = proc_pmid_cluster(pmid); + + for (i = 0; i < sizeof(controllers)/sizeof(controllers[0]); i++) { + cgroup_subsys_t *subsys = &controllers[i]; + + if (subsys->cluster != cluster) + continue; + for (j = 0; j < subsys->group_count; j++) { + cgroup_group_t *group = &subsys->groups[j]; + + if (group->id != gid) + continue; + for (k = 0; k < subsys->metric_count; k++) { + cgroup_values_t *cvp = &group->metric_values[k]; + + if (cvp->item != metric) + continue; + else if (cvp->atom_count <= 0) + return PM_ERR_VALUE; + else if (inst == PM_IN_NULL) + inst = 0; + else if (inst >= cvp->atom_count) + return PM_ERR_INST; + else if (cvp->atoms[inst].ull == ULLONG_MAX) + return PM_ERR_INST; + *atom = cvp->atoms[inst]; + return 1; + } + } + } + return PM_ERR_PMID; +} + +/* + * Needs to answer the question: how much extra space needs to be allocated + * in the metric table for (dynamic) cgroup metrics"? We have static entries + * for group ID zero - if we have any non-zero group IDs, we need entries to + * cover those. Return value is the number of additional entries needed. + */ +static void +size_metrictable(int *total, int *trees) +{ + int i, g, maxgroup = 0, nmetrics = 0; + + for (i = 0; i < sizeof(controllers)/sizeof(controllers[0]); i++) { + cgroup_subsys_t *subsys = &controllers[i]; + + for (g = 0; g < subsys->group_count; g++) { + cgroup_group_t *group = &subsys->groups[g]; + + if (group->id > maxgroup) + maxgroup = group->id; + } + nmetrics += subsys->metric_count + 0; /* +1 for task.pid */ + } + + if (pmDebug & DBG_TRACE_APPL0) + fprintf(stderr, "size_metrictable: %d total x %d trees\n", + nmetrics, maxgroup); + + *total = nmetrics; + *trees = maxgroup; +} + +/* + * Create new metric table entry for a group based on an existing one. 
+ */ +static void +refresh_metrictable(pmdaMetric *source, pmdaMetric *dest, int gid) +{ + int domain = pmid_domain(source->m_desc.pmid); + int cluster = proc_pmid_cluster(source->m_desc.pmid); + int item = pmid_item(source->m_desc.pmid); + + memcpy(dest, source, sizeof(pmdaMetric)); + dest->m_desc.pmid = cgroup_pmid_build(domain, cluster, gid, item); + + if (pmDebug & DBG_TRACE_APPL1) + fprintf(stderr, "refresh_metrictable: (%p -> %p)\n", source, dest); + if (pmDebug & DBG_TRACE_APPL0) + fprintf(stderr, "cgroup metric ID dup: %d.[%d.%d].%d - %d.[%d.%d].%d\n", + domain, cluster, + cgroup_pmid_group(source->m_desc.pmid), + cgroup_pmid_metric(source->m_desc.pmid), + pmid_domain(dest->m_desc.pmid), + proc_pmid_cluster(dest->m_desc.pmid), + cgroup_pmid_group(dest->m_desc.pmid), + cgroup_pmid_metric(dest->m_desc.pmid)); +} + +static int +cgroup_text(pmdaExt *pmda, pmID pmid, int type, char **buf) +{ + return PM_ERR_TEXT; +} + +static void +cgroup_metrics_init(pmdaMetric *metrics, int nmetrics) +{ + int i, j, item, cluster = 0; + + for (i = 0; i < sizeof(controllers)/sizeof(controllers[0]); i++) { + cgroup_subsys_t *subsys = &controllers[i]; + + /* set initial default values for controller metrics item field */ + for (j = 0; j < subsys->metric_count; j++) + subsys->metrics[j].item = j; + + /* set initial seed values for dynamic PMIDs in global metric table */ + for (j = item = 0; j < nmetrics; j++) { + if (pmid_cluster(metrics[j].m_desc.pmid) == subsys->cluster) { + if (cluster != subsys->cluster) { + cluster = subsys->cluster; + item = 0; + } + metrics[j].m_desc.pmid = PMDA_PMID(cluster, item++); + } + } + } +} + +void +cgroup_init(pmdaMetric *metrics, int nmetrics) +{ + static int set[] = { + CLUSTER_BLKIO_GROUPS, + CLUSTER_CPUSET_GROUPS, + CLUSTER_CPUACCT_GROUPS, + CLUSTER_CPUSCHED_GROUPS, + CLUSTER_MEMORY_GROUPS, + CLUSTER_NET_CLS_GROUPS, + }; + + cgroup_metrics_init(metrics, nmetrics); + + pmdaDynamicPMNS(CGROUP_ROOT, + set, sizeof(set) / sizeof(set[0]), + refresh_cgroups, cgroup_text, + refresh_metrictable, size_metrictable, + metrics, nmetrics); + pmdaDynamicSetClusterMask(CGROUP_ROOT, CGROUP_MASK); +} diff --git a/src/pmdas/linux_proc/cgroups.h b/src/pmdas/linux_proc/cgroups.h new file mode 100644 index 0000000..d2ec430 --- /dev/null +++ b/src/pmdas/linux_proc/cgroups.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2013-2014 Red Hat. + * Copyright (c) 2010 Aconex. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ +#ifndef _CGROUP_H +#define _CGROUP_H + +/* + * Note: cgroup metrics have an "extra" component - the cluster part + * of the PMID (12 bits) is split into two (6 bits each): the bottom + * part contains the regular metric (cluster) ID while the top holds + * the cgroup ID (index - e.g. this is the 3rd cgroup we've seen for + * a particular subsystem). 
+ */ + +#define CGROUP_SPLIT 6 +#define CGROUP_MASK ((1 << CGROUP_SPLIT) - 1) + +static inline pmID +cgroup_pmid_build(unsigned int domain, unsigned int cluster, + unsigned int gid, unsigned int metric) +{ + return pmid_build(domain, (gid << CGROUP_SPLIT) | cluster, metric); +} + +static inline unsigned int +cgroup_pmid_group(pmID id) +{ + return pmid_cluster(id) >> CGROUP_SPLIT; +} + +static inline unsigned int +proc_pmid_cluster(pmID id) +{ + return pmid_cluster(id) & CGROUP_MASK; +} + +static inline unsigned int +cgroup_pmid_metric(pmID id) +{ + return pmid_item(id); +} + +/* + * General cgroup interfaces + */ +extern void cgroup_init(pmdaMetric *, int); +extern char *cgroup_find_subsys(pmInDom, void *); +extern int cgroup_group_fetch(pmID, unsigned int, pmAtomValue *); + +/* + * Metric name and value refresh interfaces + */ +extern int refresh_cgroups(pmdaExt *, __pmnsTree **); + +/* + * Indom-specific interfaces + */ +extern void refresh_cgroup_cpus(pmInDom); +extern void refresh_cgroup_devices(pmInDom); +extern void refresh_cgroup_filesys(pmInDom); +extern void refresh_cgroup_subsys(pmInDom); + +#endif /* _CGROUP_H */ diff --git a/src/pmdas/linux_proc/clusters.h b/src/pmdas/linux_proc/clusters.h new file mode 100644 index 0000000..e1c8c2a --- /dev/null +++ b/src/pmdas/linux_proc/clusters.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2013-2014 Red Hat. + * Copyright (c) 2005,2007-2008 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ + +#ifndef _CLUSTERS_H +#define _CLUSTERS_H + +/* + * fetch cluster numbers ... to manage the PMID migration after the + * linux -> linux + proc PMDAs split, these need to match the enum + * assigned values for CLUSTER_* from the linux PMDA. 
+ */ +#define CLUSTER_PID_STAT 8 /* /proc/<pid>/stat */ +#define CLUSTER_PID_STATM 9 /* /proc/<pid>/statm + /proc/<pid>/maps */ +#define CLUSTER_CONTROL 10 /* instance + value fetch control metrics */ +#define CLUSTER_PID_CGROUP 11 /* /proc/<pid>/cgroup */ +#define CLUSTER_PID_LABEL 12 /* /proc/<pid>/attr/current (label) */ +#define CLUSTER_PROC_RUNQ 13 /* number of processes in various states */ +#define CLUSTER_PID_STATUS 24 /* /proc/<pid>/status */ +#define CLUSTER_PID_SCHEDSTAT 31 /* /proc/<pid>/schedstat */ +#define CLUSTER_PID_IO 32 /* /proc/<pid>/io */ +#define CLUSTER_CGROUP_SUBSYS 37 /* /proc/cgroups control group subsystems */ +#define CLUSTER_CGROUP_MOUNTS 38 /* /proc/mounts active control groups */ +#define CLUSTER_CPUSET_GROUPS 39 /* cpuset control groups */ +#define CLUSTER_CPUACCT_GROUPS 41 /* cpu accounting control groups */ +#define CLUSTER_CPUSCHED_GROUPS 43 /* scheduler control groups */ +#define CLUSTER_MEMORY_GROUPS 45 /* memory control groups */ +#define CLUSTER_NET_CLS_GROUPS 47 /* network classification control groups */ +#define CLUSTER_BLKIO_GROUPS 49 /* blkio control groups */ +#define CLUSTER_PID_FD 51 /* /proc/<pid>/fd */ + /* Note: do not use higher than (1 << CGROUP_SPLIT)-1 as cluster ID */ + +#define MIN_CLUSTER 8 /* first cluster number we use here */ +#define NUM_CLUSTERS 52 /* one more than highest cluster number used */ +#define MAX_CLUSTER 63 /* last available - fill gaps if more needed */ + +#endif /* _CLUSTERS_H */ diff --git a/src/pmdas/linux_proc/contexts.c b/src/pmdas/linux_proc/contexts.c new file mode 100644 index 0000000..f213c14 --- /dev/null +++ b/src/pmdas/linux_proc/contexts.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2013 Red Hat. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ */ + +#include "pmapi.h" +#include "impl.h" +#include "pmda.h" +#include "contexts.h" + +static proc_perctx_t *ctxtab; +static int num_ctx; +static uid_t baseuid; +static gid_t basegid; + +static void +proc_ctx_clear(int ctx) +{ + ctxtab[ctx].state = CTX_INACTIVE; + ctxtab[ctx].uid = -1; + ctxtab[ctx].gid = -1; + ctxtab[ctx].threads = 1; + ctxtab[ctx].cgroups = NULL; +} + +void +proc_ctx_end(int ctx) +{ + if (ctx < 0 || ctx >= num_ctx || ctxtab[ctx].state == CTX_INACTIVE) + return; + if (ctxtab[ctx].state & CTX_CGROUPS) + free((void *)ctxtab[ctx].cgroups); + proc_ctx_clear(ctx); +} + +static void +proc_ctx_growtab(int ctx) +{ + size_t need; + + if (ctx < num_ctx) + return; + + need = (ctx + 1) * sizeof(ctxtab[0]); + ctxtab = (proc_perctx_t *)realloc(ctxtab, need); + if (ctxtab == NULL) + __pmNoMem("proc ctx table", need, PM_FATAL_ERR); + while (num_ctx <= ctx) + proc_ctx_clear(num_ctx++); +} + +static void +proc_ctx_set_userid(int ctx, const char *value) +{ + proc_ctx_growtab(ctx); + ctxtab[ctx].uid = atoi(value); + ctxtab[ctx].state |= (CTX_ACTIVE | CTX_USERID); +} + +static void +proc_ctx_set_groupid(int ctx, const char *value) +{ + proc_ctx_growtab(ctx); + ctxtab[ctx].gid = atoi(value); + ctxtab[ctx].state |= (CTX_ACTIVE | CTX_GROUPID); +} + +int +proc_ctx_attrs(int ctx, int attr, const char *value, int length, pmdaExt *pmda) +{ + if (pmDebug & DBG_TRACE_AUTH) { + char buffer[256]; + + if (!__pmAttrStr_r(attr, value, buffer, sizeof(buffer))) { + __pmNotifyErr(LOG_ERR, "Bad Attribute: ctx=%d, attr=%d\n", ctx, attr); + } else { + buffer[sizeof(buffer)-1] = '\0'; + __pmNotifyErr(LOG_INFO, "Attribute: ctx=%d %s", ctx, buffer); + } + } + + switch (attr) { + case PCP_ATTR_USERID: + proc_ctx_set_userid(ctx, value); + break; + case PCP_ATTR_GROUPID: + proc_ctx_set_groupid(ctx, value); + break; + default: + break; + } + return 0; +} + +void +proc_ctx_init(void) +{ + baseuid = getuid(); + basegid = getgid(); +} + +int +proc_ctx_access(int ctx) +{ + proc_perctx_t *pp; + int accessible = 0; + + if (ctx < 0 || ctx >= num_ctx) + return accessible; + pp = &ctxtab[ctx]; + if (pp->state == CTX_INACTIVE) + return accessible; + + if (pp->state & CTX_GROUPID) { + accessible++; + if (basegid != pp->gid) { + if (setegid(pp->gid) < 0) { + __pmNotifyErr(LOG_ERR, "setegid(%d) access failed: %s\n", + pp->gid, osstrerror()); + accessible--; + } + } + } + if (pp->state & CTX_USERID) { + accessible++; + if (baseuid != pp->uid) { + if (seteuid(pp->uid) < 0) { + __pmNotifyErr(LOG_ERR, "seteuid(%d) access failed: %s\n", + pp->uid, osstrerror()); + accessible--; + } + } + } + return (accessible > 1); +} + +int +proc_ctx_revert(int ctx) +{ + proc_perctx_t *pp; + + if (ctx < 0 || ctx >= num_ctx) + return 0; + pp = &ctxtab[ctx]; + if (pp->state == CTX_INACTIVE) + return 0; + + if ((pp->state & CTX_USERID) && baseuid != pp->uid) { + if (seteuid(baseuid) < 0) + __pmNotifyErr(LOG_ERR, "seteuid(%d) revert failed: %s\n", + baseuid, osstrerror()); + } + if ((pp->state & CTX_GROUPID) && basegid != pp->gid) { + if (setegid(basegid) < 0) + __pmNotifyErr(LOG_ERR, "setegid(%d) revert failed: %s\n", + basegid, osstrerror()); + } + return 0; +} + +unsigned int +proc_ctx_threads(int ctx, unsigned int threads) +{ + proc_perctx_t *pp; + + if (ctx < 0 || ctx >= num_ctx) + return threads; /* fallback to default */ + pp = &ctxtab[ctx]; + if (pp->state == CTX_INACTIVE) + return threads; /* fallback to default */ + + if (pp->state & CTX_THREADS) + return pp->threads; /* client setting */ + + return threads; /* fallback to default */ +} + 
+int +proc_ctx_set_threads(int ctx, unsigned int threads) +{ + proc_perctx_t *pp; + + if (ctx < 0 || ctx >= num_ctx) + return PM_ERR_NOCONTEXT; + pp = &ctxtab[ctx]; + if (pp->state == CTX_INACTIVE) + return PM_ERR_NOCONTEXT; + if (threads > 1) + return PM_ERR_CONV; + + pp->state |= CTX_THREADS; + pp->threads = threads; + return 0; +} + +const char * +proc_ctx_cgroups(int ctx, const char *cgroups) +{ + proc_perctx_t *pp; + + if (ctx < 0 || ctx >= num_ctx) + return cgroups; /* fallback to default */ + pp = &ctxtab[ctx]; + if (pp->state == CTX_INACTIVE) + return cgroups; /* fallback to default */ + + if (pp->state & CTX_CGROUPS) + return pp->cgroups; /* client setting */ + + return cgroups; /* fallback to default */ +} + +int +proc_ctx_set_cgroups(int ctx, const char *cgroups) +{ + proc_perctx_t *pp; + + if (ctx < 0 || ctx >= num_ctx) + return PM_ERR_NOCONTEXT; + pp = &ctxtab[ctx]; + if (pp->state == CTX_INACTIVE) + return PM_ERR_NOCONTEXT; + if (cgroups == NULL || cgroups[0] == '\0') + return PM_ERR_CONV; + + pp->state |= CTX_CGROUPS; + pp->cgroups = cgroups; + return 0; +} diff --git a/src/pmdas/linux_proc/contexts.h b/src/pmdas/linux_proc/contexts.h new file mode 100644 index 0000000..c2abe8c --- /dev/null +++ b/src/pmdas/linux_proc/contexts.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2013 Red Hat. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ + +#ifndef _CONTEXTS_H +#define _CONTEXTS_H + +/* + * Handle newly arriving clients, security attributes being set on 'em, + * switching to alternative accounts (temporarily) and back, and client + * termination. State maintained in a global table, with a high-water + * allocator and active/inactive entry tracking. + * + * The proc.control.perclient metrics also have state tracked here now. + */ + +enum { + CTX_INACTIVE = 0x0, + CTX_ACTIVE = 0x1, + CTX_USERID = 0x2, + CTX_GROUPID = 0x4, + CTX_THREADS = 0x8, + CTX_CGROUPS = 0x10, +}; + +typedef struct { + unsigned int state; + uid_t uid; + gid_t gid; + unsigned int threads; + const char *cgroups; +} proc_perctx_t; + +extern void proc_ctx_init(void); +extern int proc_ctx_attrs(int, int, const char *, int, pmdaExt *); +extern void proc_ctx_end(int); + +extern int proc_ctx_access(int); +extern int proc_ctx_revert(int); + +extern unsigned int proc_ctx_threads(int, unsigned int); +extern int proc_ctx_set_threads(int, unsigned int); + +extern const char *proc_ctx_cgroups(int, const char *); +extern int proc_ctx_set_cgroups(int, const char *); + +#endif /* _CONTEXTS_H */ diff --git a/src/pmdas/linux_proc/getinfo.c b/src/pmdas/linux_proc/getinfo.c new file mode 100644 index 0000000..b4633a5 --- /dev/null +++ b/src/pmdas/linux_proc/getinfo.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2010 Aconex. All Rights Reserved. + * Copyright (c) 2000,2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ + +#include <sys/stat.h> +#include <sys/dir.h> +#include <ctype.h> +#include <fcntl.h> +#include "pmapi.h" + +char * +get_ttyname_info(int pid, dev_t dev, char *ttyname) +{ + DIR *dir; + struct dirent *dp; + struct stat sbuf; + int found=0; + char procpath[MAXPATHLEN]; + char ttypath[MAXPATHLEN]; + + sprintf(procpath, "/proc/%d/fd", pid); + if ((dir = opendir(procpath)) != NULL) { + while ((dp = readdir(dir)) != NULL) { + if (!isdigit((int)dp->d_name[0])) + continue; + sprintf(procpath, "/proc/%d/fd/%s", pid, dp->d_name); + if (realpath(procpath, ttypath) == NULL || stat(ttypath, &sbuf) < 0) + continue; + if (S_ISCHR(sbuf.st_mode) && dev == sbuf.st_rdev) { + found=1; + break; + } + } + closedir(dir); + } + + if (!found) + strcpy(ttyname, "?"); + else + /* skip the "/dev/" prefix */ + strcpy(ttyname, &ttypath[5]); + + return ttyname; +} diff --git a/src/pmdas/linux_proc/getinfo.h b/src/pmdas/linux_proc/getinfo.h new file mode 100644 index 0000000..9006c00 --- /dev/null +++ b/src/pmdas/linux_proc/getinfo.h @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2010 Aconex. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ + +extern char *get_ttyname_info(int, dev_t, char *); + diff --git a/src/pmdas/linux_proc/help b/src/pmdas/linux_proc/help new file mode 100644 index 0000000..6640a08 --- /dev/null +++ b/src/pmdas/linux_proc/help @@ -0,0 +1,220 @@ +# +# Copyright (c) 2000,2004-2008 Silicon Graphics, Inc. All Rights Reserved. +# Portions Copyright (c) International Business Machines Corp., 2002 +# Portions Copyright (c) 2007-2009 Aconex. All Rights Reserved. +# Portions Copyright (c) 2013 Red Hat. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# Linux proc PMDA help file in the ASCII format +# +# lines beginning with a # are ignored +# lines beginning @ introduce a new entry of the form +# @ metric_name oneline-text +# help test goes +# here over multiple lines +# ... 
+# +# the metric_name is decoded against the default PMNS -- as a special case, +# a name of the form NNN.MM (for numeric NNN and MM) is interpreted as an +# instance domain identification, and the text describes the instance domain +# +# blank lines before the @ line are ignored +# + +@ cgroup.subsys.hierarchy subsystem hierarchy from /proc/cgroups +@ cgroup.subsys.count count of known subsystems in /proc/cgroups +@ cgroup.mounts.subsys mount points for each cgroup subsystem +@ cgroup.mounts.count count of cgroup filesystem mount points + +@ proc.nprocs instantaneous number of processes +@ proc.psinfo.pid process identifier +@ proc.psinfo.psargs full command string +@ proc.psinfo.cmd command name +@ proc.psinfo.sname process state identifier (see ps(1)). See also proc.runq metrics. +@ proc.psinfo.ppid parent process identifier +@ proc.psinfo.pgrp process group identifier +@ proc.psinfo.session process session identifier +@ proc.psinfo.tty controlling tty device number (zero if none) +@ proc.psinfo.tty_pgrp controlling tty process group identifier +@ proc.psinfo.flags process state flags, as a bitmap +@ proc.psinfo.minflt count of minor page faults (i.e. reclaims) +@ proc.psinfo.cmin_flt count of minor page faults (i.e. reclaims) of all exited children +@ proc.psinfo.maj_flt count of page faults other than reclaims +@ proc.psinfo.cmaj_flt count of page faults other than reclaims of all exited children +@ proc.psinfo.utime time (in ms) spent executing user code since process started +@ proc.psinfo.stime time (in ms) spent executing system code (calls) since process started +@ proc.psinfo.cutime time (in ms) spent executing user code of all exited children +@ proc.psinfo.cstime time (in ms) spent executing system code of all exited children +@ proc.psinfo.priority priority value +@ proc.psinfo.nice process nice value (negative nice values are lower priority) +@ proc.psinfo.it_real_value current interval timer value (zero if none) +@ proc.psinfo.start_time start time of the process relative to system boot time in seconds +@ proc.psinfo.vsize virtual size of the process in Kbytes +@ proc.psinfo.rss resident set size (i.e. physical memory) of the process +@ proc.psinfo.rss_rlim limit on resident set size of process +@ proc.psinfo.start_code address of the start of the code segment for the process +@ proc.psinfo.end_code address of the end of the code segment for the process +@ proc.psinfo.start_stack address of the stack segment for the process +@ proc.psinfo.esp the value in the esp field of struct task_struct for the process +@ proc.psinfo.eip the value in the eip field of struct task_struct for the process +@ proc.psinfo.signal the value in the signal field of struct task_struct for the process +@ proc.psinfo.blocked the value in the blocked field of struct task_struct for the process +@ proc.psinfo.sigignore the value in the sigignore field of struct task_struct for the process +@ proc.psinfo.sigcatch the value in the sigcatch field of struct task_struct for the process +@ proc.psinfo.wchan wait channel, kernel address this process is blocked or sleeping on +@ proc.psinfo.nswap count of page swap operations +@ proc.psinfo.cnswap count of page swap operations of all exited children +@ proc.psinfo.exit_signal the value in the exit_signal field of struct task_struct for the process +@ proc.psinfo.ttyname name of controlling tty device, or "?" if none. See also proc.psinfo.tty. 
+@ proc.psinfo.processor last CPU the process was running on
+@ proc.psinfo.wchan_s name of an event for which the process is sleeping (if blank, the process is running).
+This field needs access to a namelist file for proper
+address-to-symbol name translation. If no namelist file
+is available, the address is printed instead. The namelist
+file must match the current Linux kernel exactly.
+The search path for the namelist file is as follows:
+ /boot/System.map-`uname -r`
+ /boot/System.map
+ /lib/modules/`uname -r`/System.map
+ /usr/src/linux/System.map
+ /System.map
+@ proc.psinfo.signal_s pending signals mask in string form (from /proc/<pid>/status)
+@ proc.psinfo.blocked_s blocked signals mask in string form (from /proc/<pid>/status)
+@ proc.psinfo.sigignore_s ignored signals mask in string form (from /proc/<pid>/status)
+@ proc.psinfo.sigcatch_s caught signals mask in string form (from /proc/<pid>/status)
+@ proc.psinfo.threads number of threads (from /proc/<pid>/status)
+@ proc.psinfo.cgroups list of the process's cgroups (from /proc/<pid>/cgroup)
+@ proc.psinfo.labels list of the process's security labels (from /proc/<pid>/attr/current)
+@ proc.memory.size instantaneous virtual size of process, excluding page table and task structure.
+@ proc.memory.rss instantaneous resident size of process, excluding page table and task structure.
+@ proc.memory.share instantaneous amount of memory shared by this process with other processes
+@ proc.memory.textrss instantaneous resident size of process code segment in Kbytes
+@ proc.memory.librss instantaneous resident size of library code mapped by the process, in Kbytes
+@ proc.memory.datrss instantaneous resident size of process data segment, in Kbytes
+@ proc.memory.dirty instantaneous amount of memory that has been modified by the process, in Kbytes
+@ proc.memory.maps table of memory mapped by process in string form from /proc/<pid>/maps
+@ proc.memory.vmsize total virtual memory (from /proc/<pid>/status)
+@ proc.memory.vmlock locked virtual memory (from /proc/<pid>/status)
+@ proc.memory.vmrss resident virtual memory (from /proc/<pid>/status)
+@ proc.memory.vmdata virtual memory used for data (from /proc/<pid>/status)
+@ proc.memory.vmstack virtual memory used for stack (from /proc/<pid>/status)
+@ proc.memory.vmexe virtual memory used for non-library executable code (from /proc/<pid>/status)
+@ proc.memory.vmlib virtual memory used for libraries (from /proc/<pid>/status)
+@ proc.memory.vmswap swapped-out virtual memory (from /proc/<pid>/status)
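+#
+# Rough sketch only: the proc.memory.vm* metrics above are parsed from the
+# "Vm..." lines of /proc/<pid>/status, for example (field values invented
+# here for illustration):
+#
+#   VmSize:    24712 kB   -> proc.memory.vmsize
+#   VmLck:         0 kB   -> proc.memory.vmlock
+#   VmRSS:      4212 kB   -> proc.memory.vmrss
+#   VmData:     1356 kB   -> proc.memory.vmdata
+#   VmStk:       136 kB   -> proc.memory.vmstack
+#   VmExe:       912 kB   -> proc.memory.vmexe
+#   VmLib:      3460 kB   -> proc.memory.vmlib
+#   VmSwap:        0 kB   -> proc.memory.vmswap
+#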
+@ proc.id.uid real user ID from /proc/<pid>/status
+@ proc.id.euid effective user ID from /proc/<pid>/status
+@ proc.id.suid saved user ID from /proc/<pid>/status
+@ proc.id.fsuid filesystem user ID from /proc/<pid>/status
+@ proc.id.gid real group ID from /proc/<pid>/status
+@ proc.id.egid effective group ID from /proc/<pid>/status
+@ proc.id.sgid saved group ID from /proc/<pid>/status
+@ proc.id.fsgid filesystem group ID from /proc/<pid>/status
+@ proc.id.uid_nm real user name based on real user ID from /proc/<pid>/status
+@ proc.id.euid_nm effective user name based on effective user ID from /proc/<pid>/status
+@ proc.id.suid_nm saved user name based on saved user ID from /proc/<pid>/status
+@ proc.id.fsuid_nm filesystem user name based on filesystem user ID from /proc/<pid>/status
+@ proc.id.gid_nm real group name based on real group ID from /proc/<pid>/status
+@ proc.id.egid_nm effective group name based on effective group ID from /proc/<pid>/status
+@ proc.id.sgid_nm saved group name based on saved group ID from /proc/<pid>/status
+@ proc.id.fsgid_nm filesystem group name based on filesystem group ID from /proc/<pid>/status
+
+@ proc.runq.runnable number of runnable (on run queue) processes
+Instantaneous number of runnable (on run queue) processes, state 'R' in ps
+@ proc.runq.blocked number of processes in uninterruptible sleep
+Instantaneous number of processes in uninterruptible sleep, state 'D' in ps
+@ proc.runq.sleeping number of processes sleeping
+Instantaneous number of processes sleeping, state 'S' in ps
+@ proc.runq.stopped number of traced, stopped or suspended processes
+Instantaneous number of traced, stopped or suspended processes, state
+'T' in ps
+@ proc.runq.swapped number of processes that are swapped
+Instantaneous number of processes (excluding kernel threads) that are
+swapped, state 'SW' in ps
+@ proc.runq.defunct number of defunct/zombie processes
+Instantaneous number of defunct/zombie processes, state 'Z' in ps
+@ proc.runq.unknown number of processes in an unknown state
+Instantaneous number of processes in an unknown state, including all
+kernel threads
+@ proc.runq.kernel number of kernel threads
+Instantaneous number of processes with virtual size of zero (kernel threads)
+
+@ proc.io.rchar read(), readv() and sendfile() receive bytes
+Extended accounting information - count of the number of bytes that
+have passed over the read(2), readv(2) and sendfile(2) syscalls by
+each process.
+
+@ proc.io.wchar write(), writev() and sendfile() send bytes
+Extended accounting information - count of the number of bytes that
+have passed over the write(2), writev(2) and sendfile(2) syscalls by
+each process.
+
+@ proc.io.syscr read(), readv() and sendfile() receive system calls
+Extended accounting information - count of the number of calls to the
+read(2), readv(2) and sendfile(2) syscalls by each process.
+
+@ proc.io.syscw write(), writev() and sendfile() send system calls
+Extended accounting information - count of the number of calls to the
+write(2), writev(2) and sendfile(2) syscalls by each process.
+
+@ proc.io.read_bytes physical device read bytes
+Number of bytes physically read from devices on behalf of this process.
+@ proc.io.write_bytes physical device write bytes
+Number of bytes physically written to devices on behalf of this process.
+This must be reduced by any truncated I/O (proc.io.cancelled_write_bytes).
+@ proc.io.cancelled_write_bytes physical device write cancelled bytes
+Number of bytes cancelled via truncate by this process.
Actual physical +writes for an individual process can be calculated as: + proc.io.write_bytes - proc.io.cancelled_write_bytes. + +@ proc.schedstat.cpu_time runnable (scheduled) + run time +Length of time in nanoseconds that a process has been running, including +scheduling time. +@ proc.schedstat.run_delay run queue time +Length of time in nanoseconds that a process spent waiting to be scheduled +to run in the run queue. +@ proc.schedstat.pcount number of times a process is allowed to run +Number of times a process has been scheduled to run on a CPU (this is +incremented when a task actually reaches a CPU to run on, not simply +when it is added to the run queue). + +@ proc.fd.count open file descriptors +Number of file descriptors this process has open. + +@ proc.control.all.threads process indom includes threads +If set to one, the process instance domain as reported by pmdaproc +contains all threads as well as the processes that started them. +If set to zero, the process instance domain contains only processes. + +This setting is persistent for the life of pmdaproc and affects all +client tools that request instances and values from pmdaproc. +Use either pmstore(1) or pmStore(3) to modify this metric. + +@ proc.control.perclient.threads for a client, process indom includes threads +If set to one, the process instance domain as reported by pmdaproc +contains all threads as well as the processes that started them. +If set to zero, the process instance domain contains only processes. + +This setting is only visible to the active client context. In other +words, storing into this metric has no effect for other monitoring +tools. See proc.control.all.threads, if that is the desired outcome. +Only pmStore(3) can effectively set this metric (pmstore(1) cannot). + +@ proc.control.perclient.cgroups for a client, process indom reflects specific cgroups +If set to the empty string (the default), the process instance domain +as reported by pmdaproc contains all processes. However, a cgroup +name (full path) can be stored into this metric in order to restrict +processes reported to only those within the specified cgroup. This +set is further affected by the value of proc.control.perclient.threads. + +This setting is only visible to the active client context. In other +words, storing into this metric has no effect for other monitoring +tools. pmStore(3) must be used to set this metric (not pmstore(1)). diff --git a/src/pmdas/linux_proc/indom.h b/src/pmdas/linux_proc/indom.h new file mode 100644 index 0000000..9c928cd --- /dev/null +++ b/src/pmdas/linux_proc/indom.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2012-2014 Red Hat. + * Copyright (c) 2010 Aconex. All Rights Reserved. + * Copyright (c) 2005,2007-2008 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ +#ifndef _INDOM_H +#define _INDOM_H + +/* + * indom serial numbers ... to manage the indom migration after the + * linux -> linux + proc PMDAs split, these need to match the enum + * assigned values for *_INDOM from the linux PMDA. 
Consequently, + * the proc indom table is sparse. + */ +#define CPU_INDOM 0 /* - percpu */ +#define DISK_INDOM 1 /* - disks (with normal names) */ +#define DEVT_INDOM 2 /* - disks (major:minor names) */ +#define PROC_INDOM 9 /* - processes */ +#define STRINGS_INDOM 10 /* - fake indom, string hash */ +#define CGROUP_SUBSYS_INDOM 20 /* - control group subsystems */ +#define CGROUP_MOUNTS_INDOM 21 /* - control group mounts */ + +#define MIN_INDOM 0 /* first indom number we use here */ +#define NUM_INDOMS 22 /* one more than highest indom number we use here */ + +extern pmInDom proc_indom(int); +#define INDOM(i) proc_indom(i) + +/* + * Optional path prefix for all stats files, used for testing. + */ +extern char *proc_statspath; +extern FILE *proc_statsfile(const char *, char *, int); + +/* + * static string dictionary - one copy of oft-repeated strings; + * implemented using STRINGS_INDOM and pmdaCache(3) routines. + */ +char *proc_strings_lookup(int); +int proc_strings_insert(const char *); + +#endif /* _INDOM_H */ diff --git a/src/pmdas/linux_proc/ksym.c b/src/pmdas/linux_proc/ksym.c new file mode 100644 index 0000000..1604c84 --- /dev/null +++ b/src/pmdas/linux_proc/ksym.c @@ -0,0 +1,564 @@ +/* + * Copyright (c) International Business Machines Corp., 2002 + * Copyright (c) 2003,2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ + +/* + * This code originally contributed by Mike Mason <mmlnx@us.ibm.com> + * with hints from the procps and ksymoops projects. 
+ */ + +#include <ctype.h> +#include <limits.h> +#include <sys/time.h> +#include <sys/utsname.h> +#include "pmapi.h" +#include "impl.h" +#include "ksym.h" +#include "indom.h" + +static struct ksym *ksym_a; +static size_t ksym_a_sz; + +static int +find_index(__psint_t addr, int lo, int hi) +{ + int mid; + + if (lo > hi) { + return -1; + } + + mid = lo + ((hi - lo) / 2); + if (addr == ksym_a[mid].addr || + (addr > ksym_a[mid].addr && addr < ksym_a[mid+1].addr)) { + return mid; + } + + if (addr > ksym_a[mid].addr) + return find_index(addr, mid+1, hi); + else + return find_index(addr, lo, mid-1); +} + +static char * +find_name_by_addr(__psint_t addr) +{ + int ix = -1; + + if (ksym_a) + ix = find_index(addr, 0, ksym_a_sz - 1); + if (ix < 0) + return NULL; + + return ksym_a[ix].name; +} + +static int +find_dup_name(int maxix, __psint_t addr, char *name) +{ + int i, res; + + for (i = 0; i < maxix; i++) { + if (ksym_a[i].name) { + res = strcmp(ksym_a[i].name, name); + if (res > 0) + break; + if (res == 0) { + if (addr == ksym_a[i].addr) + return KSYM_FOUND; + else + return KSYM_FOUND_MISMATCH; + } + } + } + + return KSYM_NOT_FOUND; +} + +/* Brute force linear search to determine if the kernel version + in System.map matches the running kernel version and returns + a tri-state result as follows: + + 0 no match + 1 _end not found but version matched + 2 _end found and matched + */ +static int +validate_sysmap(FILE *fp, char *version, __psint_t end_addr) +{ + __psint_t addr; + char type; + int ret = 0; + char kname[128]; + + while (fscanf(fp, "%p %c %s", (void **)&addr, &type, kname) != EOF) { + if (end_addr && strcmp(kname, "_end") == 0) { + ret = (end_addr == addr) ? 2 : 0; + break; /* no need to look any further */ + } + if (strcmp(kname, version) == 0) + ret = 1; + } + + return ret; +} + +char * +wchan(__psint_t addr) +{ + static char zero; + char *p = NULL; + + if (addr == 0) /* 0 address means not in kernel space */ + p = &zero; + else if ((p = find_name_by_addr(addr))) { + /* strip off "sys_" or leading "_"s if necessary */ + if (strncmp(p, "sys_", 4) == 0) + p += 4; + while (*p == '_' && *p) + ++p; + } + + return p; +} + +static int +ksym_compare_addr(const void *e1, const void *e2) +{ + struct ksym *ks1 = (struct ksym *) e1; + struct ksym *ks2 = (struct ksym *) e2; + + if (ks1->addr < ks2->addr) + return -1; + if (ks1->addr > ks2->addr) + return 1; + return 0; +} + +static int +ksym_compare_name(const void *e1, const void *e2) +{ + struct ksym *ks1 = (struct ksym *) e1; + struct ksym *ks2 = (struct ksym *) e2; + + return(strcmp(ks1->name, ks2->name)); +} + +static int +read_ksyms(__psint_t *end_addr) +{ + char inbuf[256]; + char *ip; + char *sp; + char *tp; + char *p; + int ix = 0; + int l = 0; + int len; + int err; + FILE *fp; + struct ksym *ksym_tmp; + + *end_addr = 0; + if ((fp = proc_statsfile("/proc/ksyms", inbuf, sizeof(inbuf))) == NULL) + return -oserror(); + + while (fgets(inbuf, sizeof(inbuf), fp) != NULL) { + l++; + + /* + * /proc/ksyms lines look like this on ia32 ... + * + * c8804060 __insmod_rtc_S.text_L4576 [rtc] + * c010a320 disable_irq_nosync + * + * else on ia64 ... 
+ * + * a0000000003e0d28 debug [arsess] + * e002100000891140 disable_irq_nosync + */ + + if (strstr(inbuf, "\n") == NULL) { + fprintf(stderr, "read_ksyms: truncated /proc/ksyms line [%d]: %s\n", l-1, inbuf); + continue; + } + + /* Increase array size, if necessary */ + if (ksym_a_sz < ix+1) { + if (ksym_a_sz > 0) + ksym_a_sz += INCR_KSIZE; + else + ksym_a_sz = INIT_KSIZE; + ksym_tmp = (struct ksym *)realloc(ksym_a, ksym_a_sz * sizeof(struct ksym)); + if (ksym_tmp == NULL) { + err = -oserror(); + free(ksym_a); + fclose(fp); + return err; + } + ksym_a = ksym_tmp; + } + + ip = inbuf; + /* parse over address */ + while (isxdigit((int)*ip)) ip++; + + if (!isspace((int)*ip) || ip-inbuf < 4) { + /* bad format line */ +#if PCP_DEBUG + if (pmDebug & DBG_TRACE_APPL2) { + fprintf(stderr, "read_ksyms: bad addr? %c[%d] line=\"%s\"\n", *ip, (int)(ip-inbuf), inbuf); + } +#endif + continue; + } + + sscanf(inbuf, "%p", (void **)&ksym_a[ix].addr); + + while (isblank((int)*ip)) ip++; + + /* next should be the symbol name */ + sp = ip++; + while (!isblank((int)*ip) &&*ip != '\n') ip++; + + /* strip off GPLONLY_ prefix, if found */ + if (strncmp(sp, "GPLONLY_", 8) == 0) + sp += 8; + + /* + * strip off symbol version suffix, if found ... looking for + * trailing pattern of the form _R.*[0-9a-fA-F]{8,} + * - find rightmost _R, if any + */ + tp = sp; + while ((p = strstr(tp, "_R")) != NULL) tp = p+2; + if (tp > sp) { + /* + * found _R, need the last 8 digits to be hex + */ + if (ip - tp + 1 >= 8) { + for (p = &ip[-8]; p < ip; p++) { + if (!isxdigit((int)*p)) { + tp = sp; + break; + } + } + } + else { + /* not enough characters for [0-9a-fA-f]{8,} at the end */ + tp = sp; + } + } + if (tp > sp) + /* need to strip the trailing _R.*[0-9a-fA-f]{8,} */ + len = tp - sp - 2; + else + len = ip - sp + 1; + + ksym_a[ix].name = strndup(sp, len); + if (ksym_a[ix].name == NULL) { + err = -oserror(); + fclose(fp); + return err; + } + ksym_a[ix].name[len-1] = '\0'; + + if (*end_addr == 0 && strcmp(ksym_a[ix].name, "_end") == 0) + *end_addr = ksym_a[ix].addr; + + if (*ip == '\n') + /* nothing after the symbol name, so no module name */ + goto next; + + while (isblank((int)*ip)) ip++; + + /* next expect module name */ + if (*ip != '[') { +#if PCP_DEBUG + if (pmDebug & DBG_TRACE_APPL2) { + fprintf(stderr, "read_ksyms: bad start module name %c[%d] != [ line=\"%s\"\n", *ip, (int)(ip-inbuf), inbuf); + } +#endif + free(ksym_a[ix].name); + continue; + } + + sp = ++ip; + while (!isblank((int)*ip) && *ip != ']') ip++; + + if (*ip != ']') { +#if PCP_DEBUG + if (pmDebug & DBG_TRACE_APPL2) { + fprintf(stderr, "read_ksyms: bad end module name %c[%d] != ] line=\"%s\"\n", *ip, (int)(ip-inbuf), inbuf); + } +#endif + free(ksym_a[ix].name); + continue; + } + + ksym_a[ix].module = strndup(sp, ip - sp + 1); + if (ksym_a[ix].module == NULL) { + err = -oserror(); + fclose(fp); + free(ksym_a[ix].name); + return err; + } + ksym_a[ix].module[ip - sp] = '\0'; + +next: + ix++; + } + + /* release unused ksym array entries */ + if (ix) { + ksym_tmp = (struct ksym *)realloc(ksym_a, ix * sizeof(struct ksym)); + if (ksym_tmp == NULL) { + free(ksym_a); + fclose(fp); + return -oserror(); + } + ksym_a = ksym_tmp; + } + + ksym_a_sz = ix; + + qsort(ksym_a, ksym_a_sz, sizeof(struct ksym), ksym_compare_name); + + fclose(fp); + +#if PCP_DEBUG + if (pmDebug & DBG_TRACE_APPL2) { + fprintf(stderr, "symbols from ksyms ...\n"); + for (ix = 0; ix < ksym_a_sz; ix++) { + fprintf(stderr, "ksym[%d] " PRINTF_P_PFX "%p %s", ix, (void *)ksym_a[ix].addr, ksym_a[ix].name); + if 
(ksym_a[ix].module != NULL) fprintf(stderr, " [%s]", ksym_a[ix].module); + fprintf(stderr, "\n"); + } + } +#endif + + return ksym_a_sz; +} + +static int +read_sysmap(const char *release, __psint_t end_addr) +{ + char inbuf[256], path[MAXPATHLEN], **fmt; + struct ksym *ksym_tmp; + __psint_t addr; + int ix, res, e; + int l = 0; + char *ip; + char *sp; + int major, minor, patch; + FILE *fp; + char *bestpath = NULL; + int ksym_mismatch_count; + char *sysmap_paths[] = { /* Paths to check for System.map file */ + "%s/boot/System.map-%s", + "%s/boot/System.map", + "%s/lib/modules/%s/System.map", + "%s/usr/src/linux/System.map", + "%s/System.map", + NULL + }; + + /* Create version symbol name to look for in System.map */ + if (sscanf(release, "%d.%d.%d", &major, &minor, &patch) < 3 ) + return -1; + sprintf(inbuf, "Version_%u", KERNEL_VERSION(major, minor, patch)); + + /* + * Walk through System.map path list looking for one that matches + * either _end from /proc/ksyms or the uts version. + */ + for (fmt = sysmap_paths; *fmt; fmt++) { + snprintf(path, MAXPATHLEN, *fmt, proc_statspath, release); + if ((fp = fopen(path, "r"))) { + if ((e = validate_sysmap(fp, inbuf, end_addr)) != 0) { + if (e == 2) { + /* matched _end, so this is the right System.map */ + if (bestpath) + free(bestpath); + bestpath = strdup(path); + } + else + if (e == 1 && !bestpath) + bestpath = strdup(path); + } + fclose(fp); + if (e == 2) { + /* _end matched => don't look any further */ + break; + } + } + } + + if (bestpath) + fprintf(stderr, "NOTICE: using \"%s\" for kernel symbols map.\n", bestpath); + else { + /* Didn't find a valid System.map */ + fprintf(stderr, "Warning: Valid System.map file not found!\n"); + fprintf(stderr, "Warning: proc.psinfo.wchan_s symbol names cannot be derived!\n"); + fprintf(stderr, "Warning: Addresses will be returned for proc.psinfo.wchan_s instead!\n"); + /* Free symbol array */ + for (ix = 0; ix < ksym_a_sz; ix++) { + if (ksym_a[ix].name) + free(ksym_a[ix].name); + if (ksym_a[ix].module) + free(ksym_a[ix].module); + } + free(ksym_a); + ksym_a = NULL; + ksym_a_sz = 0; + return -1; + } + + /* scan the System map */ + if ((fp = proc_statsfile(bestpath, path, sizeof(path))) == NULL) + return -oserror(); + + ix = ksym_a_sz; + + /* Read each line in System.map */ + ksym_mismatch_count = 0; + while (fgets(inbuf, sizeof(inbuf), fp) != NULL) { + /* + * System.map lines look like this on ia32 ... + * + * c010a320 T disable_irq_nosync + * + * else on ia64 ... + * + * e002000000014c80 T disable_irq_nosync + */ + + if (strstr(inbuf, "\n") == NULL) { + fprintf(stderr, "read_sysmap: truncated System.map line [%d]: %s\n", l-1, inbuf); + continue; + } + + /* Increase array size, if necessary */ + if (ksym_a_sz < ix+1) { + ksym_a_sz += INCR_KSIZE; + ksym_tmp = (struct ksym *)realloc(ksym_a, ksym_a_sz * sizeof(struct ksym)); + if (ksym_tmp == NULL) { + free(ksym_a); + goto fail; + } + ksym_a = ksym_tmp; + } + + ip = inbuf; + /* parse over address */ + while (isxdigit((int)*ip)) ip++; + + if (!isspace((int)*ip) || ip-inbuf < 4) { + /* bad format line */ +#if PCP_DEBUG + if (pmDebug & DBG_TRACE_APPL2) { + fprintf(stderr, "read_sysmap: bad addr? 
%c[%d] line=\"%s\"\n", *ip, (int)(ip-inbuf), inbuf); + } +#endif + continue; + } + + sscanf(inbuf, "%p", (void **)&addr); + + while (isblank((int)*ip)) ip++; + + /* Only interested in symbol types that map to code addresses, + * so: t, T, W or A + */ + if (*ip != 't' && *ip != 'T' && *ip != 'W' && *ip != 'A') + continue; + + ip++; + while (isblank((int)*ip)) ip++; + + /* next should be the symbol name */ + sp = ip++; + while (!isblank((int)*ip) && *ip != '\n') ip++; + *ip = '\0'; + + /* Determine if symbol is already in ksym array. + If so, make sure the addresses match. */ + res = find_dup_name(ix - 1, addr, sp); + if (res == KSYM_NOT_FOUND) { /* add it */ + ksym_a[ix].name = strdup(sp); + if (ksym_a[ix].name == NULL) + goto fail; + ksym_a[ix].addr = addr; + ix++; + } + else if (res == KSYM_FOUND_MISMATCH) { + if (ksym_mismatch_count++ < KSYM_MISMATCH_MAX_ALLOWED) { + /* + * ia64 function pointer descriptors make this validation + * next to useless. So only report the first + * KSYM_MISMATCH_MAX_ALLOWED mismatches found. + */ + fprintf(stderr, "Warning: mismatch for \"%s\" between System.map" + " and /proc/ksyms.\n", sp); + } + } + } + + if (ksym_mismatch_count > KSYM_MISMATCH_MAX_ALLOWED) { + fprintf(stderr, "Warning: only reported first %d out of %d mismatches " + "between System.map and /proc/ksyms.\n", + KSYM_MISMATCH_MAX_ALLOWED, ksym_mismatch_count); + } + + /* release unused ksym array entries */ + ksym_tmp = (struct ksym *)realloc(ksym_a, ix * sizeof(struct ksym)); + if (ksym_tmp == NULL) { + free(ksym_a); + goto fail; + } + ksym_a = ksym_tmp; + ksym_a_sz = ix; + + qsort(ksym_a, ksym_a_sz, sizeof(struct ksym), ksym_compare_addr); + +#if PCP_DEBUG + if (pmDebug & DBG_TRACE_APPL2) { + fprintf(stderr, "symbols from ksyms + sysmap ...\n"); + for (ix = 0; ix < ksym_a_sz; ix++) { + fprintf(stderr, "ksym[%d] " PRINTF_P_PFX "%p %s", ix, (void *)ksym_a[ix].addr, ksym_a[ix].name); + if (ksym_a[ix].module != NULL) fprintf(stderr, " [%s]", ksym_a[ix].module); + fprintf(stderr, "\n"); + } + } +#endif + + fclose(fp); + + return ksym_a_sz; + +fail: + e = -oserror(); + if (fp) + fclose(fp); + return e; +} + +void +read_ksym_sources(const char *release) +{ + __psint_t end_addr; + + if (read_ksyms(&end_addr) > 0) /* read /proc/ksyms first */ + read_sysmap(release, end_addr); /* then System.map */ +} diff --git a/src/pmdas/linux_proc/ksym.h b/src/pmdas/linux_proc/ksym.h new file mode 100644 index 0000000..f328ca4 --- /dev/null +++ b/src/pmdas/linux_proc/ksym.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) International Business Machines Corp., 2002 + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * This code contributed by Mike Mason (mmlnx@us.ibm.com) + */ +#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) + +#define INIT_KSIZE 8192 +#define INCR_KSIZE 2048 + +#define KSYM_FOUND_MISMATCH -1 +#define KSYM_NOT_FOUND 0 +#define KSYM_FOUND 1 + +#define KSYM_MISMATCH_MAX_ALLOWED 10 + +struct ksym { + __psint_t addr; + char *name; + char *module; +}; + +extern char *wchan(__psint_t); +extern void read_ksym_sources(const char *); + diff --git a/src/pmdas/linux_proc/linux_proc_migrate.conf b/src/pmdas/linux_proc/linux_proc_migrate.conf new file mode 100644 index 0000000..51190da --- /dev/null +++ b/src/pmdas/linux_proc/linux_proc_migrate.conf @@ -0,0 +1,55 @@ +# Copyright 2012 Red Hat, Inc. All Rights Reserved +# +# pmlogrewrite configuration for migrating archives containing proc metrics +# that were captured prior to the proc PMDA split-off from the Linux PMDA. +# +# Basically, the PMID domain changed from 60 (linux) to 3 (proc) but all +# cluster and item numbers remain unchanged. +# +# Note that the CPU indom is not migrated, even though it is +# used for cgroup.groups.cpuacct.[<group>.]usage_percpu and +# cgroup.groups.cpuacct.usage_percpu because these metrics use a +# the dynamic pmns. To migrate archives containing these metrics, +# a script would be needed to generate the pmlogwrite config based +# on the metric names actually present in the source archive. + +# +# Migrate instance domains +indom 60.9 { indom -> 3.9 } # per-process indom +indom 60.20 { indom -> 3.20 } # cgroup hierarchy indom +indom 60.21 { indom -> 3.21 } # cgroup mount subsys indom + +# +# Migrate the pmid domain for each cluster +metric 60.8.* { pmid -> 3.*.* } # CLUSTER_PID_STAT +metric 60.9.* { pmid -> 3.*.* } # CLUSTER_PID_STATM +metric 60.13.* { pmid -> 3.*.* } # CLUSTER_PROC_RUNQ +metric 60.24.* { pmid -> 3.*.* } # CLUSTER_PID_STATUS +metric 60.31.* { pmid -> 3.*.* } # CLUSTER_PID_SCHEDSTAT +metric 60.32.* { pmid -> 3.*.* } # CLUSTER_PID_IO +metric 60.51.* { pmid -> 3.*.* } # CLUSTER_PID_FD +metric 60.37.* { pmid -> 3.*.* } # CLUSTER_CGROUP_SUBSYS +metric 60.38.* { pmid -> 3.*.* } # CLUSTER_CGROUP_MOUNTS +metric 60.39.* { pmid -> 3.*.* } # CLUSTER_CPUSET_GROUPS +metric 60.40.* { pmid -> 3.*.* } # CLUSTER_CPUSET_PROCS +metric 60.41.* { pmid -> 3.*.* } # CLUSTER_CPUACCT_GROUPS +metric 60.42.* { pmid -> 3.*.* } # CLUSTER_CPUACCT_PROCS +metric 60.43.* { pmid -> 3.*.* } # CLUSTER_CPUSCHED_GROUPS +metric 60.44.* { pmid -> 3.*.* } # CLUSTER_CPUSCHED_PROCS +metric 60.45.* { pmid -> 3.*.* } # CLUSTER_MEMORY_GROUPS +metric 60.46.* { pmid -> 3.*.* } # CLUSTER_MEMORY_PROCS +metric 60.47.* { pmid -> 3.*.* } # CLUSTER_NET_CLS_GROUPS +metric 60.48.* { pmid -> 3.*.* } # CLUSTER_NET_CLS_PROCS + +# +# These two proc.io metrics were incorrectly classified +# +metric proc.io.rchar { + sem -> counter + units -> 1,0,0,BYTE,0,0 +} + +metric proc.io.wchar { + sem -> counter + units -> 1,0,0,BYTE,0,0 +} diff --git a/src/pmdas/linux_proc/pmda.c b/src/pmdas/linux_proc/pmda.c new file mode 100644 index 0000000..2d40a54 --- /dev/null +++ b/src/pmdas/linux_proc/pmda.c @@ -0,0 +1,1896 @@ +/* + * proc PMDA + * + * Copyright (c) 2000,2004,2007-2008 Silicon Graphics, Inc. All Rights Reserved. + * Portions Copyright (c) 2002 International Business Machines Corp. 
+ * Portions Copyright (c) 2007-2011 Aconex. All Rights Reserved. + * Portions Copyright (c) 2012-2014 Red Hat. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ + +#include "pmapi.h" +#include "impl.h" +#include "pmda.h" +#include "domain.h" +#include "contexts.h" + +#include <ctype.h> +#include <unistd.h> +#include <sys/vfs.h> +#include <sys/stat.h> +#include <sys/times.h> +#include <sys/utsname.h> +#include <utmp.h> +#include <pwd.h> +#include <grp.h> + +#include "../linux/convert.h" +#include "clusters.h" +#include "indom.h" + +#include "getinfo.h" +#include "proc_pid.h" +#include "proc_runq.h" +#include "ksym.h" +#include "cgroups.h" + +/* globals */ +static int _isDSO = 1; /* for local contexts */ +static proc_pid_t proc_pid; +static struct utsname kernel_uname; +static proc_runq_t proc_runq; +static int all_access; /* =1 no access checks */ +static int have_access; /* =1 recvd uid/gid */ +static size_t _pm_system_pagesize; +static unsigned int threads; /* control.all.threads */ +static char * cgroups; /* control.all.cgroups */ + +char *proc_statspath = ""; /* optional path prefix for all stats files */ + +/* + * The proc instance domain table is direct lookup and sparse. + * It is initialized in proc_init(), see below. + */ +static pmdaIndom indomtab[NUM_INDOMS]; + +/* + * all metrics supported in this PMDA - one table entry for each + */ +static pmdaMetric metrictab[] = { + +/* + * proc/<pid>/stat cluster + */ + +/* proc.nprocs */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,99), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.pid */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,0), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.cmd */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,1), PM_TYPE_STRING, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.sname */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,2), PM_TYPE_STRING, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.ppid */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,3), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.pgrp */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,4), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.session */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,5), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.tty */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,6), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.tty_pgrp */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,7), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.flags */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,8), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.minflt */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,9), PM_TYPE_U32, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* 
proc.psinfo.cmin_flt */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,10), PM_TYPE_U32, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* proc.psinfo.maj_flt */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,11), PM_TYPE_U32, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* proc.psinfo.cmaj_flt */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,12), PM_TYPE_U32, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* proc.psinfo.utime */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,13), KERNEL_ULONG, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,1,0,0,PM_TIME_MSEC,0) } }, + +/* proc.psinfo.stime */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,14), KERNEL_ULONG, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,1,0,0,PM_TIME_MSEC,0) } }, + +/* proc.psinfo.cutime */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,15), KERNEL_ULONG, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,1,0,0,PM_TIME_MSEC,0) } }, + +/* proc.psinfo.cstime */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,16), KERNEL_ULONG, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,1,0,0,PM_TIME_MSEC,0) } }, + +/* proc.psinfo.priority */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,17), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.nice */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,18), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +#if 0 +/* invalid field */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,19), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0) } }, +#endif + +/* proc.psinfo.it_real_value */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,20), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.start_time */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,21), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) } }, + +/* proc.psinfo.vsize */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,22), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) } }, + +/* proc.psinfo.rss */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,23), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) } }, + +/* proc.psinfo.rss_rlim */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,24), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) } }, + +/* proc.psinfo.start_code */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,25), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.end_code */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,26), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.start_stack */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,27), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.esp */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,28), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.eip */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,29), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.signal */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,30), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.blocked */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,31), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.sigignore */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,32), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + 
PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.sigcatch */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,33), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.wchan */ +#if defined(HAVE_64BIT_PTR) + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,34), PM_TYPE_U64, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, +#elif defined(HAVE_32BIT_PTR) + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,34), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, +#else + error! unsupported pointer size +#endif + +/* proc.psinfo.nswap */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,35), PM_TYPE_U32, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* proc.psinfo.cnswap */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,36), PM_TYPE_U32, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* proc.psinfo.exit_signal */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,37), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.processor -- added by Mike Mason <mmlnx@us.ibm.com> */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,38), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.psinfo.ttyname */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,39), PM_TYPE_STRING, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + +/* proc.psinfo.wchan_s -- added by Mike Mason <mmlnx@us.ibm.com> */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,40), PM_TYPE_STRING, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.psinfo.psargs -- modified by Mike Mason <mmlnx@us.ibm.com> */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STAT,41), PM_TYPE_STRING, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* + * proc/<pid>/status cluster + * Cluster added by Mike Mason <mmlnx@us.ibm.com> + */ + +/* proc.id.uid */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,0), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.euid */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,1), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.suid */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,2), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.fsuid */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,3), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.gid */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,4), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.egid */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,5), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.sgid */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,6), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.fsgid */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,7), PM_TYPE_U32, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.uid_nm */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,8), PM_TYPE_STRING, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.euid_nm */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,9), PM_TYPE_STRING, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.suid_nm */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,10), PM_TYPE_STRING, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.fsuid_nm */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,11), PM_TYPE_STRING, PROC_INDOM, PM_SEM_DISCRETE, + 
PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.gid_nm */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,12), PM_TYPE_STRING, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.egid_nm */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,13), PM_TYPE_STRING, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.sgid_nm */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,14), PM_TYPE_STRING, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.id.fsgid_nm */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,15), PM_TYPE_STRING, PROC_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.psinfo.signal_s */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,16), PM_TYPE_STRING, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.psinfo.blocked_s */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,17), PM_TYPE_STRING, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.psinfo.sigignore_s */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,18), PM_TYPE_STRING, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.psinfo.sigcatch_s */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,19), PM_TYPE_STRING, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.memory.vmsize */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,20), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}}, + +/* proc.memory.vmlock */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,21), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}}, + +/* proc.memory.vmrss */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,22), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}}, + +/* proc.memory.vmdata */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,23), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}}, + +/* proc.memory.vmstack */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,24), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}}, + +/* proc.memory.vmexe */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,25), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}}, + +/* proc.memory.vmlib */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,26), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}}, + +/* proc.memory.vmswap */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,27), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0)}}, + +/* proc.psinfo.threads */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATUS,28), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.psinfo.cgroups */ + { NULL, + { PMDA_PMID(CLUSTER_PID_CGROUP,0), PM_TYPE_STRING, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* proc.psinfo.labels */ + { NULL, + { PMDA_PMID(CLUSTER_PID_LABEL,0), PM_TYPE_STRING, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + + +/* + * proc/<pid>/statm cluster + */ + +/* proc.memory.size */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATM,0), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) } }, + +/* proc.memory.rss */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATM,1), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) } }, + +/* proc.memory.share */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATM,2), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) } }, + +/* proc.memory.textrss */ + { NULL, + { 
PMDA_PMID(CLUSTER_PID_STATM,3), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) } }, + +/* proc.memory.librss */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATM,4), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) } }, + +/* proc.memory.datrss */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATM,5), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) } }, + +/* proc.memory.dirty */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATM,6), PM_TYPE_U32, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) } }, + +/* proc.memory.maps -- added by Mike Mason <mmlnx@us.ibm.com> */ + { NULL, + { PMDA_PMID(CLUSTER_PID_STATM,7), PM_TYPE_STRING, PROC_INDOM, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,0,0,0,0)}}, + +/* + * proc/<pid>/schedstat cluster + */ + +/* proc.schedstat.cpu_time */ + { NULL, + { PMDA_PMID(CLUSTER_PID_SCHEDSTAT,0), PM_TYPE_U64, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,1,0,0,PM_TIME_NSEC,0)}}, +/* proc.schedstat.run_delay */ + { NULL, + { PMDA_PMID(CLUSTER_PID_SCHEDSTAT,1), PM_TYPE_U64, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,1,0,0,PM_TIME_NSEC,0)}}, +/* proc.schedstat.pcount */ + { NULL, + { PMDA_PMID(CLUSTER_PID_SCHEDSTAT,2), KERNEL_ULONG, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE)}}, + +/* + * proc/<pid>/io cluster + */ +/* proc.io.rchar */ + { NULL, + { PMDA_PMID(CLUSTER_PID_IO,0), PM_TYPE_U64, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1,0,0,PM_SPACE_BYTE,0,0)}}, +/* proc.io.wchar */ + { NULL, + { PMDA_PMID(CLUSTER_PID_IO,1), PM_TYPE_U64, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1,0,0,PM_SPACE_BYTE,0,0)}}, +/* proc.io.syscr */ + { NULL, + { PMDA_PMID(CLUSTER_PID_IO,2), PM_TYPE_U64, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE)}}, +/* proc.io.syscw */ + { NULL, + { PMDA_PMID(CLUSTER_PID_IO,3), PM_TYPE_U64, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE)}}, +/* proc.io.read_bytes */ + { NULL, + { PMDA_PMID(CLUSTER_PID_IO,4), PM_TYPE_U64, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1,0,0,PM_SPACE_BYTE,0,0)}}, +/* proc.io.write_bytes */ + { NULL, + { PMDA_PMID(CLUSTER_PID_IO,5), PM_TYPE_U64, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1,0,0,PM_SPACE_BYTE,0,0)}}, +/* proc.io.cancelled_write_bytes */ + { NULL, + { PMDA_PMID(CLUSTER_PID_IO,6), PM_TYPE_U64, PROC_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1,0,0,PM_SPACE_BYTE,0,0)}}, + +/* + * proc.runq cluster + */ + +/* proc.runq.runnable */ + { &proc_runq.runnable, + { PMDA_PMID(CLUSTER_PROC_RUNQ, 0), PM_TYPE_32, PM_INDOM_NULL, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* proc.runq.blocked */ + { &proc_runq.blocked, + { PMDA_PMID(CLUSTER_PROC_RUNQ, 1), PM_TYPE_32, PM_INDOM_NULL, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* proc.runq.sleeping */ + { &proc_runq.sleeping, + { PMDA_PMID(CLUSTER_PROC_RUNQ, 2), PM_TYPE_32, PM_INDOM_NULL, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* proc.runq.stopped */ + { &proc_runq.stopped, + { PMDA_PMID(CLUSTER_PROC_RUNQ, 3), PM_TYPE_32, PM_INDOM_NULL, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* proc.runq.swapped */ + { &proc_runq.swapped, + { PMDA_PMID(CLUSTER_PROC_RUNQ, 4), PM_TYPE_32, PM_INDOM_NULL, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* proc.runq.defunct */ + { &proc_runq.defunct, + { PMDA_PMID(CLUSTER_PROC_RUNQ, 5), PM_TYPE_32, PM_INDOM_NULL, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* proc.runq.unknown */ + { 
&proc_runq.unknown, + { PMDA_PMID(CLUSTER_PROC_RUNQ, 6), PM_TYPE_32, PM_INDOM_NULL, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* proc.runq.kernel */ + { &proc_runq.kernel, + { PMDA_PMID(CLUSTER_PROC_RUNQ, 7), PM_TYPE_32, PM_INDOM_NULL, PM_SEM_INSTANT, + PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* + * control groups cluster + */ + /* cgroups.subsys.hierarchy */ + { NULL, {PMDA_PMID(CLUSTER_CGROUP_SUBSYS,0), PM_TYPE_U32, + CGROUP_SUBSYS_INDOM, PM_SEM_INSTANT, PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* cgroups.subsys.count */ + { NULL, {PMDA_PMID(CLUSTER_CGROUP_SUBSYS,1), PM_TYPE_U32, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroups.mounts.subsys */ + { NULL, {PMDA_PMID(CLUSTER_CGROUP_MOUNTS,0), PM_TYPE_STRING, + CGROUP_MOUNTS_INDOM, PM_SEM_INSTANT, PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* cgroups.mounts.count */ + { NULL, {PMDA_PMID(CLUSTER_CGROUP_MOUNTS,1), PM_TYPE_U32, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.cpuset.[<group>.]cpus */ + { NULL, {PMDA_PMID(CLUSTER_CPUSET_GROUPS,0), PM_TYPE_STRING, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* cgroup.groups.cpuset.[<group>.]mems */ + { NULL, {PMDA_PMID(CLUSTER_CPUSET_GROUPS,0), PM_TYPE_STRING, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* cgroup.groups.cpuacct.[<group>.]stat.user */ + { NULL, {PMDA_PMID(CLUSTER_CPUACCT_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_MSEC,0) }, }, + + /* cgroup.groups.cpuacct.[<group>.]stat.system */ + { NULL, {PMDA_PMID(CLUSTER_CPUACCT_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_MSEC,0) }, }, + + /* cgroup.groups.cpuacct.[<group>.]usage */ + { NULL, {PMDA_PMID(CLUSTER_CPUACCT_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_NSEC,0) }, }, + + /* cgroup.groups.cpuacct.[<group>.]usage_percpu */ + { NULL, {PMDA_PMID(CLUSTER_CPUACCT_GROUPS,0), PM_TYPE_U64, + CPU_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_NSEC,0) }, }, + + /* cgroup.groups.cpusched.[<group>.]shares */ + { NULL, {PMDA_PMID(CLUSTER_CPUSCHED_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.cache */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.rss */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.rss_huge */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.mapped_file */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.writeback */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.swap */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.pgpgin */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), 
PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.pgpgout */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.pgfault */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.pgmajfault */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.inactive_anon */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.active_anon */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.inactive_file */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.active_file */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.unevictable */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_cache */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_rss */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_rss_huge */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_mapped_file */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_writeback */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_swap */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_pgpgin */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_pgpgout */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_pgfault */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_pgmajfault */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), 
PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_inactive_anon */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_active_anon */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_inactive_file */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_active_file */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.total_unevictable */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.recent_rotated_anon */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.recent_rotated_file */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.recent_scanned_anon */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.memory.[<group>.]stat.recent_scanned_file */ + { NULL, {PMDA_PMID(CLUSTER_MEMORY_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.netclass.[<group>.]classid */ + { NULL, {PMDA_PMID(CLUSTER_NET_CLS_GROUPS,0), PM_TYPE_U64, + PM_INDOM_NULL, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.blkio.[<group>.]io_merged.read */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_merged.write */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_merged.sync */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_merged.async */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_merged.total */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_queued.read */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_queued.write */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_queued.sync */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, 
PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_queued.async */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_queued.total */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_service_bytes.read */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.blkio.[<group>.]io_service_bytes.write */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.blkio.[<group>.]io_service_bytes.sync */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.blkio.[<group>.]io_service_bytes.async */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.blkio.[<group>.]io_service_bytes.total */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(1,0,0,PM_SPACE_KBYTE,0,0) }, }, + + /* cgroup.groups.blkio.[<group>.]io_serviced.read */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_serviced.write */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_serviced.sync */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_serviced.async */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_serviced.total */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]io_service_time.read */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_NSEC,0) }, }, + + /* cgroup.groups.blkio.[<group>.]io_service_time.write */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_NSEC,0) }, }, + + /* cgroup.groups.blkio.[<group>.]io_service_time.sync */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_NSEC,0) }, }, + + /* cgroup.groups.blkio.[<group>.]io_service_time.async */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_NSEC,0) }, }, + + /* cgroup.groups.blkio.[<group>.]io_service_time.total */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_NSEC,0) }, }, + + /* cgroup.groups.blkio.[<group>.]io_wait_time.read */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_NSEC,0) }, }, + + /* 
cgroup.groups.blkio.[<group>.]io_wait_time.write */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_NSEC,0) }, }, + + /* cgroup.groups.blkio.[<group>.]io_wait_time.sync */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_NSEC,0) }, }, + + /* cgroup.groups.blkio.[<group>.]io_wait_time.async */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_NSEC,0) }, }, + + /* cgroup.groups.blkio.[<group>.]io_wait_time.total */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_NSEC,0) }, }, + + /* cgroup.groups.blkio.[<group>.]sectors */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, }, + + /* cgroup.groups.blkio.[<group>.]time */ + { NULL, {PMDA_PMID(CLUSTER_BLKIO_GROUPS,0), PM_TYPE_U64, + DISK_INDOM, PM_SEM_COUNTER, PMDA_PMUNITS(0,1,0,0,PM_TIME_MSEC,0) }, }, + + +/* + * proc/<pid>/fd cluster + */ + + /* proc.fd.count */ + { NULL, { PMDA_PMID(CLUSTER_PID_FD,0), PM_TYPE_U32, + PROC_INDOM, PM_SEM_INSTANT, PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) } }, + +/* + * Metrics control cluster + */ + + /* proc.control.all.threads */ + { &threads, { PMDA_PMID(CLUSTER_CONTROL, 1), PM_TYPE_U32, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(0,0,0,0,0,0) } }, + /* proc.control.perclient.threads */ + { NULL, { PMDA_PMID(CLUSTER_CONTROL, 2), PM_TYPE_U32, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(0,0,0,0,0,0) } }, + /* proc.control.perclient.cgroups */ + { NULL, { PMDA_PMID(CLUSTER_CONTROL, 3), PM_TYPE_STRING, + PM_INDOM_NULL, PM_SEM_INSTANT, PMDA_PMUNITS(0,0,0,0,0,0) } }, +}; + +pmInDom +proc_indom(int serial) +{ + return indomtab[serial].it_indom; +} + +FILE * +proc_statsfile(const char *path, char *buffer, int size) +{ + snprintf(buffer, size, "%s%s", proc_statspath, path); + buffer[size-1] = '\0'; + return fopen(buffer, "r"); +} + +static void +proc_refresh(pmdaExt *pmda, int *need_refresh) +{ + int need_refresh_mtab = 0; + + if (need_refresh[CLUSTER_CPUACCT_GROUPS]) + refresh_cgroup_cpus(INDOM(CPU_INDOM)); + + if (need_refresh[CLUSTER_CGROUP_SUBSYS] || + need_refresh[CLUSTER_CGROUP_MOUNTS] || + need_refresh[CLUSTER_CPUSET_GROUPS] || + need_refresh[CLUSTER_CPUACCT_GROUPS] || + need_refresh[CLUSTER_CPUSCHED_GROUPS] || + need_refresh[CLUSTER_BLKIO_GROUPS] || + need_refresh[CLUSTER_NET_CLS_GROUPS] || + need_refresh[CLUSTER_MEMORY_GROUPS]) { + refresh_cgroup_subsys(INDOM(CGROUP_SUBSYS_INDOM)); + need_refresh_mtab |= refresh_cgroups(pmda, NULL); + } + + if (need_refresh_mtab) + pmdaDynamicMetricTable(pmda); + + if (need_refresh[CLUSTER_PID_STAT] || + need_refresh[CLUSTER_PID_STATM] || + need_refresh[CLUSTER_PID_STATUS] || + need_refresh[CLUSTER_PID_IO] || + need_refresh[CLUSTER_PID_LABEL] || + need_refresh[CLUSTER_PID_CGROUP] || + need_refresh[CLUSTER_PID_SCHEDSTAT] || + need_refresh[CLUSTER_PID_FD]) { + refresh_proc_pid(&proc_pid, + proc_ctx_threads(pmda->e_context, threads), + proc_ctx_cgroups(pmda->e_context, cgroups)); + } + + if (need_refresh[CLUSTER_PROC_RUNQ]) + refresh_proc_runq(&proc_runq); +} + +static int +proc_instance(pmInDom indom, int inst, char *name, __pmInResult **result, pmdaExt *pmda) +{ + __pmInDom_int *indomp = (__pmInDom_int *)&indom; + int need_refresh[NUM_CLUSTERS] = { 0 }; + char newname[16]; /* see Note below */ + int sts; + + switch 
(indomp->serial) { + case CPU_INDOM: + /* + * Used by cgroup.groups.cpuacct.[<group>.]usage_percpu + * and cgroup.groups.cpuacct.usage_percpu + */ + need_refresh[CLUSTER_CPUACCT_GROUPS]++; + break; + case DISK_INDOM: + need_refresh[CLUSTER_BLKIO_GROUPS]++; + break; + case PROC_INDOM: + need_refresh[CLUSTER_PID_STAT]++; + need_refresh[CLUSTER_PID_STATM]++; + need_refresh[CLUSTER_PID_STATUS]++; + need_refresh[CLUSTER_PID_LABEL]++; + need_refresh[CLUSTER_PID_CGROUP]++; + need_refresh[CLUSTER_PID_SCHEDSTAT]++; + need_refresh[CLUSTER_PID_IO]++; + need_refresh[CLUSTER_PID_FD]++; + break; + case CGROUP_SUBSYS_INDOM: + need_refresh[CLUSTER_CGROUP_SUBSYS]++; + break; + case CGROUP_MOUNTS_INDOM: + need_refresh[CLUSTER_CGROUP_MOUNTS]++; + break; + /* no default label : pmdaInstance will pick up errors */ + } + + if (indomp->serial == PROC_INDOM && inst == PM_IN_NULL && name != NULL) { + /* + * For the proc indom, if the name is a pid (as a string), and it + * contains only digits (i.e. it's not a full instance name) then + * reformat it to be exactly six digits, with leading zeros. + * + * Note that although format %06d is used here and in proc_pid.c, + * the pid could be longer than this (in which case there + * are no leading zeroes. The size of newname[] is chosen + * to comfortably accommodate a 32-bit pid (Linux maximum), + * or max value of 4294967295 (10 digits) + */ + char *p; + for (p = name; *p != '\0'; p++) { + if (!isdigit((int)*p)) + break; + } + if (*p == '\0') { + snprintf(newname, sizeof(newname), "%06d", atoi(name)); + name = newname; + } + } + + sts = PM_ERR_PERMISSION; + have_access = proc_ctx_access(pmda->e_context) || all_access; + if (have_access || indomp->serial != PROC_INDOM) { + proc_refresh(pmda, need_refresh); + sts = pmdaInstance(indom, inst, name, result, pmda); + } + have_access = proc_ctx_revert(pmda->e_context); + + return sts; +} + +/* + * callback provided to pmdaFetch + */ + +static int +proc_fetchCallBack(pmdaMetric *mdesc, unsigned int inst, pmAtomValue *atom) +{ + __pmID_int *idp = (__pmID_int *)&(mdesc->m_desc.pmid); + int cluster = proc_pmid_cluster(mdesc->m_desc.pmid); + int sts; + unsigned long ul; + const char *cp; + char *f; + int *ip; + proc_pid_entry_t *entry; + void *fsp; + static long hz = -1; + char *tail; + + if (hz == -1) + hz = sysconf(_SC_CLK_TCK); + + if (mdesc->m_user != NULL) { + /* + * The metric value is extracted directly via the address specified + * in metrictab. Note: not all metrics support this - those that + * don't have NULL for the m_user field in their respective + * metrictab slot. + */ + + switch (mdesc->m_desc.type) { + case PM_TYPE_32: + atom->l = *(__int32_t *)mdesc->m_user; + break; + case PM_TYPE_U32: + atom->ul = *(__uint32_t *)mdesc->m_user; + break; + case PM_TYPE_64: + atom->ll = *(__int64_t *)mdesc->m_user; + break; + case PM_TYPE_U64: + atom->ull = *(__uint64_t *)mdesc->m_user; + break; + case PM_TYPE_FLOAT: + atom->f = *(float *)mdesc->m_user; + break; + case PM_TYPE_DOUBLE: + atom->d = *(double *)mdesc->m_user; + break; + case PM_TYPE_STRING: + cp = *(char **)mdesc->m_user; + atom->cp = (char *)(cp ? 
cp : ""); + break; + default: + return 0; + } + } + else + switch (cluster) { + + case CLUSTER_PID_STAT: + if (idp->item == 99) /* proc.nprocs */ + atom->ul = proc_pid.indom->it_numinst; + else { + static char ttyname[MAXPATHLEN]; + + if (!have_access) + return PM_ERR_PERMISSION; + if ((entry = fetch_proc_pid_stat(inst, &proc_pid)) == NULL) + return PM_ERR_INST; + + switch (idp->item) { + + + case PROC_PID_STAT_PID: + atom->ul = entry->id; + break; + + case PROC_PID_STAT_TTYNAME: + if ((f = _pm_getfield(entry->stat_buf, PROC_PID_STAT_TTY)) == NULL) + atom->cp = "?"; + else { + dev_t dev = (dev_t)atoi(f); + atom->cp = get_ttyname_info(inst, dev, ttyname); + } + break; + + case PROC_PID_STAT_CMD: + if ((f = _pm_getfield(entry->stat_buf, idp->item)) == NULL) + return PM_ERR_INST; + atom->cp = f + 1; + atom->cp[strlen(atom->cp)-1] = '\0'; + break; + + case PROC_PID_STAT_PSARGS: + atom->cp = entry->name + 7; + break; + + case PROC_PID_STAT_STATE: + /* + * string + */ + if ((f = _pm_getfield(entry->stat_buf, idp->item)) == NULL) + return PM_ERR_INST; + atom->cp = f; + break; + + case PROC_PID_STAT_VSIZE: + case PROC_PID_STAT_RSS_RLIM: + /* + * bytes converted to kbytes + */ + if ((f = _pm_getfield(entry->stat_buf, idp->item)) == NULL) + return PM_ERR_INST; + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + atom->ul /= 1024; + break; + + case PROC_PID_STAT_RSS: + /* + * pages converted to kbytes + */ + if ((f = _pm_getfield(entry->stat_buf, idp->item)) == NULL) + return PM_ERR_INST; + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + atom->ul *= _pm_system_pagesize / 1024; + break; + + case PROC_PID_STAT_UTIME: + case PROC_PID_STAT_STIME: + case PROC_PID_STAT_CUTIME: + case PROC_PID_STAT_CSTIME: + /* + * unsigned jiffies converted to unsigned milliseconds + */ + if ((f = _pm_getfield(entry->stat_buf, idp->item)) == NULL) + return PM_ERR_INST; + + ul = (__uint32_t)strtoul(f, &tail, 0); + _pm_assign_ulong(atom, 1000 * (double)ul / hz); + break; + + case PROC_PID_STAT_PRIORITY: + case PROC_PID_STAT_NICE: + /* + * signed decimal int + */ + if ((f = _pm_getfield(entry->stat_buf, idp->item)) == NULL) + return PM_ERR_INST; + atom->l = (__int32_t)strtol(f, &tail, 0); + break; + + case PROC_PID_STAT_WCHAN: + if ((f = _pm_getfield(entry->stat_buf, idp->item)) == NULL) + return PM_ERR_INST; +#if defined(HAVE_64BIT_PTR) + atom->ull = (__uint64_t)strtoull(f, &tail, 0); +#else + atom->ul = (__uint32_t)strtoul(f, &tail, 0); +#endif + break; + + case PROC_PID_STAT_WCHAN_SYMBOL: + if (entry->wchan_buf) /* 2.6 kernel, /proc/<pid>/wchan */ + atom->cp = entry->wchan_buf; + else { /* old school (2.4 kernels, at least) */ + char *wc; + /* + * Convert address to symbol name if requested + * Added by Mike Mason <mmlnx@us.ibm.com> + */ + f = _pm_getfield(entry->stat_buf, PROC_PID_STAT_WCHAN); + if (f == NULL) + return PM_ERR_INST; +#if defined(HAVE_64BIT_PTR) + atom->ull = (__uint64_t)strtoull(f, &tail, 0); + if ((wc = wchan(atom->ull))) + atom->cp = wc; + else + atom->cp = atom->ull ? f : ""; +#else + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + if ((wc = wchan((__psint_t)atom->ul))) + atom->cp = wc; + else + atom->cp = atom->ul ? 
f : ""; +#endif + } + break; + + default: + /* + * unsigned decimal int + */ + if (idp->item < NR_PROC_PID_STAT) { + if ((f = _pm_getfield(entry->stat_buf, idp->item)) == NULL) + return PM_ERR_INST; + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + } + else + return PM_ERR_PMID; + break; + } + } + break; + + case CLUSTER_PID_STATM: + if (!have_access) + return PM_ERR_PERMISSION; + if (idp->item == PROC_PID_STATM_MAPS) { /* proc.memory.maps */ + if ((entry = fetch_proc_pid_maps(inst, &proc_pid)) == NULL) + return PM_ERR_INST; + atom->cp = entry->maps_buf; + } else { + if ((entry = fetch_proc_pid_statm(inst, &proc_pid)) == NULL) + return PM_ERR_INST; + + if (idp->item <= PROC_PID_STATM_DIRTY) { + /* unsigned int */ + if ((f = _pm_getfield(entry->statm_buf, idp->item)) == NULL) + return PM_ERR_INST; + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + atom->ul *= _pm_system_pagesize / 1024; + } + else + return PM_ERR_PMID; + } + break; + + case CLUSTER_PID_SCHEDSTAT: + if (!have_access) + return PM_ERR_PERMISSION; + if ((entry = fetch_proc_pid_schedstat(inst, &proc_pid)) == NULL) + return (oserror() == ENOENT) ? PM_ERR_APPVERSION : PM_ERR_INST; + + if (idp->item < NR_PROC_PID_SCHED) { + if ((f = _pm_getfield(entry->schedstat_buf, idp->item)) == NULL) + return PM_ERR_INST; + if (idp->item == PROC_PID_SCHED_PCOUNT && + mdesc->m_desc.type == PM_TYPE_U32) + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + else +#if defined(HAVE_64BIT_PTR) + atom->ull = (__uint64_t)strtoull(f, &tail, 0); +#else + atom->ul = (__uint32_t)strtoul(f, &tail, 0); +#endif + } + else + return PM_ERR_PMID; + break; + + case CLUSTER_PID_IO: + if (!have_access) + return PM_ERR_PERMISSION; + if ((entry = fetch_proc_pid_io(inst, &proc_pid)) == NULL) + return (oserror() == ENOENT) ? PM_ERR_APPVERSION : PM_ERR_INST; + + switch (idp->item) { + + case PROC_PID_IO_RCHAR: + if ((f = _pm_getfield(entry->io_lines.rchar, 1)) == NULL) + atom->ull = 0; + else + atom->ull = (__uint64_t)strtoull(f, &tail, 0); + break; + case PROC_PID_IO_WCHAR: + if ((f = _pm_getfield(entry->io_lines.wchar, 1)) == NULL) + atom->ull = 0; + else + atom->ull = (__uint64_t)strtoull(f, &tail, 0); + break; + case PROC_PID_IO_SYSCR: + if ((f = _pm_getfield(entry->io_lines.syscr, 1)) == NULL) + atom->ull = 0; + else + atom->ull = (__uint64_t)strtoull(f, &tail, 0); + break; + case PROC_PID_IO_SYSCW: + if ((f = _pm_getfield(entry->io_lines.syscw, 1)) == NULL) + atom->ull = 0; + else + atom->ull = (__uint64_t)strtoull(f, &tail, 0); + break; + case PROC_PID_IO_READ_BYTES: + if ((f = _pm_getfield(entry->io_lines.readb, 1)) == NULL) + atom->ull = 0; + else + atom->ull = (__uint64_t)strtoull(f, &tail, 0); + break; + case PROC_PID_IO_WRITE_BYTES: + if ((f = _pm_getfield(entry->io_lines.writeb, 1)) == NULL) + atom->ull = 0; + else + atom->ull = (__uint64_t)strtoull(f, &tail, 0); + break; + case PROC_PID_IO_CANCELLED_BYTES: + if ((f = _pm_getfield(entry->io_lines.cancel, 1)) == NULL) + atom->ull = 0; + else + atom->ull = (__uint64_t)strtoull(f, &tail, 0); + break; + + default: + return PM_ERR_PMID; + } + break; + + /* + * Cluster added by Mike Mason <mmlnx@us.ibm.com> + */ + case CLUSTER_PID_STATUS: + if (!have_access) + return PM_ERR_PERMISSION; + if ((entry = fetch_proc_pid_status(inst, &proc_pid)) == NULL) + return PM_ERR_INST; + + switch (idp->item) { + + case PROC_PID_STATUS_UID: + case PROC_PID_STATUS_EUID: + case PROC_PID_STATUS_SUID: + case PROC_PID_STATUS_FSUID: + case PROC_PID_STATUS_UID_NM: + case PROC_PID_STATUS_EUID_NM: + case PROC_PID_STATUS_SUID_NM: + case 
PROC_PID_STATUS_FSUID_NM: + { + struct passwd *pwe; + + if ((f = _pm_getfield(entry->status_lines.uid, (idp->item % 4) + 1)) == NULL) + return PM_ERR_INST; + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + if (idp->item > PROC_PID_STATUS_FSUID) { + if ((pwe = getpwuid((uid_t)atom->ul)) != NULL) + atom->cp = pwe->pw_name; + else + atom->cp = "UNKNOWN"; + } + } + break; + + case PROC_PID_STATUS_GID: + case PROC_PID_STATUS_EGID: + case PROC_PID_STATUS_SGID: + case PROC_PID_STATUS_FSGID: + case PROC_PID_STATUS_GID_NM: + case PROC_PID_STATUS_EGID_NM: + case PROC_PID_STATUS_SGID_NM: + case PROC_PID_STATUS_FSGID_NM: + { + struct group *gre; + + if ((f = _pm_getfield(entry->status_lines.gid, (idp->item % 4) + 1)) == NULL) + return PM_ERR_INST; + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + if (idp->item > PROC_PID_STATUS_FSGID) { + if ((gre = getgrgid((gid_t)atom->ul)) != NULL) { + atom->cp = gre->gr_name; + } else { + atom->cp = "UNKNOWN"; + } + } + } + break; + + case PROC_PID_STATUS_SIGNAL: + if ((atom->cp = _pm_getfield(entry->status_lines.sigpnd, 1)) == NULL) + return PM_ERR_INST; + break; + + case PROC_PID_STATUS_BLOCKED: + if ((atom->cp = _pm_getfield(entry->status_lines.sigblk, 1)) == NULL) + return PM_ERR_INST; + break; + + case PROC_PID_STATUS_SIGCATCH: + if ((atom->cp = _pm_getfield(entry->status_lines.sigcgt, 1)) == NULL) + return PM_ERR_INST; + break; + + case PROC_PID_STATUS_SIGIGNORE: + if ((atom->cp = _pm_getfield(entry->status_lines.sigign, 1)) == NULL) + return PM_ERR_INST; + break; + + case PROC_PID_STATUS_VMSIZE: + if ((f = _pm_getfield(entry->status_lines.vmsize, 1)) == NULL) + atom->ul = 0; + else + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + break; + + case PROC_PID_STATUS_VMLOCK: + if ((f = _pm_getfield(entry->status_lines.vmlck, 1)) == NULL) + atom->ul = 0; + else + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + break; + + case PROC_PID_STATUS_VMRSS: + if ((f = _pm_getfield(entry->status_lines.vmrss, 1)) == NULL) + atom->ul = 0; + else + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + break; + + case PROC_PID_STATUS_VMDATA: + if ((f = _pm_getfield(entry->status_lines.vmdata, 1)) == NULL) + atom->ul = 0; + else + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + break; + + case PROC_PID_STATUS_VMSTACK: + if ((f = _pm_getfield(entry->status_lines.vmstk, 1)) == NULL) + atom->ul = 0; + else + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + break; + + case PROC_PID_STATUS_VMEXE: + if ((f = _pm_getfield(entry->status_lines.vmexe, 1)) == NULL) + atom->ul = 0; + else + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + break; + + case PROC_PID_STATUS_VMLIB: + if ((f = _pm_getfield(entry->status_lines.vmlib, 1)) == NULL) + atom->ul = 0; + else + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + break; + + case PROC_PID_STATUS_VMSWAP: + if ((f = _pm_getfield(entry->status_lines.vmswap, 1)) == NULL) + atom->ul = 0; + else + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + break; + + case PROC_PID_STATUS_THREADS: + if ((f = _pm_getfield(entry->status_lines.threads, 1)) == NULL) + atom->ul = 0; + else + atom->ul = (__uint32_t)strtoul(f, &tail, 0); + break; + + default: + return PM_ERR_PMID; + } + break; + + case CLUSTER_CGROUP_SUBSYS: + switch (idp->item) { + case 0: /* cgroup.subsys.hierarchy */ + sts = pmdaCacheLookup(INDOM(CGROUP_SUBSYS_INDOM), inst, NULL, (void **)&ip); + if (sts < 0) + return sts; + if (sts != PMDA_CACHE_ACTIVE) + return PM_ERR_INST; + atom->ul = *ip; + break; + + case 1: /* cgroup.subsys.count */ + atom->ul = pmdaCacheOp(INDOM(CGROUP_SUBSYS_INDOM), PMDA_CACHE_SIZE_ACTIVE); 
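A minimal standalone sketch (illustrative only, not part of the patch; the indom numbers and instance name below are assumptions chosen for the example) of the pmdaCache store/lookup pattern that the cgroup.subsys fetch code above relies on:

    #include <stdio.h>
    #include <pcp/pmapi.h>
    #include <pcp/impl.h>
    #include <pcp/pmda.h>

    int
    main(void)
    {
        pmInDom indom = pmInDom_build(3, 37);  /* assumed: proc domain, subsys serial */
        static int hierarchy = 2;              /* private per-instance data */
        int inst, sts, *ip;

        /* registration: name the instance and attach a private pointer */
        inst = pmdaCacheStore(indom, PMDA_CACHE_ADD, "cpuacct", &hierarchy);

        /* fetch time: recover the private pointer from the instance id */
        sts = pmdaCacheLookup(indom, inst, NULL, (void **)&ip);
        if (sts == PMDA_CACHE_ACTIVE)
            printf("hierarchy=%d\n", *ip);
        return 0;
    }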
+ break; + } + break; + + case CLUSTER_CGROUP_MOUNTS: + switch (idp->item) { + case 0: /* cgroup.mounts.subsys */ + sts = pmdaCacheLookup(INDOM(CGROUP_MOUNTS_INDOM), inst, NULL, &fsp); + if (sts < 0) + return sts; + if (sts != PMDA_CACHE_ACTIVE) + return PM_ERR_INST; + atom->cp = cgroup_find_subsys(INDOM(CGROUP_SUBSYS_INDOM), fsp); + break; + + case 1: /* cgroup.mounts.count */ + atom->ul = pmdaCacheOp(INDOM(CGROUP_MOUNTS_INDOM), PMDA_CACHE_SIZE_ACTIVE); + break; + } + break; + + case CLUSTER_CPUSET_GROUPS: + case CLUSTER_CPUACCT_GROUPS: + case CLUSTER_CPUSCHED_GROUPS: + case CLUSTER_MEMORY_GROUPS: + case CLUSTER_NET_CLS_GROUPS: + case CLUSTER_BLKIO_GROUPS: + return cgroup_group_fetch(mdesc->m_desc.pmid, inst, atom); + + case CLUSTER_PID_FD: + if (!have_access) + return PM_ERR_PERMISSION; + if (idp->item > PROC_PID_FD_COUNT) + return PM_ERR_PMID; + if ((entry = fetch_proc_pid_fd(inst, &proc_pid)) == NULL) + return PM_ERR_INST; + atom->ul = entry->fd_count; + break; + + case CLUSTER_PID_CGROUP: + if (!have_access) + return PM_ERR_PERMISSION; + if (idp->item > PROC_PID_CGROUP) + return PM_ERR_PMID; + if ((entry = fetch_proc_pid_cgroup(inst, &proc_pid)) == NULL) { + if (oserror() == ENOENT) return PM_ERR_APPVERSION; + if (oserror() != ENODATA) return PM_ERR_INST; + atom->cp = ""; + } else { + atom->cp = proc_strings_lookup(entry->cgroup_id); + } + break; + + case CLUSTER_PID_LABEL: + if (!have_access) + return PM_ERR_PERMISSION; + if (idp->item > PROC_PID_LABEL) + return PM_ERR_PMID; + if ((entry = fetch_proc_pid_label(inst, &proc_pid)) == NULL) { + if (oserror() == ENOENT) return PM_ERR_APPVERSION; + if (oserror() != ENODATA) return PM_ERR_INST; + atom->cp = ""; + } else { + atom->cp = proc_strings_lookup(entry->label_id); + } + break; + + case CLUSTER_CONTROL: + switch (idp->item) { + /* case 1: not reached -- proc.control.all.threads is direct */ + case 2: /* proc.control.perclient.threads */ + atom->ul = proc_ctx_threads(pmdaGetContext(), threads); + break; + case 3: /* proc.control.perclient.cgroups */ + cp = proc_ctx_cgroups(pmdaGetContext(), cgroups); + atom->cp = (char *)(cp ? 
cp : ""); + break; + default: + return PM_ERR_PMID; + } + break; + + default: /* unknown cluster */ + return PM_ERR_PMID; + } + + return PMDA_FETCH_STATIC; +} + +static int +proc_fetch(int numpmid, pmID pmidlist[], pmResult **resp, pmdaExt *pmda) +{ + int i, sts, cluster; + int need_refresh[NUM_CLUSTERS] = { 0 }; + + for (i = 0; i < numpmid; i++) { + cluster = proc_pmid_cluster(pmidlist[i]); + if (cluster >= MIN_CLUSTER && cluster < NUM_CLUSTERS) + need_refresh[cluster]++; + } + + have_access = proc_ctx_access(pmda->e_context) || all_access; + proc_refresh(pmda, need_refresh); + sts = pmdaFetch(numpmid, pmidlist, resp, pmda); + have_access = proc_ctx_revert(pmda->e_context); + return sts; +} + +static int +proc_store(pmResult *result, pmdaExt *pmda) +{ + int i, sts = 0; + + have_access = proc_ctx_access(pmda->e_context) || all_access; + + for (i = 0; i < result->numpmid; i++) { + pmValueSet *vsp = result->vset[i]; + __pmID_int *idp = (__pmID_int *)&(vsp->pmid); + pmAtomValue av; + + if (idp->cluster != CLUSTER_CONTROL) + sts = PM_ERR_PERMISSION; + else if (vsp->numval != 1) + sts = PM_ERR_INST; + else switch (idp->item) { + case 1: /* proc.control.all.threads */ + if (!have_access) + sts = PM_ERR_PERMISSION; + else if ((sts = pmExtractValue(vsp->valfmt, &vsp->vlist[0], + PM_TYPE_U32, &av, PM_TYPE_U32)) >= 0) { + if (av.ul > 1) /* only zero or one allowed */ + sts = PM_ERR_CONV; + else + threads = av.ul; + } + break; + case 2: /* proc.control.perclient.threads */ + if ((sts = pmExtractValue(vsp->valfmt, &vsp->vlist[0], + PM_TYPE_U32, &av, PM_TYPE_U32)) >= 0) { + sts = proc_ctx_set_threads(pmda->e_context, av.ul); + } + break; + case 3: /* proc.control.perclient.cgroups */ + if ((sts = pmExtractValue(vsp->valfmt, &vsp->vlist[0], + PM_TYPE_STRING, &av, PM_TYPE_STRING)) >= 0) { + if ((sts = proc_ctx_set_cgroups(pmda->e_context, av.cp)) < 0) + free(av.cp); + } + break; + default: + sts = PM_ERR_PERMISSION; + } + if (sts < 0) + break; + } + + have_access = proc_ctx_revert(pmda->e_context); + return sts; +} + +static int +proc_text(int ident, int type, char **buf, pmdaExt *pmda) +{ + if ((type & PM_TEXT_PMID) == PM_TEXT_PMID) { + int sts = pmdaDynamicLookupText(ident, type, buf, pmda); + if (sts != -ENOENT) + return sts; + } + return pmdaText(ident, type, buf, pmda); +} + +static int +proc_pmid(const char *name, pmID *pmid, pmdaExt *pmda) +{ + pmdaNameSpace *tree = pmdaDynamicLookupName(pmda, name); + if (tree == NULL) + return PM_ERR_NAME; + if (pmDebug & DBG_TRACE_APPL2) { + fprintf(stderr, "proc_pmid: name=%s tree:\n", name); + __pmDumpNameNode(stderr, tree->root, 1); + } + return pmdaTreePMID(tree, name, pmid); +} + +static int +proc_name(pmID pmid, char ***nameset, pmdaExt *pmda) +{ + pmdaNameSpace *tree = pmdaDynamicLookupPMID(pmda, pmid); + if (tree == NULL) + return PM_ERR_PMID; + if (pmDebug & DBG_TRACE_APPL2) { + fprintf(stderr, "proc_name: pmid=%s tree:\n", pmIDStr(pmid)); + __pmDumpNameNode(stderr, tree->root, 1); + } + return pmdaTreeName(tree, pmid, nameset); +} + +static int +proc_children(const char *name, int flag, char ***kids, int **sts, pmdaExt *pmda) +{ + pmdaNameSpace *tree = pmdaDynamicLookupName(pmda, name); + if (tree == NULL) + return PM_ERR_NAME; + if (pmDebug & DBG_TRACE_APPL2) { + fprintf(stderr, "proc_children: name=%s flag=%d tree:\n", name, flag); + __pmDumpNameNode(stderr, tree->root, 1); + } + return pmdaTreeChildren(tree, name, flag, kids, sts); +} + +/* + * Helper routines for accessing a generic static string dictionary + */ + +char * +proc_strings_lookup(int 
index) +{ + char *value; + pmInDom dict = INDOM(STRINGS_INDOM); + + if (pmdaCacheLookup(dict, index, &value, NULL) == PMDA_CACHE_ACTIVE) + return value; + return ""; +} + +int +proc_strings_insert(const char *buf) +{ + pmInDom dict = INDOM(STRINGS_INDOM); + return pmdaCacheStore(dict, PMDA_CACHE_ADD, buf, NULL); +} + +/* + * Initialise the agent (both daemon and DSO). + */ + +void +__PMDA_INIT_CALL +proc_init(pmdaInterface *dp) +{ + int nindoms = sizeof(indomtab)/sizeof(indomtab[0]); + int nmetrics = sizeof(metrictab)/sizeof(metrictab[0]); + char *envpath; + + _pm_system_pagesize = getpagesize(); + if ((envpath = getenv("PROC_STATSPATH")) != NULL) + proc_statspath = envpath; + + if (_isDSO) { + char helppath[MAXPATHLEN]; + int sep = __pmPathSeparator(); + snprintf(helppath, sizeof(helppath), "%s%c" "proc" "%c" "help", + pmGetConfig("PCP_PMDAS_DIR"), sep, sep); + pmdaDSO(dp, PMDA_INTERFACE_6, "proc DSO", helppath); + } + + if (dp->status != 0) + return; + dp->comm.flags |= PDU_FLAG_AUTH; + + dp->version.six.instance = proc_instance; + dp->version.six.store = proc_store; + dp->version.six.fetch = proc_fetch; + dp->version.six.text = proc_text; + dp->version.six.pmid = proc_pmid; + dp->version.six.name = proc_name; + dp->version.six.children = proc_children; + dp->version.six.attribute = proc_ctx_attrs; + pmdaSetEndContextCallBack(dp, proc_ctx_end); + pmdaSetFetchCallBack(dp, proc_fetchCallBack); + + /* + * Initialize the instance domain table. + */ + indomtab[CPU_INDOM].it_indom = CPU_INDOM; + indomtab[DISK_INDOM].it_indom = DISK_INDOM; + indomtab[DEVT_INDOM].it_indom = DEVT_INDOM; + indomtab[PROC_INDOM].it_indom = PROC_INDOM; + indomtab[STRINGS_INDOM].it_indom = STRINGS_INDOM; + indomtab[CGROUP_SUBSYS_INDOM].it_indom = CGROUP_SUBSYS_INDOM; + indomtab[CGROUP_MOUNTS_INDOM].it_indom = CGROUP_MOUNTS_INDOM; + + proc_pid.indom = &indomtab[PROC_INDOM]; + + /* + * Read System.map and /proc/ksyms. Used to translate wait channel + * addresses to symbol names. 
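Aside (illustrative fragment, not part of the patch): the two helpers above form a simple insert/lookup dictionary for string-valued metrics. The round trip looks roughly like this, with the literal value made up for the example:

    /* fragment; assumes this PMDA's own headers (indom.h et al) are in scope */
    int id = proc_strings_insert("cpu:/;cpuset:/");   /* returns a stable index */
    char *val = proc_strings_lookup(id);              /* -> "cpu:/;cpuset:/" */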
+ * Added by Mike Mason <mmlnx@us.ibm.com> + */ + read_ksym_sources(kernel_uname.release); + + cgroup_init(metrictab, nmetrics); + proc_ctx_init(); + + pmdaSetFlags(dp, PMDA_EXT_FLAG_HASHED); + pmdaInit(dp, indomtab, nindoms, metrictab, nmetrics); + + /* string metrics use the pmdaCache API for value indexing */ + pmdaCacheOp(INDOM(STRINGS_INDOM), PMDA_CACHE_STRINGS); + + /* cgroup metrics use the pmdaCache API for indom indexing */ + pmdaCacheOp(INDOM(CPU_INDOM), PMDA_CACHE_CULL); + pmdaCacheOp(INDOM(DISK_INDOM), PMDA_CACHE_CULL); + pmdaCacheOp(INDOM(CGROUP_SUBSYS_INDOM), PMDA_CACHE_CULL); + pmdaCacheOp(INDOM(CGROUP_MOUNTS_INDOM), PMDA_CACHE_CULL); +} + +pmLongOptions longopts[] = { + PMDA_OPTIONS_HEADER("Options"), + PMOPT_DEBUG, + { "no-access-checks", 0, 'A', 0, "no access checks will be performed (insecure, beware!)" }, + PMDAOPT_DOMAIN, + PMDAOPT_LOGFILE, + { "with-threads", 0, 'L', 0, "include threads in the all-processes instance domain" }, + { "from-cgroup", 1, 'r', "NAME", "restrict monitoring to processes in the named cgroup" }, + PMDAOPT_USERNAME, + PMOPT_HELP, + PMDA_OPTIONS_END +}; + +pmdaOptions opts = { + .short_options = "AD:d:l:Lr:U:?", + .long_options = longopts, +}; + +int +main(int argc, char **argv) +{ + int c, sep = __pmPathSeparator(); + pmdaInterface dispatch; + char helppath[MAXPATHLEN]; + char *username = "root"; + + _isDSO = 0; + __pmSetProgname(argv[0]); + snprintf(helppath, sizeof(helppath), "%s%c" "proc" "%c" "help", + pmGetConfig("PCP_PMDAS_DIR"), sep, sep); + pmdaDaemon(&dispatch, PMDA_INTERFACE_6, pmProgname, PROC, "proc.log", helppath); + + while ((c = pmdaGetOptions(argc, argv, &opts, &dispatch)) != EOF) { + switch (c) { + case 'A': + all_access = 1; + break; + case 'L': + threads = 1; + break; + case 'r': + cgroups = opts.optarg; + break; + } + } + + if (opts.errors) { + pmdaUsageMessage(&opts); + exit(1); + } + if (opts.username) + username = opts.username; + + pmdaOpenLog(&dispatch); + __pmSetProcessIdentity(username); + + proc_init(&dispatch); + pmdaConnect(&dispatch); + pmdaMain(&dispatch); + exit(0); +} diff --git a/src/pmdas/linux_proc/proc_pid.c b/src/pmdas/linux_proc/proc_pid.c new file mode 100644 index 0000000..152d96c --- /dev/null +++ b/src/pmdas/linux_proc/proc_pid.c @@ -0,0 +1,957 @@ +/* + * Linux proc/<pid>/{stat,statm,status,...} Clusters + * + * Copyright (c) 2013 Red Hat. + * Copyright (c) 2000,2004,2006 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2010 Aconex. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ */ + +#include "pmapi.h" +#include "impl.h" +#include "pmda.h" +#include <ctype.h> +#include <dirent.h> +#include <sys/stat.h> +#include "proc_pid.h" +#include "indom.h" + +static proc_pid_list_t pids; + +static int +compare_pid(const void *pa, const void *pb) +{ + int a = *(int *)pa; + int b = *(int *)pb; + return a - b; +} + +static void +pidlist_append_pid(int pid) +{ + if (pids.count >= pids.size) { + pids.size += 64; + if (!(pids.pids = (int *)realloc(pids.pids, pids.size * sizeof(int)))) { + perror("pidlist_append: out of memory"); + pids.size = pids.count = 0; + return; /* soldier on bravely */ + } + } + pids.pids[pids.count++] = pid; +} + +static void +pidlist_append(const char *pidname) +{ + pidlist_append_pid(atoi(pidname)); +} + +static void +tasklist_append(const char *pid) +{ + DIR *taskdirp; + struct dirent *tdp; + char taskpath[1024]; + + sprintf(taskpath, "%s/proc/%s/task", proc_statspath, pid); + if ((taskdirp = opendir(taskpath)) != NULL) { + while ((tdp = readdir(taskdirp)) != NULL) { + if (!isdigit((int)tdp->d_name[0]) || strcmp(pid, tdp->d_name) == 0) + continue; + pidlist_append(tdp->d_name); + } + closedir(taskdirp); + } +} + +static int +refresh_cgroup_pidlist(int want_threads, const char *cgroup) +{ + char path[MAXPATHLEN]; + FILE *fp; + int pid; + + /* + * We're running in cgroups mode where a subset of the processes is + * going to be returned based on the cgroup specified earlier via a + * store into the proc.control.{all,perclient}.cgroups metric. + * + * Use the "cgroup.procs" or "tasks" file depending on want_threads. + * Note that both these files are already sorted, ascending numeric. + */ + if (want_threads) + snprintf(path, sizeof(path), "%s%s/tasks", proc_statspath, cgroup); + else + snprintf(path, sizeof(path), "%s%s/cgroup.procs", proc_statspath, cgroup); + + if ((fp = fopen(path, "r")) != NULL) { + while (fscanf(fp, "%d\n", &pid) == 1) + pidlist_append_pid(pid); + fclose(fp); + } + return 0; +} + +static int +refresh_global_pidlist(int want_threads) +{ + DIR *dirp; + struct dirent *dp; + char path[MAXPATHLEN]; + + snprintf(path, sizeof(path), "%s/proc", proc_statspath); + if ((dirp = opendir(path)) == NULL) + return -oserror(); + + /* note: readdir on /proc ignores threads */ + while ((dp = readdir(dirp)) != NULL) { + if (isdigit((int)dp->d_name[0])) { + pidlist_append(dp->d_name); + if (want_threads) + tasklist_append(dp->d_name); + } + } + closedir(dirp); + + qsort(pids.pids, pids.count, sizeof(int), compare_pid); + return 0; +} + +static void +refresh_proc_pidlist(proc_pid_t *proc_pid) +{ + int i; + int fd; + char *p; + char buf[MAXPATHLEN]; + __pmHashNode *node, *next, *prev; + proc_pid_entry_t *ep; + pmdaIndom *indomp = proc_pid->indom; + + if (indomp->it_numinst < pids.count) + indomp->it_set = (pmdaInstid *)realloc(indomp->it_set, + pids.count * sizeof(pmdaInstid)); + indomp->it_numinst = pids.count; + + /* + * invalidate all entries so we can harvest pids that have exited + */ + for (i=0; i < proc_pid->pidhash.hsize; i++) { + for (node=proc_pid->pidhash.hash[i]; node != NULL; node = node->next) { + ep = (proc_pid_entry_t *)node->data; + ep->flags = 0; + } + } + + /* + * walk pid list and add new pids to the hash table, + * marking entries valid as we go ... 
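For reference, a standalone sketch of the scan that refresh_cgroup_pidlist() above performs. The cgroup v1 mount point used here is an assumption for illustration; the PMDA builds its real path from proc_statspath plus the client-supplied cgroup name:

    #include <stdio.h>

    int
    main(void)
    {
        /* assumed path: a cgroup v1 cpu controller mount */
        FILE *fp = fopen("/sys/fs/cgroup/cpu/cgroup.procs", "r");
        int pid;

        if (fp == NULL)
            return 1;
        while (fscanf(fp, "%d\n", &pid) == 1)   /* one pid per line, sorted */
            printf("%d\n", pid);
        fclose(fp);
        return 0;
    }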
+ */ + for (i=0; i < pids.count; i++) { + node = __pmHashSearch(pids.pids[i], &proc_pid->pidhash); + if (node == NULL) { + int k = 0; + + ep = (proc_pid_entry_t *)malloc(sizeof(proc_pid_entry_t)); + memset(ep, 0, sizeof(proc_pid_entry_t)); + + ep->id = pids.pids[i]; + + snprintf(buf, sizeof(buf), "%s/proc/%d/cmdline", proc_statspath, pids.pids[i]); + if ((fd = open(buf, O_RDONLY)) >= 0) { + sprintf(buf, "%06d ", pids.pids[i]); + if ((k = read(fd, buf+7, sizeof(buf)-8)) > 0) { + p = buf + k +7; + *p-- = '\0'; + /* Skip trailing nils, i.e. don't replace them */ + while (buf+7 < p) { + if (*p-- != '\0') { + break; + } + } + /* Remove NULL terminators from cmdline string array */ + /* Suggested by Mike Mason <mmlnx@us.ibm.com> */ + while (buf+7 < p) { + if (*p == '\0') *p = ' '; + p--; + } + } + close(fd); + } + + if (k == 0) { + /* + * If a process is swapped out, /proc/<pid>/cmdline + * returns an empty string so we have to get it + * from /proc/<pid>/status or /proc/<pid>/stat + */ + sprintf(buf, "%s/proc/%d/status", proc_statspath, pids.pids[i]); + if ((fd = open(buf, O_RDONLY)) >= 0) { + /* We engage in a bit of a hanky-panky here: + * the string should look like "123456 (name)", + * we get it from /proc/XX/status as "Name: name\n...", + * to fit the 6 digits of PID and opening parenthesis, + * save 2 bytes at the start of the buffer. + * And don't forget to leave 2 bytes for the trailing + * parenthesis and the nil. Here is + * an example of what we're trying to achieve: + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * | | | N| a| m| e| :|\t| i| n| i| t|\n| S|... + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * | 0| 0| 0| 0| 0| 1| | (| i| n| i| t| )|\0|... + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+ */ + if ((k = read(fd, buf+2, sizeof(buf)-4)) > 0) { + int bc; + + if ((p = strchr(buf+2, '\n')) == NULL) + p = buf+k; + p[0] = ')'; + p[1] = '\0'; + bc = sprintf(buf, "%06d ", pids.pids[i]); + buf[bc] = '('; + } + close(fd); + } + } + + if (k <= 0) { + /* hmm .. 
must be exiting */ + sprintf(buf, "%06d <exiting>", pids.pids[i]); + } + + ep->name = strdup(buf); + + __pmHashAdd(pids.pids[i], (void *)ep, &proc_pid->pidhash); + // fprintf(stderr, "## ADDED \"%s\" to hash table\n", buf); + } + else + ep = (proc_pid_entry_t *)node->data; + + /* mark pid as still existing */ + ep->flags |= PROC_PID_FLAG_VALID; + + /* refresh the indom pointer */ + indomp->it_set[i].i_inst = ep->id; + indomp->it_set[i].i_name = ep->name; + } + + /* + * harvest exited pids from the pid hash table + */ + for (i=0; i < proc_pid->pidhash.hsize; i++) { + for (prev=NULL, node=proc_pid->pidhash.hash[i]; node != NULL;) { + next = node->next; + ep = (proc_pid_entry_t *)node->data; + // fprintf(stderr, "CHECKING key=%d node=" PRINTF_P_PFX "%p prev=" PRINTF_P_PFX "%p next=" PRINTF_P_PFX "%p ep=" PRINTF_P_PFX "%p valid=%d\n", + // ep->id, node, prev, node->next, ep, ep->valid); + if (!(ep->flags & PROC_PID_FLAG_VALID)) { + // fprintf(stderr, "DELETED key=%d name=\"%s\"\n", ep->id, ep->name); + if (ep->name != NULL) + free(ep->name); + if (ep->stat_buf != NULL) + free(ep->stat_buf); + if (ep->status_buf != NULL) + free(ep->status_buf); + if (ep->statm_buf != NULL) + free(ep->statm_buf); + if (ep->maps_buf != NULL) + free(ep->maps_buf); + if (ep->schedstat_buf != NULL) + free(ep->schedstat_buf); + if (ep->io_buf != NULL) + free(ep->io_buf); + if (ep->wchan_buf != NULL) + free(ep->wchan_buf); + + if (prev == NULL) + proc_pid->pidhash.hash[i] = node->next; + else + prev->next = node->next; + free(ep); + free(node); + } + else { + prev = node; + } + if ((node = next) == NULL) + break; + } + } +} + +int +refresh_proc_pid(proc_pid_t *proc_pid, int threads, const char *cgroups) +{ + int sts; + + pids.count = 0; + pids.threads = threads; + + sts = (cgroups && cgroups[0] != '\0') ? + refresh_cgroup_pidlist(threads, cgroups) : + refresh_global_pidlist(threads); + if (sts < 0) + return sts; + +#if PCP_DEBUG + if (pmDebug & DBG_TRACE_LIBPMDA) + fprintf(stderr, + "refresh_proc_pid: %d pids (threads=%d, cgroups=\"%s\")\n", + sts, threads, cgroups ? cgroups : ""); +#endif + + refresh_proc_pidlist(proc_pid); + return 0; +} + + +/* + * Open a proc file, taking into account that we may want thread info + * rather than process information. + * + * We make (ab)use of some obscure Linux procfs mechanisms here! + * Even though readdir(/proc) does not contain tasks, we can still open + * taskid directory files; on top of that, the tasks sub-directory in a + * task group has all (peer) tasks in that group, even for "children". 
+ */ +static int +proc_open(const char *base, proc_pid_entry_t *ep) +{ + int fd; + char buf[128]; + + if (pids.threads) { + sprintf(buf, "%s/proc/%d/task/%d/%s", proc_statspath, ep->id, ep->id, base); + if ((fd = open(buf, O_RDONLY)) >= 0) + return fd; + /* fallback to /proc path if task path open fails */ + } + sprintf(buf, "%s/proc/%d/%s", proc_statspath, ep->id, base); + return open(buf, O_RDONLY); +} + +static DIR * +proc_opendir(const char *base, proc_pid_entry_t *ep) +{ + DIR *dir; + char buf[128]; + + if (pids.threads) { + sprintf(buf, "%s/proc/%d/task/%d/%s", proc_statspath, ep->id, ep->id, base); + if ((dir = opendir(buf)) != NULL) + return dir; + /* fallback to /proc path if task path opendir fails */ + } + sprintf(buf, "%s/proc/%d/%s", proc_statspath, ep->id, base); + return opendir(buf); +} + +/* + * fetch a proc/<pid>/stat entry for pid + */ +proc_pid_entry_t * +fetch_proc_pid_stat(int id, proc_pid_t *proc_pid) +{ + int fd; + int sts = 0; + int n; + __pmHashNode *node = __pmHashSearch(id, &proc_pid->pidhash); + proc_pid_entry_t *ep; + char buf[1024]; + + if (node == NULL) { +#if PCP_DEBUG + if ((pmDebug & (DBG_TRACE_LIBPMDA|DBG_TRACE_DESPERATE)) == (DBG_TRACE_LIBPMDA|DBG_TRACE_DESPERATE)) { + char ibuf[1024]; + fprintf(stderr, "fetch_proc_pid_stat: __pmHashSearch(%d, hash[%s]) -> NULL\n", id, pmInDomStr_r(proc_pid->indom->it_indom, ibuf, sizeof(ibuf))); + } +#endif + return NULL; + } + ep = (proc_pid_entry_t *)node->data; + + if (!(ep->flags & PROC_PID_FLAG_STAT_FETCHED)) { + if ((fd = proc_open("stat", ep)) < 0) { + sts = -oserror(); +#if PCP_DEBUG + if ((pmDebug & (DBG_TRACE_LIBPMDA|DBG_TRACE_DESPERATE)) == (DBG_TRACE_LIBPMDA|DBG_TRACE_DESPERATE)) { + char ibuf[1024]; + char ebuf[1024]; + fprintf(stderr, "fetch_proc_pid_stat: proc_open(\"stat\", ...) failed: id=%d, indom=%s, sts=%s\n", id, pmInDomStr_r(proc_pid->indom->it_indom, ibuf, sizeof(ibuf)), pmErrStr_r(sts, ebuf, sizeof(ebuf))); + } +#endif + } + else { + if ((n = read(fd, buf, sizeof(buf))) < 0) { + sts = -oserror(); +#if PCP_DEBUG + if ((pmDebug & (DBG_TRACE_LIBPMDA|DBG_TRACE_DESPERATE)) == (DBG_TRACE_LIBPMDA|DBG_TRACE_DESPERATE)) { + char ibuf[1024]; + char ebuf[1024]; + fprintf(stderr, "fetch_proc_pid_stat: read \"stat\" failed: id=%d, indom=%s, sts=%s\n", id, pmInDomStr_r(proc_pid->indom->it_indom, ibuf, sizeof(ibuf)), pmErrStr_r(sts, ebuf, sizeof(ebuf))); + } +#endif + } + else { + if (n == 0) { + /* eh? 
*/ + sts = -1; +#if PCP_DEBUG + if ((pmDebug & (DBG_TRACE_LIBPMDA|DBG_TRACE_DESPERATE)) == (DBG_TRACE_LIBPMDA|DBG_TRACE_DESPERATE)) { + char ibuf[1024]; + fprintf(stderr, "fetch_proc_pid_stat: read \"stat\" EOF?: id=%d, indom=%s\n", id, pmInDomStr_r(proc_pid->indom->it_indom, ibuf, sizeof(ibuf))); + } +#endif + } + else { + if (ep->stat_buflen <= n) { + ep->stat_buflen = n; + ep->stat_buf = (char *)realloc(ep->stat_buf, n); + } + memcpy(ep->stat_buf, buf, n); + ep->stat_buf[n-1] = '\0'; + sts = 0; + } + } + } + if (fd >= 0) + close(fd); + ep->flags |= PROC_PID_FLAG_STAT_FETCHED; + } + + if (!(ep->flags & PROC_PID_FLAG_WCHAN_FETCHED)) { + if ((fd = proc_open("wchan", ep)) < 0) { + /* ignore failure here, backwards compat */ + ; + } + else { + if ((n = read(fd, buf, sizeof(buf)-1)) < 0) { + sts = -oserror(); +#if PCP_DEBUG + if ((pmDebug & (DBG_TRACE_LIBPMDA|DBG_TRACE_DESPERATE)) == (DBG_TRACE_LIBPMDA|DBG_TRACE_DESPERATE)) { + char ibuf[1024]; + char ebuf[1024]; + fprintf(stderr, "fetch_proc_pid_stat: read \"wchan\" failed: id=%d, indom=%s, sts=%s\n", id, pmInDomStr_r(proc_pid->indom->it_indom, ibuf, sizeof(ibuf)), pmErrStr_r(sts, ebuf, sizeof(ebuf))); + } +#endif + } + else { + if (n == 0) { + /* wchan is empty, nothing to add here */ + ; + } + else { + n++; /* no terminating null (from kernel) */ + if (ep->wchan_buflen <= n) { + ep->wchan_buflen = n; + ep->wchan_buf = (char *)realloc(ep->wchan_buf, n); + } + memcpy(ep->wchan_buf, buf, n-1); + ep->wchan_buf[n-1] = '\0'; + } + } + } + if (fd >= 0) + close(fd); + ep->flags |= PROC_PID_FLAG_WCHAN_FETCHED; + } + + if (sts < 0) + return NULL; + return ep; +} + +/* + * fetch a proc/<pid>/status entry for pid + * Added by Mike Mason <mmlnx@us.ibm.com> + */ +proc_pid_entry_t * +fetch_proc_pid_status(int id, proc_pid_t *proc_pid) +{ + int sts = 0; + __pmHashNode *node = __pmHashSearch(id, &proc_pid->pidhash); + proc_pid_entry_t *ep; + + if (node == NULL) + return NULL; + ep = (proc_pid_entry_t *)node->data; + + if (!(ep->flags & PROC_PID_FLAG_STATUS_FETCHED)) { + int fd; + int n; + char buf[1024]; + char *curline; + + if ((fd = proc_open("status", ep)) < 0) + sts = -oserror(); + else if ((n = read(fd, buf, sizeof(buf))) < 0) + sts = -oserror(); + else { + if (n == 0) + sts = -1; + else { + if (ep->status_buflen < n) { + ep->status_buflen = n; + ep->status_buf = (char *)realloc(ep->status_buf, n); + } + + if (ep->status_buf == NULL) + sts = -1; + else { + memcpy(ep->status_buf, buf, n); + ep->status_buf[n-1] = '\0'; + } + } + } + + if (sts == 0) { + /* assign pointers to individual lines in buffer */ + curline = ep->status_buf; + + while (strncmp(curline, "Uid:", 4)) { + curline = index(curline, '\n') + 1; + } + + /* user & group IDs */ + ep->status_lines.uid = strsep(&curline, "\n"); + ep->status_lines.gid = strsep(&curline, "\n"); + + while (curline) { + if (strncmp(curline, "VmSize:", 7) == 0) { + /* memory info - these lines don't exist for kernel threads */ + ep->status_lines.vmsize = strsep(&curline, "\n"); + ep->status_lines.vmlck = strsep(&curline, "\n"); + if (strncmp(curline, "VmRSS:", 6) != 0) + curline = index(curline, '\n') + 1; // Have VmPin: ? + if (strncmp(curline, "VmRSS:", 6) != 0) + curline = index(curline, '\n') + 1; // Have VmHWM: ? 
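A standalone sketch of the strsep() line walk used in this status parsing; the sample buffer is made up, and real /proc/<pid>/status field order varies by kernel version (which is why the code above probes for the optional VmPin:/VmHWM: lines):

    #define _DEFAULT_SOURCE
    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        char buf[] = "VmSize:\t 1234 kB\nVmLck:\t    0 kB\nVmRSS:\t  456 kB\n";
        char *cur = buf, *line;

        /* each strsep() call terminates one line and advances cur */
        while ((line = strsep(&cur, "\n")) != NULL && *line != '\0')
            printf("line: %s\n", line);
        return 0;
    }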
+ ep->status_lines.vmrss = strsep(&curline, "\n"); + ep->status_lines.vmdata = strsep(&curline, "\n"); + ep->status_lines.vmstk = strsep(&curline, "\n"); + ep->status_lines.vmexe = strsep(&curline, "\n"); + ep->status_lines.vmlib = strsep(&curline, "\n"); + curline = index(curline, '\n') + 1; // skip VmPTE + ep->status_lines.vmswap = strsep(&curline, "\n"); + ep->status_lines.threads = strsep(&curline, "\n"); + } else + if (strncmp(curline, "SigPnd:", 7) == 0) { + /* signal masks */ + ep->status_lines.sigpnd = strsep(&curline, "\n"); + ep->status_lines.sigblk = strsep(&curline, "\n"); + ep->status_lines.sigign = strsep(&curline, "\n"); + ep->status_lines.sigcgt = strsep(&curline, "\n"); + break; /* we're done */ + } else { + curline = index(curline, '\n') + 1; + } + } + } + if (fd >= 0) + close(fd); + ep->flags |= PROC_PID_FLAG_STATUS_FETCHED; + } + + return (sts < 0) ? NULL : ep; +} + +/* + * fetch a proc/<pid>/statm entry for pid + */ +proc_pid_entry_t * +fetch_proc_pid_statm(int id, proc_pid_t *proc_pid) +{ + int sts = 0; + __pmHashNode *node = __pmHashSearch(id, &proc_pid->pidhash); + proc_pid_entry_t *ep; + + if (node == NULL) + return NULL; + ep = (proc_pid_entry_t *)node->data; + + if (!(ep->flags & PROC_PID_FLAG_STATM_FETCHED)) { + char buf[1024]; + int fd, n; + + if ((fd = proc_open("statm", ep)) < 0) + sts = -oserror(); + else + if ((n = read(fd, buf, sizeof(buf))) < 0) + sts = -oserror(); + else { + if (n == 0) + /* eh? */ + sts = -1; + else { + if (ep->statm_buflen <= n) { + ep->statm_buflen = n; + ep->statm_buf = (char *)realloc(ep->statm_buf, n); + } + memcpy(ep->statm_buf, buf, n); + ep->statm_buf[n-1] = '\0'; + } + } + + if (fd >= 0) + close(fd); + ep->flags |= PROC_PID_FLAG_STATM_FETCHED; + } + + return (sts < 0) ? NULL : ep; +} + + +/* + * fetch a proc/<pid>/maps entry for pid + * WARNING: This can be very large! Only ask for it if you really need it. + * Added by Mike Mason <mmlnx@us.ibm.com> + */ +proc_pid_entry_t * +fetch_proc_pid_maps(int id, proc_pid_t *proc_pid) +{ + int sts = 0; + __pmHashNode *node = __pmHashSearch(id, &proc_pid->pidhash); + proc_pid_entry_t *ep; + char *maps_bufptr = NULL; + + if (node == NULL) + return NULL; + ep = (proc_pid_entry_t *)node->data; + + if (!(ep->flags & PROC_PID_FLAG_MAPS_FETCHED)) { + int fd; + + if ((fd = proc_open("maps", ep)) < 0) + sts = -oserror(); + else { + char buf[1024]; + int n, len = 0; + + while ((n = read(fd, buf, sizeof(buf))) > 0) { + len += n; + if (ep->maps_buflen <= len) { + ep->maps_buflen = len + 1; + ep->maps_buf = (char *)realloc(ep->maps_buf, ep->maps_buflen); + } + maps_bufptr = ep->maps_buf + len - n; + memcpy(maps_bufptr, buf, n); + } + ep->flags |= PROC_PID_FLAG_MAPS_FETCHED; + /* If there are no maps, make maps_buf point to a zero length string. */ + if (ep->maps_buflen == 0) { + ep->maps_buf = (char *)malloc(1); + ep->maps_buflen = 1; + } + ep->maps_buf[ep->maps_buflen - 1] = '\0'; + close(fd); + } + } + + return (sts < 0) ? NULL : ep; +} + +/* + * fetch a proc/<pid>/schedstat entry for pid + */ +proc_pid_entry_t * +fetch_proc_pid_schedstat(int id, proc_pid_t *proc_pid) +{ + int sts = 0; + __pmHashNode *node = __pmHashSearch(id, &proc_pid->pidhash); + proc_pid_entry_t *ep; + + if (node == NULL) + return NULL; + ep = (proc_pid_entry_t *)node->data; + + if (!(ep->flags & PROC_PID_FLAG_SCHEDSTAT_FETCHED)) { + int fd, n; + char buf[1024]; + + if ((fd = proc_open("schedstat", ep)) < 0) + sts = -oserror(); + else + if ((n = read(fd, buf, sizeof(buf))) < 0) + sts = -oserror(); + else { + if (n == 0) + /* eh? 
*/ + sts = -1; + else { + if (ep->schedstat_buflen <= n) { + ep->schedstat_buflen = n; + ep->schedstat_buf = (char *)realloc(ep->schedstat_buf, n); + } + memcpy(ep->schedstat_buf, buf, n); + ep->schedstat_buf[n-1] = '\0'; + } + } + if (fd >= 0) { + close(fd); + } + ep->flags |= PROC_PID_FLAG_SCHEDSTAT_FETCHED; + } + + return (sts < 0) ? NULL : ep; +} + +/* + * fetch a proc/<pid>/io entry for pid + * + * Depends on kernel built with CONFIG_TASK_IO_ACCOUNTING=y + * which means the following must also be set: + * CONFIG_TASKSTATS=y + * CONFIG_TASK_DELAY_ACCT=y + * CONFIG_TASK_XACCT=y + */ +proc_pid_entry_t * +fetch_proc_pid_io(int id, proc_pid_t *proc_pid) +{ + int sts = 0; + __pmHashNode *node = __pmHashSearch(id, &proc_pid->pidhash); + proc_pid_entry_t *ep; + + if (node == NULL) + return NULL; + ep = (proc_pid_entry_t *)node->data; + + if (!(ep->flags & PROC_PID_FLAG_IO_FETCHED)) { + int fd, n; + char buf[1024]; + char *curline; + + if ((fd = proc_open("io", ep)) < 0) + sts = -oserror(); + else if ((n = read(fd, buf, sizeof(buf))) < 0) + sts = -oserror(); + else { + if (n == 0) + sts = -1; + else { + if (ep->io_buflen < n) { + ep->io_buflen = n; + ep->io_buf = (char *)realloc(ep->io_buf, n); + } + + if (ep->io_buf == NULL) + sts = -1; + else { + memcpy(ep->io_buf, buf, n); + ep->io_buf[n-1] = '\0'; + } + } + } + + if (sts == 0) { + /* assign pointers to individual lines in buffer */ + curline = ep->io_buf; + ep->io_lines.rchar = strsep(&curline, "\n"); + ep->io_lines.wchar = strsep(&curline, "\n"); + ep->io_lines.syscr = strsep(&curline, "\n"); + ep->io_lines.syscw = strsep(&curline, "\n"); + ep->io_lines.readb = strsep(&curline, "\n"); + ep->io_lines.writeb = strsep(&curline, "\n"); + ep->io_lines.cancel = strsep(&curline, "\n"); + ep->flags |= PROC_PID_FLAG_IO_FETCHED; + } + if (fd >= 0) + close(fd); + } + + return (sts < 0) ? NULL : ep; +} + +/* + * fetch a proc/<pid>/fd entry for pid + */ +proc_pid_entry_t * +fetch_proc_pid_fd(int id, proc_pid_t *proc_pid) +{ + __pmHashNode *node = __pmHashSearch(id, &proc_pid->pidhash); + proc_pid_entry_t *ep; + + if (node == NULL) + return NULL; + ep = (proc_pid_entry_t *)node->data; + + if (!(ep->flags & PROC_PID_FLAG_FD_FETCHED)) { + uint32_t de_count = 0; + DIR *dir = proc_opendir("fd", ep); + + if (dir == NULL) { +#if PCP_DEBUG + if (pmDebug & DBG_TRACE_LIBPMDA) + fprintf(stderr, "failed to open fd path for pid %d\n", ep->id); +#endif + return NULL; + } + while (readdir(dir) != NULL) { + de_count++; + } + closedir(dir); + ep->fd_count = de_count - 2; /* subtract cwd and parent entries */ + ep->flags |= PROC_PID_FLAG_FD_FETCHED; + } + + return ep; +} + +/* + * From the kernel format for a single process cgroup set: + * 2:cpu:/ + * 1:cpuset:/ + * + * Produce the same one-line format string that "ps" uses: + * "cpu:/;cpuset:/" + */ +static void +proc_cgroup_reformat(char *buf, int len, char *fmt) +{ + char *target = fmt, *p, *s = NULL; + + *target = '\0'; + for (p = buf; p - buf < len; p++) { + if (*p == '\0') + break; + if (*p == ':' && !s) /* position "s" at start */ + s = p + 1; + if (*p != '\n' || !s) /* find end of this line */ + continue; + if (target != fmt) /* not the first cgroup? 
*/ + strncat(target, ";", 2); + /* have a complete cgroup line now, copy it over */ + strncat(target, s, (p - s)); + target += (p - s); + s = NULL; /* reset it for new line */ + } +} + +/* + * fetch a proc/<pid>/cgroup entry for pid + */ +proc_pid_entry_t * +fetch_proc_pid_cgroup(int id, proc_pid_t *proc_pid) +{ + __pmHashNode *node = __pmHashSearch(id, &proc_pid->pidhash); + proc_pid_entry_t *ep; + int sts = 0; + + if (node == NULL) + return NULL; + ep = (proc_pid_entry_t *)node->data; + + if (!(ep->flags & PROC_PID_FLAG_CGROUP_FETCHED)) { + char buf[1024]; + char fmt[1024]; + int n, fd; + + if ((fd = proc_open("cgroup", ep)) < 0) + sts = -oserror(); + else if ((n = read(fd, buf, sizeof(buf))) < 0) + sts = -oserror(); + else { + if (n == 0) { + setoserror(ENODATA); + sts = -1; + } + else { + /* reformat the buffer to match "ps" output format, then hash */ + proc_cgroup_reformat(&buf[0], n, &fmt[0]); + ep->cgroup_id = proc_strings_insert(fmt); + } + } + if (fd >= 0) + close(fd); + ep->flags |= PROC_PID_FLAG_CGROUP_FETCHED; + } + + return (sts < 0) ? NULL : ep; +} + +/* + * fetch a proc/<pid>/attr/current entry for pid + */ +proc_pid_entry_t * +fetch_proc_pid_label(int id, proc_pid_t *proc_pid) +{ + __pmHashNode *node = __pmHashSearch(id, &proc_pid->pidhash); + proc_pid_entry_t *ep; + int sts = 0; + + if (node == NULL) + return NULL; + ep = (proc_pid_entry_t *)node->data; + + if (!(ep->flags & PROC_PID_FLAG_LABEL_FETCHED)) { + char buf[1024]; + int n, fd; + + if ((fd = proc_open("attr/current", ep)) < 0) + sts = -oserror(); + else if ((n = read(fd, buf, sizeof(buf))) < 0) + sts = -oserror(); + else { + if (n == 0) { + setoserror(ENODATA); + sts = -1; + } else { + /* buffer matches "ps" output format, direct hash */ + buf[sizeof(buf)-1] = '\0'; + ep->label_id = proc_strings_insert(buf); + } + } + if (fd >= 0) + close(fd); + ep->flags |= PROC_PID_FLAG_LABEL_FETCHED; + } + + return (sts < 0) ? NULL : ep; +} + +/* + * Extract the ith (space separated) field from a char buffer. + * The first field starts at zero. + * BEWARE: return copy is in a static buffer. + */ +char * +_pm_getfield(char *buf, int field) +{ + static int retbuflen = 0; + static char *retbuf = NULL; + char *p; + int i; + + if (buf == NULL) + return NULL; + + for (p=buf, i=0; i < field; i++) { + /* skip to the next space */ + for (; *p && !isspace((int)*p); p++) {;} + + /* skip to the next word */ + for (; *p && isspace((int)*p); p++) {;} + } + + /* return a null terminated copy of the field */ + for (i=0; ; i++) { + if (isspace((int)p[i]) || p[i] == '\0' || p[i] == '\n') + break; + } + + if (i >= retbuflen) { + retbuflen = i+4; + retbuf = (char *)realloc(retbuf, retbuflen); + } + memcpy(retbuf, p, i); + retbuf[i] = '\0'; + + return retbuf; +} diff --git a/src/pmdas/linux_proc/proc_pid.h b/src/pmdas/linux_proc/proc_pid.h new file mode 100644 index 0000000..8835157 --- /dev/null +++ b/src/pmdas/linux_proc/proc_pid.h @@ -0,0 +1,289 @@ +/* + * Linux /proc/<pid>/... Clusters + * + * Copyright (c) 2013 Red Hat. + * Copyright (c) 2000,2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. 
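Illustrative fragment (assumes proc_pid.h is included for the field defines) showing how callers use the _pm_getfield() helper above against a space-separated /proc/<pid>/stat style buffer; fields are counted from zero, and the returned pointer refers to a static buffer, so copy it before the next call if it needs to be kept:

    char stat_line[] = "1234 (init) S 1 1234 1234";
    printf("pid   = %s\n", _pm_getfield(stat_line, PROC_PID_STAT_PID));    /* "1234" */
    printf("state = %s\n", _pm_getfield(stat_line, PROC_PID_STAT_STATE));  /* "S" */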
See the GNU General Public License + * for more details. + */ + +#ifndef _PROC_PID_H +#define _PROC_PID_H + +/* + * /proc/<pid>/stat metrics + */ +#define PROC_PID_STAT_PID 0 +#define PROC_PID_STAT_CMD 1 +#define PROC_PID_STAT_STATE 2 +#define PROC_PID_STAT_PPID 3 +#define PROC_PID_STAT_PGRP 4 +#define PROC_PID_STAT_SESSION 5 +#define PROC_PID_STAT_TTY 6 +#define PROC_PID_STAT_TTY_PGRP 7 +#define PROC_PID_STAT_FLAGS 8 +#define PROC_PID_STAT_MINFLT 9 +#define PROC_PID_STAT_CMIN_FLT 10 +#define PROC_PID_STAT_MAJ_FLT 11 +#define PROC_PID_STAT_CMAJ_FLT 12 +#define PROC_PID_STAT_UTIME 13 +#define PROC_PID_STAT_STIME 14 +#define PROC_PID_STAT_CUTIME 15 +#define PROC_PID_STAT_CSTIME 16 +#define PROC_PID_STAT_PRIORITY 17 +#define PROC_PID_STAT_NICE 18 +#define PROC_PID_STAT_REMOVED 19 +#define PROC_PID_STAT_IT_REAL_VALUE 20 +#define PROC_PID_STAT_START_TIME 21 +#define PROC_PID_STAT_VSIZE 22 +#define PROC_PID_STAT_RSS 23 +#define PROC_PID_STAT_RSS_RLIM 24 +#define PROC_PID_STAT_START_CODE 25 +#define PROC_PID_STAT_END_CODE 26 +#define PROC_PID_STAT_START_STACK 27 +#define PROC_PID_STAT_ESP 28 +#define PROC_PID_STAT_EIP 29 +#define PROC_PID_STAT_SIGNAL 30 +#define PROC_PID_STAT_BLOCKED 31 +#define PROC_PID_STAT_SIGIGNORE 32 +#define PROC_PID_STAT_SIGCATCH 33 +#define PROC_PID_STAT_WCHAN 34 +#define PROC_PID_STAT_NSWAP 35 +#define PROC_PID_STAT_CNSWAP 36 +#define PROC_PID_STAT_EXIT_SIGNAL 37 +#define PROC_PID_STAT_PROCESSOR 38 +#define PROC_PID_STAT_TTYNAME 39 +#define PROC_PID_STAT_WCHAN_SYMBOL 40 +#define PROC_PID_STAT_PSARGS 41 + +/* number of fields in proc_pid_stat_entry_t */ +#define NR_PROC_PID_STAT 42 + +/* + * metrics in /proc/<pid>/status + * Added by Mike Mason <mmlnx@us.ibm.com> + */ +#define PROC_PID_STATUS_UID 0 +#define PROC_PID_STATUS_EUID 1 +#define PROC_PID_STATUS_SUID 2 +#define PROC_PID_STATUS_FSUID 3 +#define PROC_PID_STATUS_GID 4 +#define PROC_PID_STATUS_EGID 5 +#define PROC_PID_STATUS_SGID 6 +#define PROC_PID_STATUS_FSGID 7 +#define PROC_PID_STATUS_UID_NM 8 +#define PROC_PID_STATUS_EUID_NM 9 +#define PROC_PID_STATUS_SUID_NM 10 +#define PROC_PID_STATUS_FSUID_NM 11 +#define PROC_PID_STATUS_GID_NM 12 +#define PROC_PID_STATUS_EGID_NM 13 +#define PROC_PID_STATUS_SGID_NM 14 +#define PROC_PID_STATUS_FSGID_NM 15 +#define PROC_PID_STATUS_SIGNAL 16 +#define PROC_PID_STATUS_BLOCKED 17 +#define PROC_PID_STATUS_SIGIGNORE 18 +#define PROC_PID_STATUS_SIGCATCH 19 +#define PROC_PID_STATUS_VMSIZE 20 +#define PROC_PID_STATUS_VMLOCK 21 +#define PROC_PID_STATUS_VMRSS 22 +#define PROC_PID_STATUS_VMDATA 23 +#define PROC_PID_STATUS_VMSTACK 24 +#define PROC_PID_STATUS_VMEXE 25 +#define PROC_PID_STATUS_VMLIB 26 +#define PROC_PID_STATUS_VMSWAP 27 +#define PROC_PID_STATUS_THREADS 28 + +/* number of metrics from /proc/<pid>/status */ +#define NR_PROC_PID_STATUS 27 + +/* + * metrics in /proc/<pid>/statm & /proc/<pid>/maps + */ +#define PROC_PID_STATM_SIZE 0 +#define PROC_PID_STATM_RSS 1 +#define PROC_PID_STATM_SHARE 2 +#define PROC_PID_STATM_TEXTRS 3 +#define PROC_PID_STATM_LIBRS 4 +#define PROC_PID_STATM_DATRS 5 +#define PROC_PID_STATM_DIRTY 6 +#define PROC_PID_STATM_MAPS 7 + +/* number of fields in proc_pid_statm_entry_t */ +#define NR_PROC_PID_STATM 8 + +/* + * metrics in /proc/<pid>/schedstat + */ +#define PROC_PID_SCHED_CPUTIME 0 +#define PROC_PID_SCHED_RUNDELAY 1 +#define PROC_PID_SCHED_PCOUNT 2 +#define NR_PROC_PID_SCHED 3 + +/* + * metrics in /proc/<pid>/io + */ +#define PROC_PID_IO_RCHAR 0 +#define PROC_PID_IO_WCHAR 1 +#define PROC_PID_IO_SYSCR 2 +#define PROC_PID_IO_SYSCW 3 +#define 
PROC_PID_IO_READ_BYTES 4 +#define PROC_PID_IO_WRITE_BYTES 5 +#define PROC_PID_IO_CANCELLED_BYTES 6 + +/* + * metrics in /proc/<pid>/fd + */ +#define PROC_PID_FD_COUNT 0 + + +/* + * metrics in /proc/<pid>/cgroup + */ +#define PROC_PID_CGROUP 0 + +/* + * metrics in /proc/<pid>/attr/current + */ +#define PROC_PID_LABEL 0 + +typedef struct { /* /proc/<pid>/status */ + char *uid; + char *gid; + char *sigpnd; + char *sigblk; + char *sigign; + char *sigcgt; + char *vmsize; + char *vmlck; + char *vmrss; + char *vmdata; + char *vmstk; + char *vmexe; + char *vmlib; + char *vmswap; + char *threads; +} status_lines_t; + +typedef struct { /* /proc/<pid>/io */ + char *rchar; + char *wchar; + char *syscr; + char *syscw; + char *readb; + char *writeb; + char *cancel; +} io_lines_t; + +enum { + PROC_PID_FLAG_VALID = 1<<0, + PROC_PID_FLAG_STAT_FETCHED = 1<<1, + PROC_PID_FLAG_STATM_FETCHED = 1<<2, + PROC_PID_FLAG_MAPS_FETCHED = 1<<3, + PROC_PID_FLAG_STATUS_FETCHED = 1<<4, + PROC_PID_FLAG_SCHEDSTAT_FETCHED = 1<<5, + PROC_PID_FLAG_IO_FETCHED = 1<<6, + PROC_PID_FLAG_WCHAN_FETCHED = 1<<7, + PROC_PID_FLAG_FD_FETCHED = 1<<8, + PROC_PID_FLAG_CGROUP_FETCHED = 1<<9, + PROC_PID_FLAG_LABEL_FETCHED = 1<<10, +}; + +typedef struct { + int id; /* pid, hash key and internal instance id */ + int flags; /* combinations of PROC_PID_FLAG_* values */ + char *name; /* external instance name (<pid> cmdline) */ + + /* /proc/<pid>/stat cluster */ + int stat_buflen; + char *stat_buf; + + /* /proc/<pid>/statm and /proc/<pid>/maps cluster */ + int statm_buflen; + char *statm_buf; + int maps_buflen; + char *maps_buf; + + /* /proc/<pid>/status cluster */ + int status_buflen; + char *status_buf; + status_lines_t status_lines; + + /* /proc/<pid>/schedstat cluster */ + int schedstat_buflen; + char *schedstat_buf; + + /* /proc/<pid>/io cluster */ + int io_buflen; + char *io_buf; + io_lines_t io_lines; + + /* /proc/<pid>/wchan cluster */ + int wchan_buflen; + char *wchan_buf; + + /* /proc/<pid>/fd cluster */ + int fd_buflen; + uint32_t fd_count; + char *fd_buf; + + /* /proc/<pid>/cgroup cluster */ + int cgroup_id; + + /* /proc/<pid>/attr/current cluster */ + int label_id; +} proc_pid_entry_t; + +typedef struct { + __pmHashCtl pidhash; /* hash table for current pids */ + pmdaIndom *indom; /* instance domain table */ +} proc_pid_t; + +typedef struct { + int count; /* number of processes in the list */ + int size; /* size of the buffer (pids) allocated */ + int *pids; /* array of process identifiers */ + int threads; /* /proc/PID/{xxx,task/PID/xxx} flag */ +} proc_pid_list_t; + +/* refresh the proc indom, reset all "fetched" flags */ +extern int refresh_proc_pid(proc_pid_t *, int, const char *); + +/* fetch a proc/<pid>/stat entry for pid */ +extern proc_pid_entry_t *fetch_proc_pid_stat(int, proc_pid_t *); + +/* fetch a proc/<pid>/statm entry for pid */ +extern proc_pid_entry_t *fetch_proc_pid_statm(int, proc_pid_t *); + +/* fetch a proc/<pid>/status entry for pid */ +extern proc_pid_entry_t *fetch_proc_pid_status(int, proc_pid_t *); + +/* fetch a proc/<pid>/maps entry for pid */ +extern proc_pid_entry_t *fetch_proc_pid_maps(int, proc_pid_t *); + +/* fetch a proc/<pid>/schedstat entry for pid */ +extern proc_pid_entry_t *fetch_proc_pid_schedstat(int, proc_pid_t *); + +/* fetch a proc/<pid>/io entry for pid */ +extern proc_pid_entry_t *fetch_proc_pid_io(int, proc_pid_t *); + +/* fetch a proc/<pid>/fd entry for pid */ +extern proc_pid_entry_t *fetch_proc_pid_fd(int, proc_pid_t *); + +/* fetch a proc/<pid>/cgroup entry for pid */ +extern 
diff --git a/src/pmdas/linux_proc/proc_runq.c b/src/pmdas/linux_proc/proc_runq.c
new file mode 100644
index 0000000..07b68dc
--- /dev/null
+++ b/src/pmdas/linux_proc/proc_runq.c
@@ -0,0 +1,123 @@
+/*
+ * Linux /proc/runq metrics cluster
+ *
+ * Copyright (c) 2000,2004 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+
+#include "pmapi.h"
+#include "impl.h"
+#include "pmda.h"
+#include <ctype.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include "proc_pid.h"
+#include "proc_runq.h"
+
+int
+refresh_proc_runq(proc_runq_t *proc_runq)
+{
+    int		sz;
+    int		fd;
+    char	*p;
+    int		sname;
+    DIR		*dir;
+    struct dirent *d;
+    char	fullpath[MAXPATHLEN];
+    char	buf[4096];
+
+    memset(proc_runq, 0, sizeof(proc_runq_t));
+    if ((dir = opendir("/proc")) == NULL)
+	return -oserror();
+
+    while ((d = readdir(dir)) != NULL) {
+	if (!isdigit((int)d->d_name[0]))
+	    continue;
+	sprintf(fullpath, "/proc/%s/stat", d->d_name);
+	if ((fd = open(fullpath, O_RDONLY)) < 0)
+	    continue;
+	sz = read(fd, buf, sizeof(buf)-1);
+	close(fd);
+	buf[sz > 0 ? sz : 0] = '\0';	/* terminate at the end of the data actually read */
+
+	/*
+	 * defunct (state name is 'Z')
+	 */
+	if (sz <= 0 || (p = _pm_getfield(buf, PROC_PID_STAT_STATE)) == NULL) {
+	    proc_runq->unknown++;
+	    continue;
+	}
+	if ((sname = *p) == 'Z') {
+	    proc_runq->defunct++;
+	    continue;
+	}
+
+	/*
+	 * kernel process (not defunct and virtual size is zero)
+	 */
+	if ((p = _pm_getfield(buf, PROC_PID_STAT_VSIZE)) == NULL) {
+	    proc_runq->unknown++;
+	    continue;
+	}
+	if (strcmp(p, "0") == 0) {
+	    proc_runq->kernel++;
+	    continue;
+	}
+
+	/*
+	 * swapped (resident set size is zero)
+	 */
+	if ((p = _pm_getfield(buf, PROC_PID_STAT_RSS)) == NULL) {
+	    proc_runq->unknown++;
+	    continue;
+	}
+	if (strcmp(p, "0") == 0) {
+	    proc_runq->swapped++;
+	    continue;
+	}
+
+	/*
+	 * All other states
+	 */
+	switch (sname) {
+	case 'R':
+	    proc_runq->runnable++;
+	    break;
+	case 'S':
+	    proc_runq->sleeping++;
+	    break;
+	case 'T':
+	    proc_runq->stopped++;
+	    break;
+	case 'D':
+	    proc_runq->blocked++;
+	    break;
+	/* case 'Z':
+	    break; -- already counted above */
+	default:
+	    fprintf(stderr, "UNKNOWN %c : %s\n", sname, buf);
+	    proc_runq->unknown++;
+	    break;
+	}
+    }
+    closedir(dir);
+
+#if PCP_DEBUG
+    if (pmDebug & DBG_TRACE_LIBPMDA) {
+	fprintf(stderr, "refresh_runq: runnable=%d sleeping=%d stopped=%d blocked=%d unknown=%d\n",
+	    proc_runq->runnable, proc_runq->sleeping, proc_runq->stopped,
+	    proc_runq->blocked, proc_runq->unknown);
+    }
+#endif
+
+    return 0;
+}
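A caller samples the run-queue breakdown by handing refresh_proc_runq() a proc_runq_t and reading the counters it fills in. The fragment below is a hedged test-harness sketch rather than PMDA code: it assumes it is compiled inside the PCP tree and linked against proc_runq.o, proc_pid.o (for _pm_getfield) and libpcp, since refresh_proc_runq() uses oserror() and the debug tracing globals.

/*
 * Illustrative harness only (assumed in-tree build); not part of the PMDA.
 */
#include <stdio.h>
#include "pmapi.h"
#include "proc_runq.h"

int
main(void)
{
    proc_runq_t	runq;
    int		sts;

    if ((sts = refresh_proc_runq(&runq)) < 0) {
	fprintf(stderr, "refresh_proc_runq: %s\n", pmErrStr(sts));
	return 1;
    }
    printf("runnable=%d sleeping=%d blocked=%d stopped=%d\n",
	   runq.runnable, runq.sleeping, runq.blocked, runq.stopped);
    printf("swapped=%d kernel=%d defunct=%d unknown=%d\n",
	   runq.swapped, runq.kernel, runq.defunct, runq.unknown);
    return 0;
}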
diff --git a/src/pmdas/linux_proc/proc_runq.h b/src/pmdas/linux_proc/proc_runq.h
new file mode 100644
index 0000000..9739208
--- /dev/null
+++ b/src/pmdas/linux_proc/proc_runq.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2000,2004 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _PROC_RUNQ_H
+#define _PROC_RUNQ_H
+
+typedef struct {
+    int		runnable;
+    int		blocked;
+    int		sleeping;
+    int		stopped;
+    int		swapped;
+    int		kernel;
+    int		defunct;
+    int		unknown;
+} proc_runq_t;
+
+extern int refresh_proc_runq(proc_runq_t *);
+
+#endif /* _PROC_RUNQ_H */
diff --git a/src/pmdas/linux_proc/root b/src/pmdas/linux_proc/root
new file mode 100644
index 0000000..5f26a89
--- /dev/null
+++ b/src/pmdas/linux_proc/root
@@ -0,0 +1,6 @@
+/*
+ * fake "root" for validating the local PMNS subtree
+ */
+
+#include <stdpmid>
+#include "root_proc"
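The fake root above simply wraps root_proc so the PMNS fragment can be checked without a running pmcd. As a rough illustration, something like the following could load and walk that subtree locally; it assumes it is run in this directory and that the #include of <stdpmid> can be resolved when the namespace file is processed, which may not hold outside the build environment.

/*
 * Hedged sketch: load the local PMNS fragment and list the proc subtree.
 * Not part of the build; the real validation uses the PCP pmns tooling.
 */
#include <stdio.h>
#include <pcp/pmapi.h>

static void
dometric(const char *name)
{
    printf("%s\n", name);
}

int
main(void)
{
    int	sts;

    /* "root" is the fake root shown above, with root_proc alongside it */
    if ((sts = pmLoadASCIINameSpace("root", 1)) < 0) {
	fprintf(stderr, "pmLoadASCIINameSpace: %s\n", pmErrStr(sts));
	return 1;
    }
    if ((sts = pmTraversePMNS("proc", dometric)) < 0) {
	fprintf(stderr, "pmTraversePMNS: %s\n", pmErrStr(sts));
	return 1;
    }
    return 0;
}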
diff --git a/src/pmdas/linux_proc/root_proc b/src/pmdas/linux_proc/root_proc
new file mode 100644
index 0000000..91b8654
--- /dev/null
+++ b/src/pmdas/linux_proc/root_proc
@@ -0,0 +1,181 @@
+/*
+ * Metrics for the Linux proc PMDA
+ *
+ * Note:
+ *	names and pmids migrated from the Linux PMDA, with the domain
+ *	number changed from LINUX (60) to PROC (3)
+ */
+
+#ifndef PROC
+#define PROC 3
+#endif
+
+root {
+    cgroup
+    proc
+}
+
+cgroup {
+    subsys
+    mounts
+    groups	PROC:*:*
+}
+
+cgroup.subsys {
+    hierarchy	PROC:37:0
+    count	PROC:37:1
+}
+
+cgroup.mounts {
+    subsys	PROC:38:0
+    count	PROC:38:1
+}
+
+proc {
+    nprocs	PROC:8:99
+    psinfo
+    memory
+    runq
+    id
+    io
+    schedstat
+    fd
+    control
+}
+
+proc.psinfo {
+    pid		PROC:8:0
+    cmd		PROC:8:1
+    sname	PROC:8:2
+    ppid	PROC:8:3
+    pgrp	PROC:8:4
+    session	PROC:8:5
+    tty		PROC:8:6
+    tty_pgrp	PROC:8:7
+    flags	PROC:8:8
+    minflt	PROC:8:9
+    cmin_flt	PROC:8:10
+    maj_flt	PROC:8:11
+    cmaj_flt	PROC:8:12
+    utime	PROC:8:13
+    stime	PROC:8:14
+    cutime	PROC:8:15
+    cstime	PROC:8:16
+    priority	PROC:8:17
+    nice	PROC:8:18
+    /* not valid in 2.2.1	PROC:8:19 */
+    it_real_value	PROC:8:20
+    start_time	PROC:8:21
+    vsize	PROC:8:22
+    rss		PROC:8:23
+    rss_rlim	PROC:8:24
+    start_code	PROC:8:25
+    end_code	PROC:8:26
+    start_stack	PROC:8:27
+    esp		PROC:8:28
+    eip		PROC:8:29
+    signal	PROC:8:30
+    blocked	PROC:8:31
+    sigignore	PROC:8:32
+    sigcatch	PROC:8:33
+    wchan	PROC:8:34
+    nswap	PROC:8:35
+    cnswap	PROC:8:36
+    exit_signal	PROC:8:37
+    processor	PROC:8:38
+    ttyname	PROC:8:39
+    wchan_s	PROC:8:40
+    psargs	PROC:8:41
+    signal_s	PROC:24:16
+    blocked_s	PROC:24:17
+    sigignore_s	PROC:24:18
+    sigcatch_s	PROC:24:19
+    threads	PROC:24:28
+    cgroups	PROC:11:0
+    labels	PROC:12:0
+}
+
+proc.id {
+    uid		PROC:24:0
+    euid	PROC:24:1
+    suid	PROC:24:2
+    fsuid	PROC:24:3
+    gid		PROC:24:4
+    egid	PROC:24:5
+    sgid	PROC:24:6
+    fsgid	PROC:24:7
+    uid_nm	PROC:24:8
+    euid_nm	PROC:24:9
+    suid_nm	PROC:24:10
+    fsuid_nm	PROC:24:11
+    gid_nm	PROC:24:12
+    egid_nm	PROC:24:13
+    sgid_nm	PROC:24:14
+    fsgid_nm	PROC:24:15
+}
+
+proc.memory {
+    size	PROC:9:0
+    rss		PROC:9:1
+    share	PROC:9:2
+    textrss	PROC:9:3
+    librss	PROC:9:4
+    datrss	PROC:9:5
+    dirty	PROC:9:6
+    maps	PROC:9:7
+    vmsize	PROC:24:20
+    vmlock	PROC:24:21
+    vmrss	PROC:24:22
+    vmdata	PROC:24:23
+    vmstack	PROC:24:24
+    vmexe	PROC:24:25
+    vmlib	PROC:24:26
+    vmswap	PROC:24:27
+}
+
+proc.runq {
+    runnable	PROC:13:0
+    blocked	PROC:13:1
+    sleeping	PROC:13:2
+    stopped	PROC:13:3
+    swapped	PROC:13:4
+    defunct	PROC:13:5
+    unknown	PROC:13:6
+    kernel	PROC:13:7
+}
+
+proc.io {
+    rchar	PROC:32:0
+    wchar	PROC:32:1
+    syscr	PROC:32:2
+    syscw	PROC:32:3
+    read_bytes	PROC:32:4
+    write_bytes	PROC:32:5
+    cancelled_write_bytes	PROC:32:6
+}
+
+proc.schedstat {
+    cpu_time	PROC:31:0
+    run_delay	PROC:31:1
+    pcount	PROC:31:2
+}
+
+proc.fd {
+    count	PROC:51:0
+}
+
+proc.control {
+    all
+    perclient
+}
+
+proc.control.all {
+    threads	PROC:10:1
+}
+
+proc.control.perclient {
+    threads	PROC:10:2
+    cgroups	PROC:10:3
+}
+
+#undef PROC
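Each leaf in root_proc binds a metric name to a PMID written as PROC:cluster:item, with PROC fixed to domain 3 to match the pmcd.conf entry added by the GNUmakefile. A minimal client sketch along the following lines can confirm the mapping once the PMDA is installed; the host name and the exact pmLookupName() prototype vary between PCP versions, so treat this as illustrative rather than definitive.

/*
 * Sketch: resolve two of the names declared above to their PMIDs via a
 * local pmcd and print them.  Assumes pmcd is running with the proc
 * PMDA installed.
 */
#include <stdio.h>
#include <pcp/pmapi.h>

int
main(void)
{
    char	*names[] = { "proc.nprocs", "proc.runq.runnable" };
    pmID	pmids[2];
    int		sts, i;

    if ((sts = pmNewContext(PM_CONTEXT_HOST, "localhost")) < 0) {
	fprintf(stderr, "pmNewContext: %s\n", pmErrStr(sts));
	return 1;
    }
    if ((sts = pmLookupName(2, names, pmids)) < 0) {
	fprintf(stderr, "pmLookupName: %s\n", pmErrStr(sts));
	return 1;
    }
    for (i = 0; i < 2; i++)
	printf("%-20s -> %s\n", names[i], pmIDStr(pmids[i]));
    /* e.g. proc.nprocs should print as 3.8.99, matching PROC:8:99 above */
    return 0;
}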