path: root/src/pmdas/linux_proc/cgroups.c
Diffstat (limited to 'src/pmdas/linux_proc/cgroups.c')
-rw-r--r--  src/pmdas/linux_proc/cgroups.c  1146
1 files changed, 1146 insertions, 0 deletions
diff --git a/src/pmdas/linux_proc/cgroups.c b/src/pmdas/linux_proc/cgroups.c
new file mode 100644
index 0000000..4994465
--- /dev/null
+++ b/src/pmdas/linux_proc/cgroups.c
@@ -0,0 +1,1146 @@
+/*
+ * Copyright (c) 2012-2014 Red Hat.
+ * Copyright (c) 2010 Aconex. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+
+#include "pmapi.h"
+#include "impl.h"
+#include "pmda.h"
+#include "indom.h"
+#include "cgroups.h"
+#include "clusters.h"
+#include "proc_pid.h"
+#include <sys/stat.h>
+#include <ctype.h>
+
+#define CGROUP_ROOT "cgroup.groups" /* root dynamic PMNS node */
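+/*
+ * Metric names are built below (see update_pmns) in the form
+ * cgroup.groups.<subsys>[.<group>].<metric> - for example,
+ * cgroup.groups.cpuacct.usage, or for a (hypothetical) cgroup
+ * directory named "web", cgroup.groups.cpuacct.web.usage.
+ */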
+
+/* Add namespace entries and prepare values for one cgroupfs directory entry */
+struct cgroup_subsys;
+typedef int (*cgroup_prepare_t)(__pmnsTree *, const char *,
+ struct cgroup_subsys *, const char *, int, int, int);
+static int prepare_ull(__pmnsTree *, const char *,
+ struct cgroup_subsys *, const char *, int, int, int);
+static int prepare_string(__pmnsTree *, const char *,
+ struct cgroup_subsys *, const char *, int, int, int);
+static int prepare_named_ull(__pmnsTree *, const char *,
+ struct cgroup_subsys *, const char *, int, int, int);
+static int prepare_block_ull(__pmnsTree *, const char *,
+ struct cgroup_subsys *, const char *, int, int, int);
+static int prepare_blocks_ull(__pmnsTree *, const char *,
+ struct cgroup_subsys *, const char *, int, int, int);
+
+/*
+ * Critical data structures for the cgroup subsystem in pmdaproc ...
+ * The comment on each struct describes the lifecycle of that data,
+ * in terms of what pmdaproc must do with it (especially memory
+ * allocation).
+ */
+
+typedef struct { /* contents depend on individual kernel cgroups */
+ int item; /* PMID == domain:cluster:[id:item] */
+ int dynamic; /* do we need an extra free (string) */
+ cgroup_prepare_t prepare; /* setup metric name(s) and value(s) */
+ char *suffix; /* cpus/mems/rss/... */
+} cgroup_metrics_t;
+
+typedef struct { /* some metrics are multi-valued, but most have only one */
+ int item; /* PMID == domain:cluster:[id:item] */
+ int atom_count;
+ pmAtomValue *atoms;
+} cgroup_values_t;
+
+typedef struct { /* contains data for each group users have created, if any */
+ int id; /* PMID == domain:cluster:[id:item] */
+ int refreshed; /* boolean: are values all up-to-date */
+ proc_pid_list_t process_list;
+ cgroup_values_t *metric_values;
+} cgroup_group_t;
+
+typedef struct cgroup_subsys { /* contents covers the known kernel cgroups */
+ const char *name; /* cpuset/memory/... */
+ int cluster; /* PMID == domain:cluster:[id:item] */
+ int group_count; /* number of groups (dynamic) */
+ int metric_count; /* number of metrics (fixed) */
+ time_t previous_time; /* used to avoid repeated refresh */
+ cgroup_group_t *groups; /* array of groups (dynamic) */
+ cgroup_metrics_t *metrics; /* array of metrics (fixed) */
+} cgroup_subsys_t;
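+
+/*
+ * Taken together: a value lives at
+ * controllers[i].groups[g].metric_values[m], and is addressed
+ * externally via the PMID components domain:cluster:[id:item] noted
+ * above - cluster identifies the subsystem, id the group, item the
+ * individual metric.
+ */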
+
+static cgroup_metrics_t cpusched_metrics[] = {
+ { .suffix = "shares", .prepare = prepare_ull },
+};
+
+static cgroup_metrics_t cpuacct_metrics[] = {
+ { .suffix = "stat.user", .prepare = prepare_named_ull },
+ { .suffix = "stat.system", .prepare = prepare_named_ull },
+ { .suffix = "usage", .prepare = prepare_ull },
+ { .suffix = "usage_percpu", .prepare = prepare_ull },
+};
+
+static cgroup_metrics_t cpuset_metrics[] = {
+ { .suffix = "cpus", .prepare = prepare_string, .dynamic = 1 },
+ { .suffix = "mems", .prepare = prepare_string, .dynamic = 1 },
+};
+
+static cgroup_metrics_t memory_metrics[] = {
+ { .suffix = "stat.cache", .prepare = prepare_named_ull },
+ { .suffix = "stat.rss", .prepare = prepare_named_ull },
+ { .suffix = "stat.rss_huge", .prepare = prepare_named_ull },
+ { .suffix = "stat.mapped_file", .prepare = prepare_named_ull },
+ { .suffix = "stat.writeback", .prepare = prepare_named_ull },
+ { .suffix = "stat.swap", .prepare = prepare_named_ull },
+ { .suffix = "stat.pgpgin", .prepare = prepare_named_ull },
+ { .suffix = "stat.pgpgout", .prepare = prepare_named_ull },
+ { .suffix = "stat.pgfault", .prepare = prepare_named_ull },
+ { .suffix = "stat.pgmajfault", .prepare = prepare_named_ull },
+ { .suffix = "stat.inactive_anon", .prepare = prepare_named_ull },
+ { .suffix = "stat.active_anon", .prepare = prepare_named_ull },
+ { .suffix = "stat.inactive_file", .prepare = prepare_named_ull },
+ { .suffix = "stat.active_file", .prepare = prepare_named_ull },
+ { .suffix = "stat.unevictable", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_cache", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_rss", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_rss_huge", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_mapped_file", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_writeback", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_swap", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_pgpgin", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_pgpgout", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_pgfault", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_pgmajfault", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_inactive_anon", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_active_anon", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_inactive_file", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_active_file", .prepare = prepare_named_ull },
+ { .suffix = "stat.total_unevictable", .prepare = prepare_named_ull },
+ { .suffix = "stat.recent_rotated_anon", .prepare = prepare_named_ull },
+ { .suffix = "stat.recent_rotated_file", .prepare = prepare_named_ull },
+ { .suffix = "stat.recent_scanned_anon", .prepare = prepare_named_ull },
+ { .suffix = "stat.recent_scanned_file", .prepare = prepare_named_ull },
+};
+
+static cgroup_metrics_t netclass_metrics[] = {
+ { .suffix = "classid", .prepare = prepare_ull },
+};
+
+static cgroup_metrics_t blkio_metrics[] = {
+ { .suffix = "io_merged.read", .prepare = prepare_blocks_ull },
+ { .suffix = "io_merged.write", .prepare = prepare_blocks_ull },
+ { .suffix = "io_merged.sync", .prepare = prepare_blocks_ull },
+ { .suffix = "io_merged.async", .prepare = prepare_blocks_ull },
+ { .suffix = "io_merged.total", .prepare = prepare_blocks_ull },
+ { .suffix = "io_queued.read", .prepare = prepare_blocks_ull },
+ { .suffix = "io_queued.write", .prepare = prepare_blocks_ull },
+ { .suffix = "io_queued.sync", .prepare = prepare_blocks_ull },
+ { .suffix = "io_queued.async", .prepare = prepare_blocks_ull },
+ { .suffix = "io_queued.total", .prepare = prepare_blocks_ull },
+ { .suffix = "io_service_bytes.read", .prepare = prepare_blocks_ull },
+ { .suffix = "io_service_bytes.write", .prepare = prepare_blocks_ull },
+ { .suffix = "io_service_bytes.sync", .prepare = prepare_blocks_ull },
+ { .suffix = "io_service_bytes.async", .prepare = prepare_blocks_ull },
+ { .suffix = "io_service_bytes.total", .prepare = prepare_blocks_ull },
+ { .suffix = "io_serviced.read", .prepare = prepare_blocks_ull },
+ { .suffix = "io_serviced.write", .prepare = prepare_blocks_ull },
+ { .suffix = "io_serviced.sync", .prepare = prepare_blocks_ull },
+ { .suffix = "io_serviced.async", .prepare = prepare_blocks_ull },
+ { .suffix = "io_serviced.total", .prepare = prepare_blocks_ull },
+ { .suffix = "io_service_time.read", .prepare = prepare_blocks_ull },
+ { .suffix = "io_service_time.write", .prepare = prepare_blocks_ull },
+ { .suffix = "io_service_time.sync", .prepare = prepare_blocks_ull },
+ { .suffix = "io_service_time.async", .prepare = prepare_blocks_ull },
+ { .suffix = "io_service_time.total", .prepare = prepare_blocks_ull },
+ { .suffix = "io_wait_time.read", .prepare = prepare_blocks_ull },
+ { .suffix = "io_wait_time.write", .prepare = prepare_blocks_ull },
+ { .suffix = "io_wait_time.sync", .prepare = prepare_blocks_ull },
+ { .suffix = "io_wait_time.async", .prepare = prepare_blocks_ull },
+ { .suffix = "io_wait_time.total", .prepare = prepare_blocks_ull },
+ { .suffix = "sectors", .prepare = prepare_block_ull },
+ { .suffix = "time", .prepare = prepare_block_ull },
+};
+
+static const char *block_stats_names[] =
+ { "read", "write", "sync", "async", "total" };
+#define BLKIOS (sizeof(block_stats_names)/sizeof(block_stats_names[0]))
+
+static cgroup_subsys_t controllers[] = {
+ { .name = "cpu",
+ .cluster = CLUSTER_CPUSCHED_GROUPS,
+ .metrics = cpusched_metrics,
+ .metric_count = sizeof(cpusched_metrics) / sizeof(cgroup_metrics_t),
+ },
+ { .name = "cpuset",
+ .cluster = CLUSTER_CPUSET_GROUPS,
+ .metrics = cpuset_metrics,
+ .metric_count = sizeof(cpuset_metrics) / sizeof(cgroup_metrics_t),
+ },
+ { .name = "cpuacct",
+ .cluster = CLUSTER_CPUACCT_GROUPS,
+ .metrics = cpuacct_metrics,
+ .metric_count = sizeof(cpuacct_metrics) / sizeof(cgroup_metrics_t),
+ },
+ { .name = "memory",
+ .cluster = CLUSTER_MEMORY_GROUPS,
+ .metrics = memory_metrics,
+ .metric_count = sizeof(memory_metrics) / sizeof(cgroup_metrics_t),
+ },
+ { .name = "net_cls",
+ .cluster = CLUSTER_NET_CLS_GROUPS,
+ .metrics = netclass_metrics,
+ .metric_count = sizeof(netclass_metrics) / sizeof(cgroup_metrics_t),
+ },
+ { .name = "blkio",
+ .cluster = CLUSTER_BLKIO_GROUPS,
+ .metrics = blkio_metrics,
+ .metric_count = sizeof(blkio_metrics) / sizeof(cgroup_metrics_t),
+ },
+};
+
+/*
+ * Data structures used by individual cgroup subsystem controllers
+ */
+typedef struct {
+ __uint32_t major;
+ __uint32_t minor;
+ int inst;
+ char *name;
+} device_t;
+
+typedef struct {
+ device_t dev;
+ __uint64_t values[BLKIOS]; /* read, write, sync, async, total */
+} block_stats_t;
+
+typedef struct filesys {
+ int id;
+ char *device;
+ char *path;
+ char *options;
+} filesys_t;
+
+void
+refresh_cgroup_cpus(pmInDom indom)
+{
+ char buf[MAXPATHLEN];
+ char *space;
+ FILE *fp;
+
+ pmdaCacheOp(indom, PMDA_CACHE_INACTIVE);
+ if ((fp = proc_statsfile("/proc/stat", buf, sizeof(buf))) == NULL)
+ return;
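+ /*
+ * /proc/stat lines of interest look like "cpu0 334762 52 ..." -
+ * keep just the leading "cpuN" token as the instance name
+ */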
+ while (fgets(buf, sizeof(buf), fp) != NULL) {
+ if (strncmp(buf, "cpu", 3) == 0 && isdigit((int)buf[3])) {
+ if ((space = strchr(buf, ' ')) != NULL) {
+ *space = '\0';
+ pmdaCacheStore(indom, PMDA_CACHE_ADD, buf, NULL);
+ }
+ }
+ }
+ fclose(fp);
+}
+
+static int
+_pm_isloop(char *dname)
+{
+ return strncmp(dname, "loop", 4) == 0;
+}
+
+static int
+_pm_isramdisk(char *dname)
+{
+ return strncmp(dname, "ram", 3) == 0;
+}
+
+/*
+ * For block devices we have one instance domain for dev_t
+ * based lookup, and another for (real) name lookup.
+ * The reason we need this is that the blkio cgroup stats
+ * are exported using the major:minor numbers, and not the
+ * device names - we must perform that mapping ourselves.
+ * In some places (value refresh) we need to lookup the blk
+ * name from device major/minor, in other places (instances
+ * refresh) we need the usual external instid:name lookup.
+ */
+void
+refresh_cgroup_devices(pmInDom diskindom)
+{
+ pmInDom devtindom = INDOM(DEVT_INDOM);
+ char buf[MAXPATHLEN];
+ static time_t before;
+ time_t now;
+ FILE *fp;
+
+ if ((now = time(NULL)) == before)
+ return;
+ before = now;
+
+ pmdaCacheOp(devtindom, PMDA_CACHE_INACTIVE);
+ pmdaCacheOp(diskindom, PMDA_CACHE_INACTIVE);
+
+ if ((fp = proc_statsfile("/proc/diskstats", buf, sizeof(buf))) == NULL)
+ return;
+
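+ /*
+ * /proc/diskstats lines begin "major minor name ...", e.g.
+ * "   8    0 sda 121292 ..." (counters illustrative) - only the
+ * first fields matter here; loop and ramdisk devices are skipped
+ */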
+ while (fgets(buf, sizeof(buf), fp) != NULL) {
+ unsigned int major, minor, unused;
+ device_t *dev = NULL;
+ char namebuf[1024];
+ int inst;
+
+ if (sscanf(buf, "%u %u %1023s %u", &major, &minor, namebuf, &unused) != 4)
+ continue;
+ if (_pm_isloop(namebuf) || _pm_isramdisk(namebuf))
+ continue;
+ if (pmdaCacheLookupName(diskindom, namebuf, &inst, (void **)&dev) < 0 ||
+ dev == NULL) {
+ if (!(dev = (device_t *)malloc(sizeof(device_t)))) {
+ __pmNoMem("device", sizeof(device_t), PM_RECOV_ERR);
+ continue;
+ }
+ dev->major = major;
+ dev->minor = minor;
+ }
+ /* keeping track of all fields (major/minor/inst/name) */
+ pmdaCacheStore(diskindom, PMDA_CACHE_ADD, namebuf, dev);
+ pmdaCacheLookupName(diskindom, namebuf, &dev->inst, NULL);
+ pmdaCacheLookup(diskindom, dev->inst, &dev->name, NULL);
+
+ snprintf(buf, sizeof(buf), "%u:%u", major, minor);
+ pmdaCacheStore(devtindom, PMDA_CACHE_ADD, buf, (void *)dev);
+
+ if (pmDebug & DBG_TRACE_APPL0)
+ fprintf(stderr, "refresh_devices: \"%s\" \"%d:%d\" inst=%d\n",
+ dev->name, dev->major, dev->minor, dev->inst);
+ }
+ fclose(fp);
+}
+
+void
+refresh_cgroup_subsys(pmInDom indom)
+{
+ char buf[4096];
+ static time_t before;
+ time_t now;
+ FILE *fp;
+
+ if ((now = time(NULL)) == before)
+ return;
+ before = now;
+
+ if ((fp = proc_statsfile("/proc/cgroups", buf, sizeof(buf))) == NULL)
+ return;
+
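+ /*
+ * /proc/cgroups layout: a "#subsys_name hierarchy num_cgroups enabled"
+ * header, then one line per controller, e.g. "cpuset 2 1 1"
+ */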
+ while (fgets(buf, sizeof(buf), fp) != NULL) {
+ unsigned int numcgroups, enabled;
+ char name[MAXPATHLEN];
+ long hierarchy;
+ long *data;
+ int sts;
+
+ /* skip lines starting with hash (header) */
+ if (buf[0] == '#')
+ continue;
+ if (sscanf(buf, "%s %ld %u %u", &name[0],
+ &hierarchy, &numcgroups, &enabled) != 4)
+ continue;
+ sts = pmdaCacheLookupName(indom, name, NULL, (void **)&data);
+ if (sts == PMDA_CACHE_ACTIVE) {
+ if (*data != hierarchy) {
+ /*
+ * odd ... instance name repeated but different
+ * hierarchy ... we cannot support more than one hierarchy
+ * yet
+ */
+ fprintf(stderr, "refresh_cgroup_subsys: \"%s\": entries for hierarchy %ld ignored (hierarchy %ld seen first)\n", name, hierarchy, *data);
+ }
+ continue;
+ }
+ else if (sts != PMDA_CACHE_INACTIVE) {
+ if ((data = (long *)malloc(sizeof(long))) == NULL) {
+#if PCP_DEBUG
+ if (pmDebug & DBG_TRACE_APPL0)
+ fprintf(stderr, "refresh_cgroup_subsys: \"%s\": malloc failed\n", name);
+#endif
+ continue;
+ }
+ *data = hierarchy;
+ }
+ pmdaCacheStore(indom, PMDA_CACHE_ADD, name, (void *)data);
+#if PCP_DEBUG
+ if (pmDebug & DBG_TRACE_APPL0)
+ fprintf(stderr, "refresh_cgroup_subsys: add \"%s\" [hierarchy %ld]\n", name, hierarchy);
+#endif
+ }
+ fclose(fp);
+}
+
+void
+refresh_cgroup_filesys(pmInDom indom)
+{
+ char buf[MAXPATHLEN];
+ filesys_t *fs;
+ FILE *fp;
+ time_t now;
+ static time_t before;
+ char *path, *device, *type, *options;
+ int sts;
+
+ if ((now = time(NULL)) == before)
+ return;
+ before = now;
+
+ pmdaCacheOp(indom, PMDA_CACHE_INACTIVE);
+
+ if ((fp = proc_statsfile("/proc/mounts", buf, sizeof(buf))) == NULL)
+ return;
+
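+ /*
+ * /proc/mounts format: "device path type options freq passno",
+ * e.g. "cgroup /sys/fs/cgroup/cpu cgroup rw,relatime,cpu 0 0" -
+ * only mounts of type "cgroup" are of interest here
+ */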
+ while (fgets(buf, sizeof(buf), fp) != NULL) {
+ device = strtok(buf, " ");
+ path = strtok(NULL, " ");
+ type = strtok(NULL, " ");
+ options = strtok(NULL, " ");
+ if (type == NULL || options == NULL || strcmp(type, "cgroup") != 0)
+ continue;
+
+ sts = pmdaCacheLookupName(indom, path, NULL, (void **)&fs);
+ if (sts == PMDA_CACHE_ACTIVE) /* repeated line in /proc/mounts? */
+ continue;
+ if (sts == PMDA_CACHE_INACTIVE) { /* re-activate an old mount */
+ pmdaCacheStore(indom, PMDA_CACHE_ADD, path, fs);
+ if (strcmp(path, fs->path) != 0) { /* old device, new path */
+ free(fs->path);
+ fs->path = strdup(path);
+ }
+ if (strcmp(options, fs->options) != 0) { /* old device, new opts */
+ free(fs->options);
+ fs->options = strdup(options);
+ }
+ }
+ else { /* new mount */
+ if ((fs = malloc(sizeof(filesys_t))) == NULL)
+ continue;
+ fs->device = strdup(device);
+ fs->path = strdup(path);
+ fs->options = strdup(options);
+ if (pmDebug & DBG_TRACE_APPL0)
+ fprintf(stderr, "refresh_filesys: add \"%s\" \"%s\"\n",
+ fs->path, device);
+ pmdaCacheStore(indom, PMDA_CACHE_ADD, path, fs);
+ }
+ }
+ fclose(fp);
+}
+
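+/*
+ * Check for an exact option in a comma-separated mount option string,
+ * e.g. scan_filesys_options("rw,relatime,cpuacct", "cpuacct") matches,
+ * where a plain substring search would also wrongly match "cpu".
+ * Returns a pointer into a static buffer, not into the caller's string.
+ */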
+static char *
+scan_filesys_options(const char *options, const char *option)
+{
+ static char buffer[128];
+ char *s;
+
+ strncpy(buffer, options, sizeof(buffer));
+ buffer[sizeof(buffer)-1] = '\0';
+
+ s = strtok(buffer, ",");
+ while (s) {
+ if (strcmp(s, option) == 0)
+ return s;
+ s = strtok(NULL, ",");
+ }
+ return NULL;
+}
+
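+/*
+ * Read the contents of <path>/<subsys>.<metric> (e.g. .../cpuacct.usage)
+ * into the caller's buffer, stripping the trailing newline.
+ */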
+static int
+read_values(char *buffer, int size, const char *path, const char *subsys,
+ const char *metric)
+{
+ int fd, count;
+
+ snprintf(buffer, size, "%s/%s.%s", path, subsys, metric);
+ if ((fd = open(buffer, O_RDONLY)) < 0)
+ return -oserror();
+ count = read(fd, buffer, size);
+ close(fd);
+ if (count < 0)
+ return -oserror();
+ if (count == 0) /* empty file - no values to extract */
+ return PM_ERR_VALUE;
+ buffer[count-1] = '\0'; /* overwrite trailing newline */
+ return 0;
+}
+
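+/*
+ * Add one name to the dynamic namespace; e.g. for the memory subsystem
+ * with a group translated to ".foo" (name illustrative), the suffix
+ * "stat.rss" produces cgroup.groups.memory.foo.stat.rss.
+ */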
+static pmID
+update_pmns(__pmnsTree *pmns, cgroup_subsys_t *subsys, const char *name,
+ cgroup_metrics_t *metrics, int group, int domain)
+{
+ char entry[MAXPATHLEN];
+ pmID pmid;
+
+ snprintf(entry, sizeof(entry), "%s.%s%s.%s",
+ CGROUP_ROOT, subsys->name, name, metrics->suffix);
+ pmid = cgroup_pmid_build(domain, subsys->cluster, group, metrics->item);
+ __pmAddPMNSNode(pmns, pmid, entry);
+ return pmid;
+}
+
+static int
+prepare_ull(__pmnsTree *pmns, const char *path, cgroup_subsys_t *subsys,
+ const char *name, int metric, int group, int domain)
+{
+ int count = 0;
+ unsigned long long value;
+ char buffer[MAXPATHLEN];
+ char *endp, *p = &buffer[0];
+ cgroup_group_t *groups = &subsys->groups[group];
+ cgroup_metrics_t *metrics = &subsys->metrics[metric];
+ pmAtomValue *atoms = groups->metric_values[metric].atoms;
+
+ if (read_values(p, sizeof(buffer), path, subsys->name, metrics->suffix) < 0)
+ return -oserror();
+
+ while (p && *p) {
+ value = strtoull(p, &endp, 0);
+ if ((atoms = realloc(atoms, (count + 1) * sizeof(pmAtomValue))) == NULL)
+ return -oserror();
+ atoms[count++].ull = value;
+ if (*endp == '\0' || endp == p)
+ break;
+ p = endp;
+ while (p && isspace((int)*p))
+ p++;
+ }
+
+ groups->metric_values[metric].item = metric;
+ groups->metric_values[metric].atoms = atoms;
+ groups->metric_values[metric].atom_count = count;
+ update_pmns(pmns, subsys, name, metrics, group, domain);
+ return 0;
+}
+
+static int
+prepare_named_ull(__pmnsTree *pmns, const char *path, cgroup_subsys_t *subsys,
+ const char *name, int metric, int group, int domain)
+{
+ int i, count;
+ unsigned long long value;
+ char filename[64], buffer[MAXPATHLEN];
+ char *offset, *p = &buffer[0];
+ cgroup_group_t *groups = &subsys->groups[group];
+ cgroup_metrics_t *metrics = &subsys->metrics[metric];
+
+ /* metric => e.g. stat.user and stat.system - split it up first */
+ offset = index(metrics->suffix, '.');
+ if (!offset)
+ return PM_ERR_CONV;
+ count = (offset - metrics->suffix);
+ strncpy(filename, metrics->suffix, count);
+ filename[count] = '\0';
+
+ if (read_values(p, sizeof(buffer), path, subsys->name, filename) < 0)
+ return -oserror();
+
+ /* buffer contains <name> <value> pairs */
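+ /* e.g. cpuacct.stat holds lines like "user 243531" and "system 97351" */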
+ while (p && *p) {
+ char *endp, *field, *offset;
+
+ if ((field = index(p, ' ')) == NULL)
+ return PM_ERR_CONV;
+ offset = field + 1;
+ *field = '\0';
+ field = p; /* field now points to <name> */
+ p = offset;
+ value = strtoull(p, &endp, 0);
+ p = endp;
+ while (p && isspace((int)*p))
+ p++;
+
+ for (i = 0; i < subsys->metric_count; i++) {
+ pmAtomValue *atoms = groups->metric_values[i].atoms;
+ metrics = &subsys->metrics[i];
+
+ if (strcmp(field, metrics->suffix + count + 1) != 0)
+ continue;
+ if ((atoms = groups->metric_values[i].atoms) == NULL)
+ if ((atoms = calloc(1, sizeof(pmAtomValue))) == NULL)
+ return -oserror();
+ atoms[0].ull = value;
+
+ groups->metric_values[i].item = i;
+ groups->metric_values[i].atoms = atoms;
+ groups->metric_values[i].atom_count = 1;
+ update_pmns(pmns, subsys, name, metrics, group, domain);
+ break;
+ }
+ }
+ return 0;
+}
+
+static int
+prepare_block(__pmnsTree *pmns, const char *path, cgroup_subsys_t *subsys,
+ const char *name, int metric, int group, int domain,
+ block_stats_t *stats, int value_count)
+{
+ pmID pmid;
+ char *iname;
+ char buf[MAXPATHLEN];
+ device_t *dev;
+ pmAtomValue *atoms;
+ int count, size, inst, sts, m, i, j;
+ pmInDom devtindom = INDOM(DEVT_INDOM);
+ cgroup_group_t *groups = &subsys->groups[group];
+ cgroup_metrics_t *metrics = &subsys->metrics[metric];
+
+ /* map major:minor to real device name via diskstats */
+ dev = &stats->dev;
+ snprintf(buf, sizeof(buf), "%u:%u", dev->major, dev->minor);
+
+ sts = pmdaCacheLookupName(devtindom, buf, NULL, (void **)&dev);
+
+ if (pmDebug & DBG_TRACE_APPL0)
+ fprintf(stderr, "prepare_block: preparing %s found=%s\n",
+ buf, sts == PMDA_CACHE_ACTIVE ? "ok" : "no");
+
+ if (sts != PMDA_CACHE_ACTIVE) /* device unknown to diskstats, skip */
+ return 0;
+ iname = dev->name;
+ inst = dev->inst;
+
+ /* batch update metric value(s) now, since we have 'em all */
+ for (j = 0; j < value_count; j++) {
+ m = metric + j;
+ atoms = groups->metric_values[m].atoms;
+ count = groups->metric_values[m].atom_count;
+
+ if (inst >= count) {
+ size = (inst + 1) * sizeof(pmAtomValue);
+ if ((atoms = realloc(atoms, size)) == NULL)
+ return -oserror();
+ for (i = count; i < inst + 1; i++)
+ atoms[i].ull = ULLONG_MAX;
+ count = inst + 1;
+ }
+ /* move on-stack value into global struct, add to PMNS */
+ atoms[inst].ull = stats->values[j];
+ pmid = update_pmns(pmns, subsys, name, metrics + j, group, domain);
+
+ if (pmDebug & DBG_TRACE_APPL0)
+ fprintf(stderr, "prepare_block: prepared "
+ "metric=%s inst=%s[%d] value=%llu\n",
+ pmIDStr(pmid), iname, inst,
+ (unsigned long long)atoms[inst].ull);
+
+ groups->metric_values[m].item = m;
+ groups->metric_values[m].atoms = atoms;
+ groups->metric_values[m].atom_count = count;
+ }
+ return 0;
+}
+
+static int
+prepare_block_ull(__pmnsTree *pmns, const char *path, cgroup_subsys_t *subsys,
+ const char *name, int metric, int group, int domain)
+{
+ char buf[MAXPATHLEN];
+ cgroup_metrics_t *metrics = &subsys->metrics[metric];
+ block_stats_t stats;
+ FILE *fp;
+ char *p;
+
+ if (pmDebug & DBG_TRACE_APPL0)
+ fprintf(stderr, "prepare_block_ull: %s metric=%d group=%d domain=%d\n",
+ path, metric, group, domain);
+
+ snprintf(buf, sizeof(buf), "%s/%s.%s", path, subsys->name, metrics->suffix);
+ if ((fp = fopen(buf, "r")) == NULL)
+ return -oserror();
+
+ memset(&stats, 0, sizeof(stats));
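+ /* single-valued files (sectors, time) hold one "major:minor value"
+ * line per block device, e.g. "8:0 4153" (value illustrative) */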
+ while ((fgets(buf, sizeof(buf), fp)) != NULL) {
+ if (sscanf(buf, "%u:%u ", &stats.dev.major, &stats.dev.minor) != 2)
+ continue;
+ for (p = buf; *p && !isspace((int)*p); p++) { } /* skip device number */
+ for (; *p && isspace((int)*p); p++) { } /* skip over spaces */
+ if (sscanf(p, "%llu", (unsigned long long *)&stats.values[0]) != 1)
+ stats.values[0] = 0;
+ prepare_block(pmns, path, subsys, name,
+ metric, group, domain, &stats, 1);
+ }
+ fclose(fp);
+ return 0;
+}
+
+static int
+prepare_blocks_ull(__pmnsTree *pmns, const char *path, cgroup_subsys_t *subsys,
+ const char *name, int metric, int group, int domain)
+{
+ char buf[MAXPATHLEN];
+ cgroup_metrics_t *metrics = &subsys->metrics[metric];
+ block_stats_t stats;
+ FILE *fp;
+ char *p;
+ int j;
+
+ if (pmDebug & DBG_TRACE_APPL0)
+ fprintf(stderr, "prepare_blocks_ull: %s metric=%d group=%d domain=%d\n",
+ path, metric, group, domain);
+
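+ /*
+ * this routine is registered for all five metrics of each stats
+ * file, but a single pass over the file fills in all five values -
+ * so only do the work for the first (read) metric of each set
+ */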
+ if (metric % BLKIOS != 0)
+ return 0;
+
+ snprintf(buf, sizeof(buf), "%s/%s.%s", path, subsys->name, metrics->suffix);
+ buf[strlen(buf) - sizeof("read")] = '\0';
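+ /* strip the ".read" suffix to get the stats file name - e.g. metric
+ * suffix "io_merged.read" maps to file <path>/blkio.io_merged;
+ * sizeof("read") is 5, which also removes the '.' separator */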
+
+ if (pmDebug & DBG_TRACE_APPL2)
+ fprintf(stderr, "prepare_blocks_ull: opening \"%s\"\n", buf);
+
+ if ((fp = fopen(buf, "r")) == NULL)
+ return -oserror();
+
+ memset(&stats, 0, sizeof(stats));
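+ /* multi-valued files hold "major:minor op value" lines, e.g.
+ * "8:0 Read 140" ... "8:0 Total 520", one set per block device */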
+ while ((fgets(buf, sizeof(buf), fp)) != NULL) {
+ if (sscanf(buf, "%u:%u ", &stats.dev.major, &stats.dev.minor) != 2)
+ continue;
+
+ /* iterate over read/write/sync/async/total (reverse for async) */
+ for (j = BLKIOS-1; j >= 0; j--) {
+ if ((p = strcasestr(buf, block_stats_names[j])) == NULL)
+ continue;
+ p += strlen(block_stats_names[j]) + 1;
+ if (sscanf(p, "%llu", (unsigned long long *)&stats.values[j]) != 1)
+ stats.values[j] = 0;
+ break;
+ }
+
+ if (j == BLKIOS - 1) { /* Total: last one, update incore structures */
+ prepare_block(pmns, path, subsys, name,
+ metric, group, domain, &stats, BLKIOS);
+ /* reset on-stack structure for next outer loop iteration */
+ memset(&stats, 0, sizeof(stats));
+ }
+ }
+ fclose(fp);
+ return 0;
+}
+
+static int
+prepare_string(__pmnsTree *pmns, const char *path, cgroup_subsys_t *subsys,
+ const char *name, int metric, int group, int domain)
+{
+ char buffer[MAXPATHLEN];
+ cgroup_group_t *groups = &subsys->groups[group];
+ cgroup_metrics_t *metrics = &subsys->metrics[metric];
+ pmAtomValue *atoms = groups->metric_values[metric].atoms;
+ char *p = &buffer[0];
+
+ if (read_values(p, sizeof(buffer), path, subsys->name, metrics->suffix) < 0)
+ return -oserror();
+
+ if ((atoms = malloc(sizeof(pmAtomValue))) == NULL)
+ return -oserror();
+ if ((atoms[0].cp = strdup(buffer)) == NULL) {
+ free(atoms);
+ return -oserror();
+ }
+ groups->metric_values[metric].item = metric;
+ groups->metric_values[metric].atoms = atoms;
+ groups->metric_values[metric].atom_count = 1;
+ update_pmns(pmns, subsys, name, metrics, group, domain);
+ return 0;
+}
+
+static void
+translate(char *dest, const char *src, size_t size)
+{
+ char *p;
+
+ strncpy(dest, src, size);
+ dest[size-1] = '\0'; /* strncpy need not null-terminate */
+ /*
+ * Map the cgroupfs path to PMNS components: each '/' becomes '.'.
+ * A non-root group name arrives with a leading '/', so it gains
+ * its leading '.' separator here as well.
+ */
+ for (p = dest; *p; p++) {
+ if (*p == '/')
+ *p = '.';
+ }
+}
+
+static int
+namespace(__pmnsTree *pmns, cgroup_subsys_t *subsys,
+ const char *cgrouppath, const char *cgroupname, int domain)
+{
+ int i, id;
+ size_t size;
+ cgroup_values_t *cvp;
+ char group[128];
+
+ translate(&group[0], cgroupname, sizeof(group));
+
+ /* allocate space for this group */
+ size = (subsys->group_count + 1) * sizeof(cgroup_group_t);
+ subsys->groups = (cgroup_group_t *)realloc(subsys->groups, size);
+ if (subsys->groups == NULL)
+ return -oserror();
+
+ /* allocate space for all values up-front */
+ size = subsys->metric_count;
+ cvp = (cgroup_values_t *)calloc(size, sizeof(cgroup_values_t));
+ if (cvp == NULL)
+ return -oserror();
+
+ id = subsys->group_count++;
+ memset(&subsys->groups[id], 0, sizeof(cgroup_group_t));
+ subsys->groups[id].id = id;
+ subsys->groups[id].metric_values = cvp;
+
+ for (i = 0; i < size; i++) {
+ cgroup_metrics_t *metrics = &subsys->metrics[i];
+ metrics->prepare(pmns, cgrouppath, subsys, group, i, id, domain);
+ }
+ return 1;
+}
+
+char *
+cgroup_find_subsys(pmInDom indom, void *data)
+{
+ static char dunno[] = "?";
+ static char opts[256];
+ char buffer[256];
+ char *s, *out = NULL;
+ filesys_t *fs = (filesys_t *)data;
+
+ memset(opts, 0, sizeof(opts));
+ strncpy(buffer, fs->options, sizeof(buffer));
+ buffer[sizeof(buffer)-1] = '\0';
+
+ s = strtok(buffer, ",");
+ while (s) {
+ if (pmdaCacheLookupName(indom, s, NULL, NULL) == PMDA_CACHE_ACTIVE) {
+ if (out) { /* append option */
+ strcat(out, ",");
+ strcat(out, s);
+ out += strlen(s) + 1; /* +1 => cater for comma */
+ } else { /* first option */
+ strcat(opts, s);
+ out = opts + strlen(s);
+ }
+ }
+ s = strtok(NULL, ",");
+ }
+ if (out)
+ return opts;
+ return dunno;
+}
+
+/* Ensure cgroup name can be used as a PCP namespace entry, ignore it if not */
+static int
+valid_pmns_name(char *name)
+{
+ if (!isalpha((int)name[0]))
+ return 0;
+ for (; *name != '\0'; name++)
+ if (!isalnum((int)*name) && *name != '_')
+ return 0;
+ return 1;
+}
+
+static int
+cgroup_scan(const char *mnt, const char *path, cgroup_subsys_t *subsys,
+ int domain, __pmnsTree *pmns, int root)
+{
+ int sts, length;
+ DIR *dirp;
+ struct stat sbuf;
+ struct dirent *dp;
+ char *cgroupname;
+ char cgrouppath[MAXPATHLEN];
+
+ if (root) {
+ snprintf(cgrouppath, sizeof(cgrouppath), "%s%s", proc_statspath, mnt);
+ length = strlen(cgrouppath);
+ } else {
+ snprintf(cgrouppath, sizeof(cgrouppath), "%s%s/%s", proc_statspath, mnt, path);
+ length = strlen(proc_statspath) + strlen(mnt) + 1;
+ }
+
+ if ((dirp = opendir(cgrouppath)) == NULL)
+ return -oserror();
+
+ cgroupname = &cgrouppath[length];
+ sts = namespace(pmns, subsys, cgrouppath, cgroupname, domain);
+
+ /*
+ * readdir - descend into directories to find all cgroups, then
+ * populate namespace with <controller>[.<groupname>].<metrics>
+ */
+ while ((dp = readdir(dirp)) != NULL) {
+ int lsts;
+ if (!valid_pmns_name(dp->d_name))
+ continue;
+ if (path[0] == '\0')
+ snprintf(cgrouppath, sizeof(cgrouppath), "%s%s/%s",
+ proc_statspath, mnt, dp->d_name);
+ else
+ snprintf(cgrouppath, sizeof(cgrouppath), "%s%s/%s/%s",
+ proc_statspath, mnt, path, dp->d_name);
+ cgroupname = &cgrouppath[length];
+ if (stat(cgrouppath, &sbuf) < 0)
+ continue;
+ if (!(S_ISDIR(sbuf.st_mode)))
+ continue;
+
+ lsts = namespace(pmns, subsys, cgrouppath, cgroupname, domain);
+ if (lsts > 0)
+ sts = 1;
+
+ /*
+ * also scan for any child cgroups; errors from the recursive
+ * cgroup_scan() are ignored - only success updates the status
+ */
+ lsts = cgroup_scan(mnt, cgroupname, subsys, domain, pmns, 0);
+ if (lsts > 0)
+ sts = 1;
+ }
+ closedir(dirp);
+ return sts;
+}
+
+static void
+reset_subsys_stats(cgroup_subsys_t *subsys)
+{
+ int g, k, a;
+
+ for (g = 0; g < subsys->group_count; g++) {
+ cgroup_group_t *group = &subsys->groups[g];
+ for (k = 0; k < subsys->metric_count; k++) {
+ pmAtomValue *atoms = group->metric_values[k].atoms;
+ if (subsys->metrics[k].dynamic)
+ for (a = 0; a < group->metric_values[k].atom_count; a++)
+ free(atoms[a].cp);
+ free(atoms);
+ }
+ free(group->metric_values);
+ if (group->process_list.size)
+ free(group->process_list.pids);
+ memset(group, 0, sizeof(cgroup_group_t));
+ }
+ subsys->group_count = 0;
+}
+
+int
+refresh_cgroups(pmdaExt *pmda, __pmnsTree **pmns)
+{
+ int i, sts, mtab = 0;
+ int domain = pmda->e_domain;
+ filesys_t *fs;
+ time_t now;
+ static time_t before;
+ static __pmnsTree *beforetree;
+ __pmnsTree *tree = pmns ? *pmns : NULL;
+ pmInDom mounts = INDOM(CGROUP_MOUNTS_INDOM);
+ pmInDom devices = INDOM(DISK_INDOM);
+
+ now = time(NULL);
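+ /* rate-limit full cgroup rescans to at most one per second */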
+ if (tree) {
+ if (now == before) {
+ *pmns = beforetree;
+ return 0;
+ }
+ } else if (now == before)
+ return 0;
+
+ refresh_cgroup_filesys(mounts);
+ refresh_cgroup_devices(devices);
+
+ if (tree)
+ __pmFreePMNS(tree);
+
+ if ((sts = __pmNewPMNS(&tree)) < 0) {
+ __pmNotifyErr(LOG_ERR, "%s: failed to create new pmns: %s\n",
+ pmProgname, pmErrStr(sts));
+ return 0;
+ }
+
+ for (i = 0; i < sizeof(controllers)/sizeof(controllers[0]); i++) {
+ cgroup_subsys_t *subsys = &controllers[i];
+
+ /*
+ * Fetch latest state for subsystem and groups of the given clusters,
+ * by walking the cgroup mounts, finding the mounts of this subsystem
+ * type, and descending into all of the groups (subdirs)
+ */
+ reset_subsys_stats(subsys);
+
+ pmdaCacheOp(mounts, PMDA_CACHE_WALK_REWIND);
+ while ((sts = pmdaCacheOp(mounts, PMDA_CACHE_WALK_NEXT)) != -1) {
+ if (!pmdaCacheLookup(mounts, sts, NULL, (void **)&fs))
+ continue;
+ if (scan_filesys_options(fs->options, subsys->name) == NULL)
+ continue;
+ sts = cgroup_scan(fs->path, "", subsys, domain, tree, 1);
+ if (sts > 0)
+ mtab = 1;
+ }
+ }
+
+ if (pmns) {
+ *pmns = tree;
+ beforetree = tree;
+ before = now;
+ } else
+ __pmFreePMNS(tree);
+
+ return mtab;
+}
+
+/*
+ * Shared fetch callback for all cgroups metrics
+ */
+int
+cgroup_group_fetch(pmID pmid, unsigned int inst, pmAtomValue *atom)
+{
+ int i, j, k;
+ int gid, cluster, metric;
+
+ gid = cgroup_pmid_group(pmid);
+ metric = cgroup_pmid_metric(pmid);
+ cluster = proc_pmid_cluster(pmid);
+
+ for (i = 0; i < sizeof(controllers)/sizeof(controllers[0]); i++) {
+ cgroup_subsys_t *subsys = &controllers[i];
+
+ if (subsys->cluster != cluster)
+ continue;
+ for (j = 0; j < subsys->group_count; j++) {
+ cgroup_group_t *group = &subsys->groups[j];
+
+ if (group->id != gid)
+ continue;
+ for (k = 0; k < subsys->metric_count; k++) {
+ cgroup_values_t *cvp = &group->metric_values[k];
+
+ if (cvp->item != metric)
+ continue;
+ else if (cvp->atom_count <= 0)
+ return PM_ERR_VALUE;
+ else if (inst == PM_IN_NULL)
+ inst = 0;
+ else if (inst >= cvp->atom_count)
+ return PM_ERR_INST;
+ else if (cvp->atoms[inst].ull == ULLONG_MAX)
+ return PM_ERR_INST;
+ *atom = cvp->atoms[inst];
+ return 1;
+ }
+ }
+ }
+ return PM_ERR_PMID;
+}
+
+/*
+ * Needs to answer the question: how much extra space must be allocated
+ * in the metric table for (dynamic) cgroup metrics? We have static
+ * entries for group ID zero - if we have any non-zero group IDs, we
+ * need entries to cover those. The counts are returned via the total
+ * (metrics) and trees (additional groups) parameters.
+ */
+static void
+size_metrictable(int *total, int *trees)
+{
+ int i, g, maxgroup = 0, nmetrics = 0;
+
+ for (i = 0; i < sizeof(controllers)/sizeof(controllers[0]); i++) {
+ cgroup_subsys_t *subsys = &controllers[i];
+
+ for (g = 0; g < subsys->group_count; g++) {
+ cgroup_group_t *group = &subsys->groups[g];
+
+ if (group->id > maxgroup)
+ maxgroup = group->id;
+ }
+ nmetrics += subsys->metric_count + 1; /* +1 for task.pid */
+ }
+
+ if (pmDebug & DBG_TRACE_APPL0)
+ fprintf(stderr, "size_metrictable: %d total x %d trees\n",
+ nmetrics, maxgroup);
+
+ *total = nmetrics;
+ *trees = maxgroup;
+}
+
+/*
+ * Create new metric table entry for a group based on an existing one.
+ */
+static void
+refresh_metrictable(pmdaMetric *source, pmdaMetric *dest, int gid)
+{
+ int domain = pmid_domain(source->m_desc.pmid);
+ int cluster = proc_pmid_cluster(source->m_desc.pmid);
+ int item = pmid_item(source->m_desc.pmid);
+
+ memcpy(dest, source, sizeof(pmdaMetric));
+ dest->m_desc.pmid = cgroup_pmid_build(domain, cluster, gid, item);
+
+ if (pmDebug & DBG_TRACE_APPL1)
+ fprintf(stderr, "refresh_metrictable: (%p -> %p)\n", source, dest);
+ if (pmDebug & DBG_TRACE_APPL0)
+ fprintf(stderr, "cgroup metric ID dup: %d.[%d.%d].%d - %d.[%d.%d].%d\n",
+ domain, cluster,
+ cgroup_pmid_group(source->m_desc.pmid),
+ cgroup_pmid_metric(source->m_desc.pmid),
+ pmid_domain(dest->m_desc.pmid),
+ proc_pmid_cluster(dest->m_desc.pmid),
+ cgroup_pmid_group(dest->m_desc.pmid),
+ cgroup_pmid_metric(dest->m_desc.pmid));
+}
+
+static int
+cgroup_text(pmdaExt *pmda, pmID pmid, int type, char **buf)
+{
+ return PM_ERR_TEXT;
+}
+
+static void
+cgroup_metrics_init(pmdaMetric *metrics, int nmetrics)
+{
+ int i, j, item, cluster = 0;
+
+ for (i = 0; i < sizeof(controllers)/sizeof(controllers[0]); i++) {
+ cgroup_subsys_t *subsys = &controllers[i];
+
+ /* set initial default values for controller metrics item field */
+ for (j = 0; j < subsys->metric_count; j++)
+ subsys->metrics[j].item = j;
+
+ /* set initial seed values for dynamic PMIDs in global metric table */
+ for (j = item = 0; j < nmetrics; j++) {
+ if (pmid_cluster(metrics[j].m_desc.pmid) == subsys->cluster) {
+ if (cluster != subsys->cluster) {
+ cluster = subsys->cluster;
+ item = 0;
+ }
+ metrics[j].m_desc.pmid = PMDA_PMID(cluster, item++);
+ }
+ }
+ }
+}
+
+void
+cgroup_init(pmdaMetric *metrics, int nmetrics)
+{
+ static int set[] = {
+ CLUSTER_BLKIO_GROUPS,
+ CLUSTER_CPUSET_GROUPS,
+ CLUSTER_CPUACCT_GROUPS,
+ CLUSTER_CPUSCHED_GROUPS,
+ CLUSTER_MEMORY_GROUPS,
+ CLUSTER_NET_CLS_GROUPS,
+ };
+
+ cgroup_metrics_init(metrics, nmetrics);
+
+ pmdaDynamicPMNS(CGROUP_ROOT,
+ set, sizeof(set) / sizeof(set[0]),
+ refresh_cgroups, cgroup_text,
+ refresh_metrictable, size_metrictable,
+ metrics, nmetrics);
+ pmdaDynamicSetClusterMask(CGROUP_ROOT, CGROUP_MASK);
+}