summaryrefslogtreecommitdiff
path: root/src/pmdas/nvidia
diff options
context:
space:
mode:
Diffstat (limited to 'src/pmdas/nvidia')
-rw-r--r--src/pmdas/nvidia/GNUmakefile50
-rwxr-xr-xsrc/pmdas/nvidia/Install28
-rwxr-xr-xsrc/pmdas/nvidia/README7
-rwxr-xr-xsrc/pmdas/nvidia/Remove38
-rw-r--r--src/pmdas/nvidia/help72
-rw-r--r--src/pmdas/nvidia/localnvml.c270
-rw-r--r--src/pmdas/nvidia/localnvml.h89
-rw-r--r--src/pmdas/nvidia/nvidia.c391
-rw-r--r--src/pmdas/nvidia/pmns30
-rw-r--r--src/pmdas/nvidia/root10
10 files changed, 985 insertions, 0 deletions
diff --git a/src/pmdas/nvidia/GNUmakefile b/src/pmdas/nvidia/GNUmakefile
new file mode 100644
index 0000000..622c726
--- /dev/null
+++ b/src/pmdas/nvidia/GNUmakefile
@@ -0,0 +1,50 @@
+#
+# Copyright (c) 2014 Red Hat.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+
+TOPDIR = ../../..
+include $(TOPDIR)/src/include/builddefs
+# PMDA name, and the domain symbol that nvidia.c passes to pmdaDaemon()
+IAM = nvidia
+DOMAIN = NVML
+# the same sources are built both as a daemon and as a DSO (see "default")
+CMDTARGET = pmdanvidia$(EXECSUFFIX)
+LIBTARGET = pmda_nvidia.$(DSOSUFFIX)
+CFILES = localnvml.c nvidia.c
+HFILES = localnvml.h
+DFILES = README
+LSRCFILES = Install Remove root help pmns $(DFILES)
+LLDLIBS = $(PCP_PMDALIB) $(LIB_FOR_DLOPEN)
+LCFLAGS += -DDSOSUFFIX=\"$(DSOSUFFIX)\"
+# (DSOSUFFIX is compiled in for the dlopen() library name in localnvml.c)
+PMDADIR = $(PCP_PMDAS_DIR)/$(IAM)
+LDIRT = domain.h *.log *.dir *.pag so_locations
+
+default: $(LIBTARGET) $(CMDTARGET)
+
+include $(BUILDRULES)
+
+install: default
+ $(INSTALL) -m 755 -d $(PMDADIR)
+ $(INSTALL) -m 755 Install Remove $(PMDADIR)
+ $(INSTALL) -m 755 $(LIBTARGET) $(CMDTARGET) $(PMDADIR)
+ $(INSTALL) -m 644 $(DFILES) root help pmns domain.h $(PMDADIR)
+# nvidia.c includes domain.h, so it must be generated before compiling
+nvidia.o: domain.h
+
+domain.h: ../../pmns/stdpmid
+ $(DOMAIN_MAKERULE)
+
+default_pcp: default
+
+install_pcp: install
diff --git a/src/pmdas/nvidia/Install b/src/pmdas/nvidia/Install
new file mode 100755
index 0000000..6fd401e
--- /dev/null
+++ b/src/pmdas/nvidia/Install
@@ -0,0 +1,28 @@
+#! /bin/sh
+#
+# Copyright (c) 1997 Silicon Graphics, Inc. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# Install the nvidia PMDA and/or PMNS
+#
+
+. $PCP_DIR/etc/pcp.env
+. $PCP_SHARE_DIR/lib/pmdaproc.sh
+# settings presumably read by the pmdaproc.sh procedures sourced above
+iam=nvidia
+pmda_interface=2
+dso_opt=true
+forced_restart=false
+# run the setup and install procedures, then report success
+pmdaSetup
+pmdaInstall
+exit 0
diff --git a/src/pmdas/nvidia/README b/src/pmdas/nvidia/README
new file mode 100755
index 0000000..114896d
--- /dev/null
+++ b/src/pmdas/nvidia/README
@@ -0,0 +1,7 @@
+Readme
+NVIDIA PMDA
+===========
+
+The NVIDIA PMDA is a PCP module for gathering metrics on the performance of
+NVIDIA graphics cards. It uses the NVIDIA Management Library (NVML) to query
+the states of attached cards.
diff --git a/src/pmdas/nvidia/Remove b/src/pmdas/nvidia/Remove
new file mode 100755
index 0000000..5e28c15
--- /dev/null
+++ b/src/pmdas/nvidia/Remove
@@ -0,0 +1,38 @@
+#! /bin/sh
+#
+# Copyright (c) 1997 Silicon Graphics, Inc. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+#
+# Remove the nvidia PMDA
+#
+
+# Source the PCP configuration environment variables
+. $PCP_DIR/etc/pcp.env
+
+# Get the common procedures and variable assignments
+#
+. $PCP_SHARE_DIR/lib/pmdaproc.sh
+
+# The name of the PMDA (same value as iam= in the Install script)
+#
+iam=nvidia
+
+# Run the setup procedure, then remove the PMDA
+# (pmdaSetup/pmdaRemove presumably come from pmdaproc.sh - confirm)
+pmdaSetup
+pmdaRemove
+
+exit 0
diff --git a/src/pmdas/nvidia/help b/src/pmdas/nvidia/help
new file mode 100644
index 0000000..38f7b4a
--- /dev/null
+++ b/src/pmdas/nvidia/help
@@ -0,0 +1,72 @@
+#
+# Copyright (c) 2014 Red Hat.
+# Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# NVIDIA PMDA help file in the ASCII format
+#
+# lines beginning with a # are ignored
+# lines beginning @ introduce a new entry of the form
+# @ metric_name oneline-text
+# help text goes
+# here over multiple lines
+# ...
+#
+# the metric_name is decoded against the default PMNS -- as a special case,
+# a name of the form NNN.MM (for numeric NNN and MM) is interpreted as an
+# instance domain identification, and the text describes the instance domain
+#
+# blank lines before the @ line are ignored
+#
+
+@ nvidia.numcards Number of Graphics Cards
+The number of NVIDIA Graphics cards installed in this system
+
+@ nvidia.gpuid GPU ID
+Zero indexed id of this NVIDIA card
+
+@ nvidia.cardname GPU Name
+The name of the graphics card
+
+@ nvidia.busid Card Bus ID
+The Bus ID as reported by the NVIDIA tools, not lspci
+
+@ nvidia.temp The temperature of the card
+The temperature of the GPU on the NVIDIA card in degrees Celsius.
+
+@ nvidia.fanspeed Fanspeed
+Speed of the GPU fan as a percentage of the maximum
+
+@ nvidia.perfstate NVIDIA performance state
+The PX performance state as reported from NVML. Value is an integer
+which should range from 0 (maximum performance) to 15 (minimum). If
+the state is unknown the reported value will be 32, however.
+
+@ nvidia.gpuactive Percentage of GPU utilization
+Percentage of time over the past sample period during which one or more
+kernels was executing on the GPU.
+
+@ nvidia.memactive Percentage of time spent accessing memory
+Percent of time over the past sample period during which global (device)
+memory was being read or written. This metric shows if the memory is
+actively being accessed, and is not correlated to storage amount used.
+
+@ nvidia.memused Allocated FB memory
+Amount of GPU FB memory that has currently been allocated, in bytes.
+Note that the driver/GPU always sets aside a small amount of memory
+for bookkeeping.
+
+@ nvidia.memtotal Total FB memory available
+The total amount of GPU FB memory available on the card, in bytes.
+
+@ nvidia.memfree Unallocated FB memory
+Amount of GPU FB memory that is not currently allocated, in bytes.
diff --git a/src/pmdas/nvidia/localnvml.c b/src/pmdas/nvidia/localnvml.c
new file mode 100644
index 0000000..2cadeb9
--- /dev/null
+++ b/src/pmdas/nvidia/localnvml.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2014 Red Hat.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+#include "pmapi.h"
+#include "impl.h"
+#if defined(HAVE_DLFCN_H)
+#include <dlfcn.h>
+#endif
+#include "localnvml.h"
+
+/*
+ * Implements NVML interfaces based on:
+ * http://docs.nvidia.com/deploy/nvml-api/index.html
+ * ... using either a dlopen'd 3rd party or "no values available".
+ */
+
+struct { /* one row per NVML entry point; rows are indexed by the enum below */
+ const char *symbol; /* name handed to dlsym() by resolve_symbols() */
+ void *handle; /* dlsym() result; NULL until resolved or if the symbol is missing */
+} nvml_symtab[] = {
+ { .symbol = "nvmlInit" },
+ { .symbol = "nvmlShutdown" },
+ { .symbol = "nvmlDeviceGetCount" },
+ { .symbol = "nvmlDeviceGetHandleByIndex" },
+ { .symbol = "nvmlDeviceGetName" },
+ { .symbol = "nvmlDeviceGetPciInfo" },
+ { .symbol = "nvmlDeviceGetFanSpeed" },
+ { .symbol = "nvmlDeviceGetTemperature" },
+ { .symbol = "nvmlDeviceGetUtilizationRates" },
+ { .symbol = "nvmlDeviceGetMemoryInfo" },
+ { .symbol = "nvmlDeviceGetPerformanceState" },
+};
+enum { /* indices into nvml_symtab[]; must stay in the same order as its rows */
+ NVML_INIT,
+ NVML_SHUTDOWN,
+ NVML_DEVICE_GET_COUNT,
+ NVML_DEVICE_GET_HANDLEBYINDEX,
+ NVML_DEVICE_GET_NAME,
+ NVML_DEVICE_GET_PCIINFO,
+ NVML_DEVICE_GET_FANSPEED,
+ NVML_DEVICE_GET_TEMPERATURE,
+ NVML_DEVICE_GET_UTILIZATIONRATES,
+ NVML_DEVICE_GET_MEMORYINFO,
+ NVML_DEVICE_GET_PERFORMANCESTATE,
+ NVML_SYMBOL_COUNT /* number of rows; loop bound in resolve_symbols() */
+};
+typedef int (*local_init_t)(void); /* signatures the untyped handles are cast to before calling */
+typedef int (*local_shutdown_t)(void);
+typedef int (*local_dev_get_count_t)(unsigned int *);
+typedef int (*local_dev_get_handlebyindex_t)(unsigned int, nvmlDevice_t *);
+typedef int (*local_dev_get_name_t)(nvmlDevice_t, char *, unsigned int);
+typedef int (*local_dev_get_pciinfo_t)(nvmlDevice_t, nvmlPciInfo_t *);
+typedef int (*local_dev_get_fanspeed_t)(nvmlDevice_t, unsigned int *);
+typedef int (*local_dev_get_temperature_t)(nvmlDevice_t, nvmlTemperatureSensors_t, unsigned int *);
+typedef int (*local_dev_get_utilizationrates_t)(nvmlDevice_t, nvmlUtilization_t *);
+typedef int (*local_dev_get_memoryinfo_t)(nvmlDevice_t, nvmlMemory_t *);
+typedef int (*local_dev_get_performancestate_t)(nvmlDevice_t, nvmlPstates_t *);
+
+static int      /* 0 once the DSO is loaded, else NVML_ERROR_LIBRARY_NOT_FOUND */
+resolve_symbols(void)
+{
+    static void *library;       /* kept across calls so the DSO is loaded only once */
+
+    if (library)
+        return 0;
+    library = dlopen("libnvidia-ml." DSOSUFFIX, RTLD_NOW);
+    if (!library)
+        return NVML_ERROR_LIBRARY_NOT_FOUND;
+    __pmNotifyErr(LOG_INFO, "Successfully loaded NVIDIA NVML library");
+    for (int slot = 0; slot != NVML_SYMBOL_COUNT; ++slot)   /* absent symbols get a NULL handle */
+        nvml_symtab[slot].handle = dlsym(library, nvml_symtab[slot].symbol);
+    return 0;
+}
+
+/* Resolve the NVML symbols if that has not happened yet, then call nvmlInit. */
+int
+localNvmlInit(void)
+{
+    void *entry;
+    int sts;
+
+    if ((sts = resolve_symbols()) != 0)
+        return sts;
+    entry = nvml_symtab[NVML_INIT].handle;
+    if (entry == NULL)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    return ((local_init_t)entry)();
+}
+
+/* Forward to the resolved nvmlShutdown, or report that it is unavailable. */
+int
+localNvmlShutdown(void)
+{
+    void *entry = nvml_symtab[NVML_SHUTDOWN].handle;
+
+    if (entry == NULL)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    /* the table stores untyped pointers; call through the real signature */
+    return ((local_shutdown_t)entry)();
+}
+
+/* Forward to the resolved nvmlDeviceGetCount, or report that it is unavailable. */
+int
+localNvmlDeviceGetCount(unsigned int *count)
+{
+    void *entry = nvml_symtab[NVML_DEVICE_GET_COUNT].handle;
+
+    if (entry == NULL)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    /* the table stores untyped pointers; call through the real signature */
+    return ((local_dev_get_count_t)entry)(count);
+}
+
+/* Forward to the resolved nvmlDeviceGetHandleByIndex, or report that it is unavailable. */
+int
+localNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device)
+{
+    void *entry = nvml_symtab[NVML_DEVICE_GET_HANDLEBYINDEX].handle;
+
+    if (entry == NULL)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    /* the table stores untyped pointers; call through the real signature */
+    return ((local_dev_get_handlebyindex_t)entry)(index, device);
+}
+
+/* Forward to the resolved nvmlDeviceGetName, or report that it is unavailable. */
+int
+localNvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int size)
+{
+    void *entry = nvml_symtab[NVML_DEVICE_GET_NAME].handle;
+
+    if (entry == NULL)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    /* the table stores untyped pointers; call through the real signature */
+    return ((local_dev_get_name_t)entry)(device, name, size);
+}
+
+/* Forward to the resolved nvmlDeviceGetPciInfo, or report that it is unavailable. */
+int
+localNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *info)
+{
+    void *entry = nvml_symtab[NVML_DEVICE_GET_PCIINFO].handle;
+
+    if (entry == NULL)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    /* the table stores untyped pointers; call through the real signature */
+    return ((local_dev_get_pciinfo_t)entry)(device, info);
+}
+
+/* Forward to the resolved nvmlDeviceGetFanSpeed, or report that it is unavailable. */
+int
+localNvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed)
+{
+    void *entry = nvml_symtab[NVML_DEVICE_GET_FANSPEED].handle;
+
+    if (entry == NULL)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    /* the table stores untyped pointers; call through the real signature */
+    return ((local_dev_get_fanspeed_t)entry)(device, speed);
+}
+
+/* Forward to the resolved nvmlDeviceGetTemperature, or report that it is unavailable. */
+int
+localNvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t code, unsigned int *temp)
+{
+    void *entry = nvml_symtab[NVML_DEVICE_GET_TEMPERATURE].handle;
+
+    if (entry == NULL)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    /* the table stores untyped pointers; call through the real signature */
+    return ((local_dev_get_temperature_t)entry)(device, code, temp);
+}
+
+/* Forward to the resolved nvmlDeviceGetUtilizationRates, or report that it is unavailable. */
+int
+localNvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *util)
+{
+    void *entry = nvml_symtab[NVML_DEVICE_GET_UTILIZATIONRATES].handle;
+
+    if (entry == NULL)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    /* the table stores untyped pointers; call through the real signature */
+    return ((local_dev_get_utilizationrates_t)entry)(device, util);
+}
+
+/* Forward to the resolved nvmlDeviceGetMemoryInfo, or report that it is unavailable. */
+int
+localNvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory)
+{
+    void *entry = nvml_symtab[NVML_DEVICE_GET_MEMORYINFO].handle;
+
+    if (entry == NULL)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    /* the table stores untyped pointers; call through the real signature */
+    return ((local_dev_get_memoryinfo_t)entry)(device, memory);
+}
+
+/* Forward to the resolved nvmlDeviceGetPerformanceState, or report that it is unavailable. */
+int
+localNvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *state)
+{
+    void *entry = nvml_symtab[NVML_DEVICE_GET_PERFORMANCESTATE].handle;
+
+    if (entry == NULL)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    /* the table stores untyped pointers; call through the real signature */
+    return ((local_dev_get_performancestate_t)entry)(device, state);
+}
+
+const char *
+localNvmlErrStr(nvmlReturn_t sts)
+{
+    /* status codes paired with their descriptions; searched linearly below */
+    static const struct {
+        int value;
+        const char *text;
+    } messages[] = {
+        { NVML_SUCCESS,
+          "The operation was successful" },
+        { NVML_ERROR_UNINITIALIZED,
+          "NVML was not first initialized with nvmlInit()" },
+        { NVML_ERROR_INVALID_ARGUMENT,
+          "A supplied argument is invalid" },
+        { NVML_ERROR_NOT_SUPPORTED,
+          "The requested operation is not available on target device" },
+        { NVML_ERROR_NO_PERMISSION,
+          "The current user does not have permission for operation" },
+        { NVML_ERROR_ALREADY_INITIALIZED,
+          "Deprecated error code (5)" },
+        { NVML_ERROR_NOT_FOUND,
+          "A query to find an object was unsuccessful" },
+        { NVML_ERROR_INSUFFICIENT_SIZE,
+          "An input argument is not large enough" },
+        { NVML_ERROR_INSUFFICIENT_POWER,
+          "A device's external power cables are not properly attached" },
+        { NVML_ERROR_DRIVER_NOT_LOADED,
+          "NVIDIA driver is not loaded" },
+        { NVML_ERROR_TIMEOUT,
+          "User provided timeout passed" },
+        { NVML_ERROR_IRQ_ISSUE,
+          "NVIDIA Kernel detected an interrupt issue with a GPU" },
+        { NVML_ERROR_LIBRARY_NOT_FOUND,
+          "NVML Shared Library couldn't be found or loaded" },
+        { NVML_ERROR_FUNCTION_NOT_FOUND,
+          "Local version of NVML doesn't implement this function" },
+        { NVML_ERROR_CORRUPTED_INFOROM,
+          "infoROM is corrupted" },
+        { NVML_ERROR_GPU_IS_LOST,
+          "The GPU has fallen off the bus or has otherwise become inaccessible" },
+        { NVML_ERROR_UNKNOWN,
+          "An internal driver error occurred" },
+    };
+    unsigned int slot;
+
+    for (slot = 0; slot < sizeof(messages) / sizeof(messages[0]); slot++)
+        if (messages[slot].value == sts)
+            return messages[slot].text;
+    /* the code is not listed in the table above */
+    return "No such error code";
+}
diff --git a/src/pmdas/nvidia/localnvml.h b/src/pmdas/nvidia/localnvml.h
new file mode 100644
index 0000000..3d108e5
--- /dev/null
+++ b/src/pmdas/nvidia/localnvml.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 Red Hat.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+#ifndef _LOCAL_NVML_H
+#define _LOCAL_NVML_H
+
+/*
+ * NVML interfaces and data structures, based on:
+ * http://docs.nvidia.com/deploy/nvml-api/index.html
+ */
+
+#define NVML_DEVICE_NAME_BUFFER_SIZE 64 /* size of the card name buffer used by the PMDA */
+#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16 /* size of nvmlPciInfo_t.busId */
+
+typedef void *nvmlDevice_t; /* used as an opaque handle */
+typedef int nvmlPstates_t; /* performance state (0-15); the help text says 32 means unknown */
+
+/* Error codes (values presumably mirror NVML's own nvmlReturn_t - verify against nvml.h) */
+typedef enum {
+ NVML_SUCCESS = 0,
+ NVML_ERROR_UNINITIALIZED = 1,
+ NVML_ERROR_INVALID_ARGUMENT = 2,
+ NVML_ERROR_NOT_SUPPORTED = 3,
+ NVML_ERROR_NO_PERMISSION = 4,
+ NVML_ERROR_ALREADY_INITIALIZED = 5,
+ NVML_ERROR_NOT_FOUND = 6,
+ NVML_ERROR_INSUFFICIENT_SIZE = 7,
+ NVML_ERROR_INSUFFICIENT_POWER = 8,
+ NVML_ERROR_DRIVER_NOT_LOADED = 9,
+ NVML_ERROR_TIMEOUT = 10,
+ NVML_ERROR_IRQ_ISSUE = 11,
+ NVML_ERROR_LIBRARY_NOT_FOUND = 12, /* also produced locally when dlopen() fails */
+ NVML_ERROR_FUNCTION_NOT_FOUND = 13, /* also produced locally when a symbol was not resolved */
+ NVML_ERROR_CORRUPTED_INFOROM = 14,
+ NVML_ERROR_GPU_IS_LOST = 15,
+ NVML_ERROR_UNKNOWN = 999
+} nvmlReturn_t;
+
+typedef enum { /* sensor selector passed to localNvmlDeviceGetTemperature() */
+ NVML_TEMPERATURE_GPU = 0, /* the only sensor the PMDA queries */
+ NVML_TEMPERATURE_COUNT
+} nvmlTemperatureSensors_t;
+
+typedef struct { /* filled in by nvmlDeviceGetPciInfo; layout presumably must match NVML's - verify */
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; /* exported as the nvidia.busid metric */
+ unsigned int domain;
+ unsigned int bus;
+ unsigned int device;
+ unsigned int pciDeviceId;
+ unsigned int pciSubSystemId;
+ unsigned int reserved[4];
+} nvmlPciInfo_t;
+
+typedef struct { /* filled in by nvmlDeviceGetUtilizationRates */
+ unsigned int gpu; /* exported as nvidia.gpuactive (a percentage per the help text) */
+ unsigned int memory; /* exported as nvidia.memactive (a percentage per the help text) */
+} nvmlUtilization_t;
+
+typedef struct { /* filled in by nvmlDeviceGetMemoryInfo; exported with byte units */
+ unsigned long long total; /* nvidia.memtotal */
+ unsigned long long free; /* nvidia.memfree */
+ unsigned long long used; /* nvidia.memused */
+} nvmlMemory_t;
+
+extern int localNvmlInit(void); /* loads the NVML DSO on first use, then calls nvmlInit */
+extern int localNvmlShutdown(void);
+extern const char *localNvmlErrStr(nvmlReturn_t); /* unknown codes yield "No such error code" */
+/* Each wrapper below returns NVML_ERROR_FUNCTION_NOT_FOUND if its symbol was not resolved. */
+extern int localNvmlDeviceGetCount(unsigned int *);
+extern int localNvmlDeviceGetHandleByIndex(unsigned int, nvmlDevice_t *);
+extern int localNvmlDeviceGetName(nvmlDevice_t, char *, unsigned int);
+extern int localNvmlDeviceGetPciInfo(nvmlDevice_t, nvmlPciInfo_t *);
+extern int localNvmlDeviceGetFanSpeed(nvmlDevice_t, unsigned int *);
+extern int localNvmlDeviceGetTemperature(nvmlDevice_t, nvmlTemperatureSensors_t, unsigned int *);
+extern int localNvmlDeviceGetUtilizationRates(nvmlDevice_t, nvmlUtilization_t *);
+extern int localNvmlDeviceGetMemoryInfo(nvmlDevice_t, nvmlMemory_t *);
+extern int localNvmlDeviceGetPerformanceState(nvmlDevice_t, nvmlPstates_t *);
+
+#endif /* _LOCAL_NVML_H */
diff --git a/src/pmdas/nvidia/nvidia.c b/src/pmdas/nvidia/nvidia.c
new file mode 100644
index 0000000..849f51f
--- /dev/null
+++ b/src/pmdas/nvidia/nvidia.c
@@ -0,0 +1,391 @@
+/*
+ * Copyright (c) 2014 Red Hat.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+#include "pmapi.h"
+#include "impl.h"
+#include "pmda.h"
+#include "domain.h"
+#include "localnvml.h"
+
+/* InDom table (just one row - corresponding to the set of graphics cards) */
+enum { GCARD_INDOM = 0 }; /* row index into indomtab[] */
+pmdaIndom indomtab[] = {
+ { GCARD_INDOM, 0, NULL }, /* instance count and set are filled in by setup_gcard_indom() */
+};
+
+/* List of metric item numbers - increasing from zero, no holes; same numbers as in the pmns file */
+enum {
+ NVIDIA_NUMCARDS = 0,
+ NVIDIA_CARDID, /* named nvidia.gpuid in the pmns file */
+ NVIDIA_CARDNAME,
+ NVIDIA_BUSID,
+ NVIDIA_TEMP,
+ NVIDIA_FANSPEED,
+ NVIDIA_PERFSTATE,
+ NVIDIA_GPUACTIVE,
+ NVIDIA_MEMACTIVE,
+ NVIDIA_MEMUSED,
+ NVIDIA_MEMTOTAL,
+ NVIDIA_MEMFREE,
+
+ NVIDIA_METRIC_COUNT /* also sizes the per-card failed[] array */
+};
+
+/* Table of metrics exported by this PMDA: all in cluster 0, one entry per item number */
+static pmdaMetric metrictab[] = {
+ { NULL, { PMDA_PMID(0, NVIDIA_NUMCARDS), PM_TYPE_U32, PM_INDOM_NULL,
+ PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } }, /* the only metric without per-card instances */
+ { NULL, { PMDA_PMID(0, NVIDIA_CARDID), PM_TYPE_U32, GCARD_INDOM,
+ PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+ { NULL, { PMDA_PMID(0, NVIDIA_CARDNAME), PM_TYPE_STRING, GCARD_INDOM,
+ PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+ { NULL, { PMDA_PMID(0, NVIDIA_BUSID), PM_TYPE_STRING, GCARD_INDOM,
+ PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+ { NULL, { PMDA_PMID(0, NVIDIA_TEMP), PM_TYPE_U32, GCARD_INDOM,
+ PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+ { NULL, { PMDA_PMID(0, NVIDIA_FANSPEED), PM_TYPE_U32, GCARD_INDOM,
+ PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+ { NULL, { PMDA_PMID(0, NVIDIA_PERFSTATE), PM_TYPE_U32, GCARD_INDOM,
+ PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+ { NULL, { PMDA_PMID(0, NVIDIA_GPUACTIVE), PM_TYPE_U32, GCARD_INDOM,
+ PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+ { NULL, { PMDA_PMID(0, NVIDIA_MEMACTIVE), PM_TYPE_U32, GCARD_INDOM,
+ PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+ { NULL, { PMDA_PMID(0, NVIDIA_MEMUSED), PM_TYPE_U64, GCARD_INDOM,
+ PM_SEM_INSTANT, PMDA_PMUNITS(1, 0, 0, PM_SPACE_BYTE, 0, 0) } }, /* the three mem* sizes are in bytes */
+ { NULL, { PMDA_PMID(0, NVIDIA_MEMTOTAL), PM_TYPE_U64, GCARD_INDOM,
+ PM_SEM_DISCRETE, PMDA_PMUNITS(1, 0, 0, PM_SPACE_BYTE, 0, 0) } }, /* the only discrete metric */
+ { NULL, { PMDA_PMID(0, NVIDIA_MEMFREE), PM_TYPE_U64, GCARD_INDOM,
+ PM_SEM_INSTANT, PMDA_PMUNITS(1, 0, 0, PM_SPACE_BYTE, 0, 0) } },
+};
+
+/* GCARD_INDOM struct, stats that are per card (one element per "gpuN" instance) */
+typedef struct {
+ int cardid; /* index of the card, as used with GetHandleByIndex */
+ int failed[NVIDIA_METRIC_COUNT]; /* non-zero at [item] => last query for that metric failed */
+ char *name; /* strdup'd once by refresh() and then kept */
+ char *busid; /* strdup'd once by refresh() and then kept */
+ int temp;
+ int fanspeed;
+ int perfstate;
+ nvmlUtilization_t active;
+ nvmlMemory_t memory;
+} nvinfo_t;
+
+/* overall struct, holds instance values, indom and instance struct arrays */
+typedef struct {
+ int numcards; /* device count seen by the most recent refresh() */
+ int maxcards; /* number of elements allocated in nvinfo[] */
+ nvinfo_t *nvinfo; /* per-card values, allocated by setup_gcard_indom() */
+ pmdaIndom *nvindom; /* points at indomtab[GCARD_INDOM] */
+} pcp_nvinfo_t;
+
+static pcp_nvinfo_t pcp_nvinfo; /* the single global sample store */
+static char mypath[MAXPATHLEN]; /* help file path, built by initializeHelpPath() */
+static int isDSO = 1; /* cleared by main() when running as a daemon */
+static int nvmlDSO_loaded; /* set once NVML has been initialized */
+
+static int      /* 0 on success, an NVML status or -ENOMEM on failure */
+setup_gcard_indom(void)
+{
+    unsigned int device_count = 0;
+    pmdaIndom *idp = &indomtab[GCARD_INDOM];
+    char gpuname[32], *name;
+    size_t size;
+    int i, sts;
+
+    /* Build the "gpuN" instance domain plus one zeroed nvinfo_t per card. */
+    if ((sts = localNvmlDeviceGetCount(&device_count)) != NVML_SUCCESS) {
+        __pmNotifyErr(LOG_ERR, "nvmlDeviceGetCount: %s", localNvmlErrStr(sts));
+        return sts;
+    }
+    pcp_nvinfo.nvindom = idp;
+    idp->it_numinst = 0;        /* advertise no instances until all are named */
+
+    size = device_count * sizeof(pmdaInstid);
+    if ((idp->it_set = (pmdaInstid *)malloc(size)) == NULL) {
+        __pmNoMem("gcard indom", size, PM_RECOV_ERR);
+        return -ENOMEM;
+    }
+
+    size = device_count * sizeof(nvinfo_t);
+    if ((pcp_nvinfo.nvinfo = (nvinfo_t *)malloc(size)) == NULL) {
+        __pmNoMem("gcard values", size, PM_RECOV_ERR);
+        goto nomem;
+    }
+    memset(pcp_nvinfo.nvinfo, 0, size);
+
+    for (i = 0; i < device_count; i++) {
+        idp->it_set[i].i_inst = i;
+        snprintf(gpuname, sizeof(gpuname), "gpu%d", i);
+        if ((name = strdup(gpuname)) == NULL) {
+            __pmNoMem("gcard instname", strlen(gpuname), PM_RECOV_ERR);
+            while (--i >= 0)    /* free names i-1..0; stops cleanly when i was 0 */
+                free(idp->it_set[i].i_name);
+            goto nomem;
+        }
+        idp->it_set[i].i_name = name;
+    }
+
+    pcp_nvinfo.numcards = 0;
+    pcp_nvinfo.maxcards = device_count;
+    idp->it_numinst = device_count;
+    return 0;
+nomem:  /* release both arrays and leave no dangling pointers behind */
+    free(pcp_nvinfo.nvinfo);
+    pcp_nvinfo.nvinfo = NULL;
+    free(idp->it_set);
+    idp->it_set = NULL;
+    return -ENOMEM;
+}
+
+static int      /* 0 when done or NVML is not loadable yet, else a failure status */
+refresh(pcp_nvinfo_t *pcp_nvinfo)
+{
+    unsigned int device_count, fanspeed, temperature;
+    nvmlDevice_t device;
+    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
+    nvmlPciInfo_t pci;
+    nvmlUtilization_t utilization;
+    nvmlMemory_t memory;
+    nvmlPstates_t pstate;
+    nvinfo_t *card;
+    int i, sts;
+
+    /* Late binding: NVML may only become available after the PMDA started. */
+    if (!nvmlDSO_loaded) {
+        if (localNvmlInit() == NVML_ERROR_LIBRARY_NOT_FOUND)
+            return 0;
+        if ((sts = setup_gcard_indom()) != 0)
+            return sts;         /* no indom yet - try again on the next fetch */
+        nvmlDSO_loaded = 1;
+    }
+    if ((sts = localNvmlDeviceGetCount(&device_count)) != 0) {
+        __pmNotifyErr(LOG_ERR, "nvmlDeviceGetCount: %s", localNvmlErrStr(sts));
+        return sts;
+    }
+    pcp_nvinfo->numcards = device_count;
+
+    for (i = 0; i < device_count && i < pcp_nvinfo->maxcards; i++) {
+        card = &pcp_nvinfo->nvinfo[i];
+        card->cardid = i;
+        if ((sts = localNvmlDeviceGetHandleByIndex(i, &device))) {
+            __pmNotifyErr(LOG_ERR, "nvmlDeviceGetHandleByIndex: %s",
+                          localNvmlErrStr(sts));
+            memset(card->failed, 1, sizeof(card->failed));      /* any non-zero means failed */
+            continue;
+        }
+        memset(card->failed, 0, sizeof(card->failed));  /* sizeof: failed[] holds ints, not bytes */
+        /* Copy a result out only if its query succeeded; the locals are unset otherwise. */
+        if (localNvmlDeviceGetName(device, name, sizeof(name)))
+            card->failed[NVIDIA_CARDNAME] = 1;
+        else if (!card->name && !(card->name = strdup(name)))
+            card->failed[NVIDIA_CARDNAME] = 1;  /* out of memory: no string to export */
+        if (localNvmlDeviceGetPciInfo(device, &pci))
+            card->failed[NVIDIA_BUSID] = 1;
+        else if (!card->busid && !(card->busid = strdup(pci.busId)))
+            card->failed[NVIDIA_BUSID] = 1;     /* out of memory: no string to export */
+        if (localNvmlDeviceGetFanSpeed(device, &fanspeed))
+            card->failed[NVIDIA_FANSPEED] = 1;
+        else
+            card->fanspeed = fanspeed;
+        if (localNvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature))
+            card->failed[NVIDIA_TEMP] = 1;
+        else
+            card->temp = temperature;
+        if (localNvmlDeviceGetUtilizationRates(device, &utilization))
+            card->failed[NVIDIA_GPUACTIVE] = card->failed[NVIDIA_MEMACTIVE] = 1;
+        else
+            card->active = utilization;         /* struct copy */
+        if (localNvmlDeviceGetMemoryInfo(device, &memory))
+            card->failed[NVIDIA_MEMUSED] = card->failed[NVIDIA_MEMTOTAL] =
+                card->failed[NVIDIA_MEMFREE] = 1;
+        else
+            card->memory = memory;              /* struct copy */
+        if (localNvmlDeviceGetPerformanceState(device, &pstate))
+            card->failed[NVIDIA_PERFSTATE] = 1;
+        else
+            card->perfstate = pstate;
+    }
+    return 0;
+}
+
+/*
+ * pmdaFetch wrapper: re-samples every card once for the whole fetch PDU.
+ * pmdaFetch then invokes the fetchCallback for each metric/instance pair
+ * to copy the sampled values into the pmResult (via each pmAtomValue).
+ */
+static int
+nvidia_fetch(int numpmid, pmID pmidlist[], pmResult **resp, pmdaExt *pmda)
+{
+    (void)refresh(&pcp_nvinfo);         /* the refresh status is not propagated */
+    return pmdaFetch(numpmid, pmidlist, resp, pmda);
+}
+
+static int      /* 0 when atom was filled in, else a PM_ERR_* code */
+nvidia_fetchCallBack(pmdaMetric *mdesc, unsigned int inst, pmAtomValue *atom)
+{
+    __pmID_int *idp = (__pmID_int *)&(mdesc->m_desc.pmid);
+
+    if (idp->cluster != 0)
+        return PM_ERR_PMID;
+    if (idp->item != 0 && inst >= indomtab[GCARD_INDOM].it_numinst)
+        return PM_ERR_INST;     /* valid instances are 0 .. it_numinst-1 */
+    /* item 0 (numcards) has no instances; every other item indexes nvinfo[inst] */
+    switch (idp->item) {
+    case NVIDIA_NUMCARDS:
+        atom->ul = pcp_nvinfo.numcards;
+        break;
+    case NVIDIA_CARDID:
+        atom->ul = pcp_nvinfo.nvinfo[inst].cardid;
+        break;
+    case NVIDIA_CARDNAME:
+        if (pcp_nvinfo.nvinfo[inst].failed[NVIDIA_CARDNAME])
+            return PM_ERR_VALUE;
+        atom->cp = pcp_nvinfo.nvinfo[inst].name;
+        break;
+    case NVIDIA_BUSID:
+        if (pcp_nvinfo.nvinfo[inst].failed[NVIDIA_BUSID])
+            return PM_ERR_VALUE;
+        atom->cp = pcp_nvinfo.nvinfo[inst].busid;
+        break;
+    case NVIDIA_TEMP:
+        if (pcp_nvinfo.nvinfo[inst].failed[NVIDIA_TEMP])
+            return PM_ERR_VALUE;
+        atom->ul = pcp_nvinfo.nvinfo[inst].temp;
+        break;
+    case NVIDIA_FANSPEED:
+        if (pcp_nvinfo.nvinfo[inst].failed[NVIDIA_FANSPEED])
+            return PM_ERR_VALUE;
+        atom->ul = pcp_nvinfo.nvinfo[inst].fanspeed;
+        break;
+    case NVIDIA_PERFSTATE:
+        if (pcp_nvinfo.nvinfo[inst].failed[NVIDIA_PERFSTATE])
+            return PM_ERR_VALUE;
+        atom->ul = pcp_nvinfo.nvinfo[inst].perfstate;
+        break;
+    case NVIDIA_GPUACTIVE:
+        if (pcp_nvinfo.nvinfo[inst].failed[NVIDIA_GPUACTIVE])
+            return PM_ERR_VALUE;
+        atom->ul = pcp_nvinfo.nvinfo[inst].active.gpu;
+        break;
+    case NVIDIA_MEMACTIVE:
+        if (pcp_nvinfo.nvinfo[inst].failed[NVIDIA_MEMACTIVE])
+            return PM_ERR_VALUE;
+        atom->ul = pcp_nvinfo.nvinfo[inst].active.memory;
+        break;
+    case NVIDIA_MEMUSED:
+        if (pcp_nvinfo.nvinfo[inst].failed[NVIDIA_MEMUSED])
+            return PM_ERR_VALUE;
+        atom->ull = pcp_nvinfo.nvinfo[inst].memory.used;
+        break;
+    case NVIDIA_MEMTOTAL:
+        if (pcp_nvinfo.nvinfo[inst].failed[NVIDIA_MEMTOTAL])
+            return PM_ERR_VALUE;
+        atom->ull = pcp_nvinfo.nvinfo[inst].memory.total;
+        break;
+    case NVIDIA_MEMFREE:
+        if (pcp_nvinfo.nvinfo[inst].failed[NVIDIA_MEMFREE])
+            return PM_ERR_VALUE;
+        atom->ull = pcp_nvinfo.nvinfo[inst].memory.free;
+        break;
+    default:
+        return PM_ERR_PMID;
+    }
+
+    return 0;
+}
+
+/**
+ * Stores "<PCP_PMDAS_DIR><sep>nvidia<sep>help" in mypath for later use.
+ */
+static void
+initializeHelpPath()
+{
+    const int slash = __pmPathSeparator();
+    snprintf(mypath, sizeof(mypath), "%s%cnvidia%chelp",
+             pmGetConfig("PCP_PMDAS_DIR"), slash, slash);
+}
+
+void
+__PMDA_INIT_CALL
+nvidia_init(pmdaInterface *dp)
+{
+    /* As a DSO nobody has prepared dp yet; main() does it for the daemon. */
+    if (isDSO) {
+        initializeHelpPath();
+        pmdaDSO(dp, PMDA_INTERFACE_2, "nvidia DSO", mypath);
+    }
+
+    if (dp->status != 0)
+        return;
+
+    /* Only flag NVML as ready once the indom exists, so refresh() retries. */
+    if (localNvmlInit() == NVML_SUCCESS && setup_gcard_indom() == 0) {
+        nvmlDSO_loaded = 1;
+    }
+    else {
+        /*
+         * This is OK, just continue on until it *is* installed;
+         * until that time, simply report "no values available".
+         */
+        __pmNotifyErr(LOG_INFO, "NVIDIA NVML library currently unavailable");
+    }
+
+    /* values are re-sampled once per fetch, then handed out per instance */
+    dp->version.any.fetch = nvidia_fetch;
+    pmdaSetFetchCallBack(dp, nvidia_fetchCallBack);
+
+    pmdaInit(dp, indomtab, sizeof(indomtab)/sizeof(indomtab[0]),
+             metrictab, sizeof(metrictab)/sizeof(metrictab[0]));
+}
+
+static pmLongOptions longopts[] = { /* long-option table referenced by opts below */
+ PMDA_OPTIONS_HEADER("Options"),
+ PMOPT_DEBUG,
+ PMDAOPT_DOMAIN,
+ PMDAOPT_LOGFILE,
+ PMOPT_HELP,
+ PMDA_OPTIONS_END
+};
+
+static pmdaOptions opts = { /* handed to pmdaGetOptions() in main() */
+ .short_options = "D:d:l:?",
+ .long_options = longopts,
+};
+
+/* Daemon entry point: the same PMDA, run as a standalone process. */
+int
+main(int argc, char **argv)
+{
+    pmdaInterface dispatch;
+
+    isDSO = 0;          /* makes nvidia_init() skip its pmdaDSO() setup */
+    __pmSetProgname(argv[0]);
+    initializeHelpPath();
+    pmdaDaemon(&dispatch, PMDA_INTERFACE_2, pmProgname, NVML, "nvidia.log", mypath);
+
+    pmdaGetOptions(argc, argv, &opts, &dispatch);
+    if (opts.errors != 0) {
+        pmdaUsageMessage(&opts);
+        exit(1);
+    }
+
+    /* open the log, connect, initialize the PMDA, then enter its main loop */
+    pmdaOpenLog(&dispatch);
+    pmdaConnect(&dispatch);
+    nvidia_init(&dispatch);
+    pmdaMain(&dispatch);
+
+    exit(0);
+}
diff --git a/src/pmdas/nvidia/pmns b/src/pmdas/nvidia/pmns
new file mode 100644
index 0000000..b496cae
--- /dev/null
+++ b/src/pmdas/nvidia/pmns
@@ -0,0 +1,30 @@
+/*
+ * Metrics for nvidia GPU PMDA
+ *
+ * Copyright (c) 2014 Red Hat.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+/* NVML:cluster:item - item numbers follow the metric enum in nvidia.c */
+nvidia {
+ numcards NVML:0:0
+ gpuid NVML:0:1
+ cardname NVML:0:2
+ busid NVML:0:3
+ temp NVML:0:4
+ fanspeed NVML:0:5
+ perfstate NVML:0:6
+ gpuactive NVML:0:7
+ memactive NVML:0:8
+ memused NVML:0:9
+ memtotal NVML:0:10
+ memfree NVML:0:11
+}
diff --git a/src/pmdas/nvidia/root b/src/pmdas/nvidia/root
new file mode 100644
index 0000000..fe12bc2
--- /dev/null
+++ b/src/pmdas/nvidia/root
@@ -0,0 +1,10 @@
+/*
+ * fake "root" for validating the local PMNS subtree
+ */
+
+#include <stdpmid>
+
+root { nvidia }
+
+#include "pmns"
+