10 files changed, 985 insertions, 0 deletions
diff --git a/src/pmdas/nvidia/GNUmakefile b/src/pmdas/nvidia/GNUmakefile
new file mode 100644
index 0000000..622c726
--- /dev/null
+++ b/src/pmdas/nvidia/GNUmakefile
@@ -0,0 +1,50 @@
+#
+# Copyright (c) 2014 Red Hat.
+# 
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+# 
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+# 
+
+TOPDIR = ../../..
+include $(TOPDIR)/src/include/builddefs
+# IAM names the install directory; NVML is the domain symbol used by pmns
+IAM	= nvidia
+DOMAIN	= NVML
+# DSOSUFFIX is also handed to the C code, where localnvml.c uses it for dlopen
+CMDTARGET = pmdanvidia$(EXECSUFFIX)
+LIBTARGET = pmda_nvidia.$(DSOSUFFIX)
+CFILES	= localnvml.c nvidia.c
+HFILES	= localnvml.h
+DFILES	= README
+LSRCFILES = Install Remove root help pmns $(DFILES)
+LLDLIBS	= $(PCP_PMDALIB) $(LIB_FOR_DLOPEN)
+LCFLAGS += -DDSOSUFFIX=\"$(DSOSUFFIX)\"
+# install destination; LDIRT is presumably the clean-up list (see buildrules)
+PMDADIR = $(PCP_PMDAS_DIR)/$(IAM)
+LDIRT	= domain.h *.log *.dir *.pag so_locations
+
+default:	$(LIBTARGET) $(CMDTARGET)
+
+include $(BUILDRULES)
+
+install:	default
+	$(INSTALL) -m 755 -d $(PMDADIR)
+	$(INSTALL) -m 755 Install Remove $(PMDADIR)
+	$(INSTALL) -m 755 $(LIBTARGET) $(CMDTARGET) $(PMDADIR)
+	$(INSTALL) -m 644 $(DFILES) root help pmns domain.h $(PMDADIR)
+# nvidia.c includes the generated domain.h, so it must exist before compiling
+nvidia.o:	domain.h
+
+domain.h: ../../pmns/stdpmid
+	$(DOMAIN_MAKERULE)
+
+default_pcp:	default
+
+install_pcp:	install
diff --git a/src/pmdas/nvidia/Install b/src/pmdas/nvidia/Install
new file mode 100755
index 0000000..6fd401e
--- /dev/null
+++ b/src/pmdas/nvidia/Install
@@ -0,0 +1,28 @@
+#! /bin/sh
+#
+# Copyright (c) 1997 Silicon Graphics, Inc.  All Rights Reserved.
+# 
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+# 
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# Install the nvidia PMDA and/or PMNS
+#
+
+. $PCP_DIR/etc/pcp.env
+. $PCP_SHARE_DIR/lib/pmdaproc.sh
+
+iam=nvidia
+pmda_interface=2
+dso_opt=true
+forced_restart=false
+
+pmdaSetup
+pmdaInstall
+exit 0
diff --git a/src/pmdas/nvidia/README b/src/pmdas/nvidia/README
new file mode 100755
index 0000000..114896d
--- /dev/null
+++ b/src/pmdas/nvidia/README
@@ -0,0 +1,7 @@
+Readme
+NVIDIA PMDA
+===========
+
+The NVIDIA PMDA is a PCP module for gathering metrics on the performance of
+NVIDIA graphics cards. It uses the NVIDIA Management Library (NVML) to query
+the states of attached cards.
diff --git a/src/pmdas/nvidia/Remove b/src/pmdas/nvidia/Remove
new file mode 100755
index 0000000..5e28c15
--- /dev/null
+++ b/src/pmdas/nvidia/Remove
@@ -0,0 +1,38 @@
+#! /bin/sh
+#
+# Copyright (c) 1997 Silicon Graphics, Inc.  All Rights Reserved.
+# 
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+# 
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+#
+# Remove the nvidia PMDA
+#
+
+# source the PCP configuration environment variables
+. $PCP_DIR/etc/pcp.env
+
+# Get the common procedures and variable assignments
+#
+. $PCP_SHARE_DIR/lib/pmdaproc.sh
+
+# The name of the PMDA
+#
+iam=nvidia
+
+# Do it
+#
+pmdaSetup
+pmdaRemove
+
+exit 0
diff --git a/src/pmdas/nvidia/help b/src/pmdas/nvidia/help
new file mode 100644
index 0000000..38f7b4a
--- /dev/null
+++ b/src/pmdas/nvidia/help
@@ -0,0 +1,72 @@
+#
+# Copyright (c) 2014 Red Hat.
+# Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+# 
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+# 
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+# 
+# NVIDIA PMDA help file in the ASCII format
+#
+# lines beginning with a # are ignored
+# lines beginning @ introduce a new entry of the form
+#  @ metric_name oneline-text
+#  help text goes
+#  here over multiple lines
+#  ...
+#
+# the metric_name is decoded against the default PMNS -- as a special case,
+# a name of the form NNN.MM (for numeric NNN and MM) is interpreted as an
+# instance domain identification, and the text describes the instance domain
+#
+# blank lines before the @ line are ignored
+#
+
+@ nvidia.numcards Number of Graphics Cards
+The number of NVIDIA Graphics cards installed in this system
+
+@ nvidia.gpuid GPU ID
+Zero indexed id of this NVIDIA card
+
+@ nvidia.cardname GPU Name
+The name of the graphics card
+
+@ nvidia.busid Card Bus ID
+The Bus ID as reported by the NVIDIA tools, not lspci
+
+@ nvidia.temp The temperature of the card
+The temperature of the GPU on the NVIDIA card in degrees Celsius.
+
+@ nvidia.fanspeed Fanspeed
+Speed of the GPU fan as a percentage of the maximum
+
+@ nvidia.perfstate NVIDIA performance state
+The PX performance state as reported from NVML.  Value is an integer
+which should range from 0 (maximum performance) to 15 (minimum).  If
+the state is unknown the reported value will be 32, however.
+
+@ nvidia.gpuactive Percentage of GPU utilization
+Percentage of time over the past sample period during which one or more
+kernels was executing on the GPU.
+
+@ nvidia.memactive Percentage of time spent accessing memory
+Percent of time over the past sample period during which global (device)
+memory was being read or written.  This metric shows if the memory is
+actively being accessed, and is not correlated to storage amount used.
+
+@ nvidia.memused Allocated FB memory
+Amount of GPU FB memory that has currently been allocated, in bytes.
+Note that the driver/GPU always sets aside a small amount of memory
+for bookkeeping. 
+
+@ nvidia.memtotal Total FB memory available
+The total amount of GPU FB memory available on the card, in bytes.
+
+@ nvidia.memfree Unallocated FB memory
+Amount of GPU FB memory that is not currently allocated, in bytes.
diff --git a/src/pmdas/nvidia/localnvml.c b/src/pmdas/nvidia/localnvml.c
new file mode 100644
index 0000000..2cadeb9
--- /dev/null
+++ b/src/pmdas/nvidia/localnvml.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2014 Red Hat.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+#include "pmapi.h"
+#include "impl.h"
+#if defined(HAVE_DLFCN_H)
+#include <dlfcn.h>
+#endif
+#include "localnvml.h"
+
+/*
+ * Implements NVML interfaces based on:
+ * http://docs.nvidia.com/deploy/nvml-api/index.html
+ * ... using either a dlopen'd 3rd party or "no values available".
+ */
+
+enum {				/* slot of each NVML entry point in nvml_symtab[] */
+    NVML_INIT,
+    NVML_SHUTDOWN,
+    NVML_DEVICE_GET_COUNT,
+    NVML_DEVICE_GET_HANDLEBYINDEX,
+    NVML_DEVICE_GET_NAME,
+    NVML_DEVICE_GET_PCIINFO,
+    NVML_DEVICE_GET_FANSPEED,
+    NVML_DEVICE_GET_TEMPERATURE,
+    NVML_DEVICE_GET_UTILIZATIONRATES,
+    NVML_DEVICE_GET_MEMORYINFO,
+    NVML_DEVICE_GET_PERFORMANCESTATE,
+    NVML_SYMBOL_COUNT
+};
+struct {
+    const char	*symbol;	/* name handed to dlsym() */
+    void	*handle;	/* dlsym() result; NULL while unresolved */
+} nvml_symtab[NVML_SYMBOL_COUNT] = {
+    [NVML_INIT] = { .symbol = "nvmlInit" },
+    [NVML_SHUTDOWN] = { .symbol = "nvmlShutdown" },
+    [NVML_DEVICE_GET_COUNT] = { .symbol = "nvmlDeviceGetCount" },
+    [NVML_DEVICE_GET_HANDLEBYINDEX] = { .symbol = "nvmlDeviceGetHandleByIndex" },
+    [NVML_DEVICE_GET_NAME] = { .symbol = "nvmlDeviceGetName" },
+    [NVML_DEVICE_GET_PCIINFO] = { .symbol = "nvmlDeviceGetPciInfo" },
+    [NVML_DEVICE_GET_FANSPEED] = { .symbol = "nvmlDeviceGetFanSpeed" },
+    [NVML_DEVICE_GET_TEMPERATURE] = { .symbol = "nvmlDeviceGetTemperature" },
+    [NVML_DEVICE_GET_UTILIZATIONRATES] = { .symbol = "nvmlDeviceGetUtilizationRates" },
+    [NVML_DEVICE_GET_MEMORYINFO] = { .symbol = "nvmlDeviceGetMemoryInfo" },
+    [NVML_DEVICE_GET_PERFORMANCESTATE] = { .symbol = "nvmlDeviceGetPerformanceState" },
+};
+typedef int (*local_init_t)(void);
+typedef int (*local_shutdown_t)(void);
+typedef int (*local_dev_get_count_t)(unsigned int *);
+typedef int (*local_dev_get_handlebyindex_t)(unsigned int, nvmlDevice_t *);
+typedef int (*local_dev_get_name_t)(nvmlDevice_t, char *, unsigned int);
+typedef int (*local_dev_get_pciinfo_t)(nvmlDevice_t, nvmlPciInfo_t *);
+typedef int (*local_dev_get_fanspeed_t)(nvmlDevice_t, unsigned int *);
+typedef int (*local_dev_get_temperature_t)(nvmlDevice_t, nvmlTemperatureSensors_t, unsigned int *);
+typedef int (*local_dev_get_utilizationrates_t)(nvmlDevice_t, nvmlUtilization_t *);
+typedef int (*local_dev_get_memoryinfo_t)(nvmlDevice_t, nvmlMemory_t *);
+typedef int (*local_dev_get_performancestate_t)(nvmlDevice_t, nvmlPstates_t *);
+
+/* One-time dlopen of the NVML library plus lookup of every nvml_symtab[] name */
+static int
+resolve_symbols(void)
+{
+    static void *nvml_dso;
+    int sym;
+    if (nvml_dso == NULL) {
+	if (!(nvml_dso = dlopen("libnvidia-ml." DSOSUFFIX, RTLD_NOW)))
+	    return NVML_ERROR_LIBRARY_NOT_FOUND;
+	__pmNotifyErr(LOG_INFO, "Successfully loaded NVIDIA NVML library");
+	for (sym = 0; sym < NVML_SYMBOL_COUNT; sym++)
+	    nvml_symtab[sym].handle = dlsym(nvml_dso, nvml_symtab[sym].symbol);
+    }
+    return 0;
+}
+
+/* Load NVML on first use, then call nvmlInit(); returns an NVML status code */
+int
+localNvmlInit(void)
+{
+    int sts;
+    void *sym;
+
+    if ((sts = resolve_symbols()) != 0)
+	return sts;
+    sym = nvml_symtab[NVML_INIT].handle;
+    if (sym == NULL)
+	return NVML_ERROR_FUNCTION_NOT_FOUND;
+    return ((local_init_t)sym)();
+}
+
+/* Forward to nvmlShutdown() through the pointer found by dlsym();
+ * reports NVML_ERROR_FUNCTION_NOT_FOUND when the symbol is unresolved. */
+int
+localNvmlShutdown(void)
+{
+    void *sym = nvml_symtab[NVML_SHUTDOWN].handle;
+
+    if (sym == NULL)
+	return NVML_ERROR_FUNCTION_NOT_FOUND;
+    return ((local_shutdown_t)sym)();
+}
+
+/* Forward to nvmlDeviceGetCount() through the pointer found by dlsym();
+ * reports NVML_ERROR_FUNCTION_NOT_FOUND when the symbol is unresolved. */
+int
+localNvmlDeviceGetCount(unsigned int *count)
+{
+    void *sym = nvml_symtab[NVML_DEVICE_GET_COUNT].handle;
+
+    if (sym == NULL)
+	return NVML_ERROR_FUNCTION_NOT_FOUND;
+    return ((local_dev_get_count_t)sym)(count);
+}
+
+/* Forward to nvmlDeviceGetHandleByIndex() through the dlsym'd pointer;
+ * reports NVML_ERROR_FUNCTION_NOT_FOUND when the symbol is unresolved. */
+int
+localNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device)
+{
+    void *sym = nvml_symtab[NVML_DEVICE_GET_HANDLEBYINDEX].handle;
+
+    if (sym == NULL)
+	return NVML_ERROR_FUNCTION_NOT_FOUND;
+    return ((local_dev_get_handlebyindex_t)sym)(index, device);
+}
+
+/* Forward to nvmlDeviceGetName() through the pointer found by dlsym();
+ * reports NVML_ERROR_FUNCTION_NOT_FOUND when the symbol is unresolved. */
+int
+localNvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int size)
+{
+    void *sym = nvml_symtab[NVML_DEVICE_GET_NAME].handle;
+
+    if (sym == NULL)
+	return NVML_ERROR_FUNCTION_NOT_FOUND;
+    return ((local_dev_get_name_t)sym)(device, name, size);
+}
+
+/* Forward to nvmlDeviceGetPciInfo() through the pointer found by dlsym();
+ * reports NVML_ERROR_FUNCTION_NOT_FOUND when the symbol is unresolved. */
+int
+localNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *info)
+{
+    void *sym = nvml_symtab[NVML_DEVICE_GET_PCIINFO].handle;
+
+    if (sym == NULL)
+	return NVML_ERROR_FUNCTION_NOT_FOUND;
+    return ((local_dev_get_pciinfo_t)sym)(device, info);
+}
+
+/* Forward to nvmlDeviceGetFanSpeed() through the pointer found by dlsym();
+ * reports NVML_ERROR_FUNCTION_NOT_FOUND when the symbol is unresolved. */
+int
+localNvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed)
+{
+    void *sym = nvml_symtab[NVML_DEVICE_GET_FANSPEED].handle;
+
+    if (sym == NULL)
+	return NVML_ERROR_FUNCTION_NOT_FOUND;
+    return ((local_dev_get_fanspeed_t)sym)(device, speed);
+}
+
+/* Forward to nvmlDeviceGetTemperature() through the dlsym'd pointer;
+ * reports NVML_ERROR_FUNCTION_NOT_FOUND when the symbol is unresolved. */
+int
+localNvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t code, unsigned int *temp)
+{
+    void *sym = nvml_symtab[NVML_DEVICE_GET_TEMPERATURE].handle;
+
+    if (sym == NULL)
+	return NVML_ERROR_FUNCTION_NOT_FOUND;
+    return ((local_dev_get_temperature_t)sym)(device, code, temp);
+}
+
+/* Forward to nvmlDeviceGetUtilizationRates() through the dlsym'd pointer;
+ * reports NVML_ERROR_FUNCTION_NOT_FOUND when the symbol is unresolved. */
+int
+localNvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *util)
+{
+    void *sym = nvml_symtab[NVML_DEVICE_GET_UTILIZATIONRATES].handle;
+
+    if (sym == NULL)
+	return NVML_ERROR_FUNCTION_NOT_FOUND;
+    return ((local_dev_get_utilizationrates_t)sym)(device, util);
+}
+
+/* Forward to nvmlDeviceGetMemoryInfo() through the dlsym'd pointer;
+ * reports NVML_ERROR_FUNCTION_NOT_FOUND when the symbol is unresolved. */
+int
+localNvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory)
+{
+    void *sym = nvml_symtab[NVML_DEVICE_GET_MEMORYINFO].handle;
+
+    if (sym == NULL)
+	return NVML_ERROR_FUNCTION_NOT_FOUND;
+    return ((local_dev_get_memoryinfo_t)sym)(device, memory);
+}
+
+/* Forward to nvmlDeviceGetPerformanceState() through the dlsym'd pointer;
+ * reports NVML_ERROR_FUNCTION_NOT_FOUND when the symbol is unresolved. */
+int
+localNvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *state)
+{
+    void *sym = nvml_symtab[NVML_DEVICE_GET_PERFORMANCESTATE].handle;
+
+    if (sym == NULL)
+	return NVML_ERROR_FUNCTION_NOT_FOUND;
+    return ((local_dev_get_performancestate_t)sym)(device, state);
+}
+
+/* Translate an NVML status code into a static, human readable message;
+ * codes missing from the table yield "No such error code". */
+const char *
+localNvmlErrStr(nvmlReturn_t sts)
+{
+    static const struct {
+	int		code;
+	const char	*msg;
+    } table[] = {
+	{ NVML_SUCCESS,
+	  "The operation was successful" },
+	{ NVML_ERROR_UNINITIALIZED,
+	  "NVML was not first initialized with nvmlInit()" },
+	{ NVML_ERROR_INVALID_ARGUMENT,
+	  "A supplied argument is invalid" },
+	{ NVML_ERROR_NOT_SUPPORTED,
+	  "The requested operation is not available on target device" },
+	{ NVML_ERROR_NO_PERMISSION,
+	  "The current user does not have permission for operation" },
+	{ NVML_ERROR_ALREADY_INITIALIZED,
+	  "Deprecated error code (5)" },
+	{ NVML_ERROR_NOT_FOUND,
+	  "A query to find an object was unsuccessful" },
+	{ NVML_ERROR_INSUFFICIENT_SIZE,
+	  "An input argument is not large enough" },
+	{ NVML_ERROR_INSUFFICIENT_POWER,
+	  "A device's external power cables are not properly attached" },
+	{ NVML_ERROR_DRIVER_NOT_LOADED,
+	  "NVIDIA driver is not loaded" },
+	{ NVML_ERROR_TIMEOUT,
+	  "User provided timeout passed" },
+	{ NVML_ERROR_IRQ_ISSUE,
+	  "NVIDIA Kernel detected an interrupt issue with a GPU" },
+	{ NVML_ERROR_LIBRARY_NOT_FOUND,
+	  "NVML Shared Library couldn't be found or loaded" },
+	{ NVML_ERROR_FUNCTION_NOT_FOUND,
+	  "Local version of NVML doesn't implement this function" },
+	{ NVML_ERROR_CORRUPTED_INFOROM,
+	  "infoROM is corrupted" },
+	{ NVML_ERROR_GPU_IS_LOST,
+	  "The GPU has fallen off the bus or has otherwise become inaccessible" },
+	{ NVML_ERROR_UNKNOWN,
+	  "An internal driver error occurred" },
+    };
+    int i, count = (int)(sizeof(table)/sizeof(table[0]));
+
+    for (i = 0; i < count; i++)
+	if (table[i].code == sts)
+	    return table[i].msg;
+    return "No such error code";
+}
diff --git a/src/pmdas/nvidia/localnvml.h b/src/pmdas/nvidia/localnvml.h
new file mode 100644
index 0000000..3d108e5
--- /dev/null
+++ b/src/pmdas/nvidia/localnvml.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 Red Hat.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+#ifndef _LOCAL_NVML_H
+#define _LOCAL_NVML_H
+
+/*
+ * NVML interfaces and data structures, based on:
+ * http://docs.nvidia.com/deploy/nvml-api/index.html
+ */
+
+#define NVML_DEVICE_NAME_BUFFER_SIZE		64
+#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE	16
+
+typedef void *nvmlDevice_t;	/* used as an opaque handle */
+typedef int nvmlPstates_t;	/* performance state (0-15) */
+
+/* Error codes */
+typedef enum {
+    NVML_SUCCESS			= 0,
+    NVML_ERROR_UNINITIALIZED		= 1,
+    NVML_ERROR_INVALID_ARGUMENT		= 2,
+    NVML_ERROR_NOT_SUPPORTED		= 3,
+    NVML_ERROR_NO_PERMISSION		= 4,
+    NVML_ERROR_ALREADY_INITIALIZED	= 5,
+    NVML_ERROR_NOT_FOUND		= 6,
+    NVML_ERROR_INSUFFICIENT_SIZE	= 7,
+    NVML_ERROR_INSUFFICIENT_POWER	= 8,
+    NVML_ERROR_DRIVER_NOT_LOADED	= 9,
+    NVML_ERROR_TIMEOUT			= 10,
+    NVML_ERROR_IRQ_ISSUE		= 11,
+    NVML_ERROR_LIBRARY_NOT_FOUND	= 12,
+    NVML_ERROR_FUNCTION_NOT_FOUND	= 13,
+    NVML_ERROR_CORRUPTED_INFOROM	= 14,
+    NVML_ERROR_GPU_IS_LOST		= 15,
+    NVML_ERROR_UNKNOWN			= 999
+} nvmlReturn_t;
+
+typedef enum {
+    NVML_TEMPERATURE_GPU		= 0,	/* the sensor nvidia.c samples */
+    NVML_TEMPERATURE_COUNT
+} nvmlTemperatureSensors_t;
+
+typedef struct {
+    char		busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; /* nvidia.busid */
+    unsigned int	domain;
+    unsigned int	bus;
+    unsigned int	device;
+    unsigned int	pciDeviceId;
+    unsigned int	pciSubSystemId;
+    unsigned int	reserved[4];
+} nvmlPciInfo_t;
+
+typedef struct {
+    unsigned int	gpu;		/* nvidia.gpuactive, a percentage */
+    unsigned int	memory;		/* nvidia.memactive, a percentage */
+} nvmlUtilization_t;
+
+typedef struct {
+    unsigned long long	total;		/* nvidia.memtotal, in bytes */
+    unsigned long long	free;		/* nvidia.memfree, in bytes */
+    unsigned long long	used;		/* nvidia.memused, in bytes */
+} nvmlMemory_t;
+
+extern int localNvmlInit(void);
+extern int localNvmlShutdown(void);
+extern const char *localNvmlErrStr(nvmlReturn_t);
+
+extern int localNvmlDeviceGetCount(unsigned int *);
+extern int localNvmlDeviceGetHandleByIndex(unsigned int, nvmlDevice_t *);
+extern int localNvmlDeviceGetName(nvmlDevice_t, char *, unsigned int);
+extern int localNvmlDeviceGetPciInfo(nvmlDevice_t, nvmlPciInfo_t *);
+extern int localNvmlDeviceGetFanSpeed(nvmlDevice_t, unsigned int *);
+extern int localNvmlDeviceGetTemperature(nvmlDevice_t, nvmlTemperatureSensors_t, unsigned int *);
+extern int localNvmlDeviceGetUtilizationRates(nvmlDevice_t, nvmlUtilization_t *);
+extern int localNvmlDeviceGetMemoryInfo(nvmlDevice_t, nvmlMemory_t *);
+extern int localNvmlDeviceGetPerformanceState(nvmlDevice_t, nvmlPstates_t *);
+
+#endif /* _LOCAL_NVML_H */
diff --git a/src/pmdas/nvidia/nvidia.c b/src/pmdas/nvidia/nvidia.c
new file mode 100644
index 0000000..849f51f
--- /dev/null
+++ b/src/pmdas/nvidia/nvidia.c
@@ -0,0 +1,391 @@
+/*
+ * Copyright (c) 2014 Red Hat.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+#include "pmapi.h"
+#include "impl.h"
+#include "pmda.h"
+#include "domain.h"
+#include "localnvml.h"
+
+/* InDom table (just one row - corresponding to the set of graphics cards) */
+enum { GCARD_INDOM = 0 };
+pmdaIndom indomtab[] = {
+    { GCARD_INDOM, 0, NULL },
+};
+
+/* List of metric item numbers - increasing from zero, no holes */
+enum {
+    NVIDIA_NUMCARDS = 0,
+    NVIDIA_CARDID,
+    NVIDIA_CARDNAME,
+    NVIDIA_BUSID,
+    NVIDIA_TEMP,
+    NVIDIA_FANSPEED,
+    NVIDIA_PERFSTATE,
+    NVIDIA_GPUACTIVE,
+    NVIDIA_MEMACTIVE,
+    NVIDIA_MEMUSED,
+    NVIDIA_MEMTOTAL,
+    NVIDIA_MEMFREE,
+
+    NVIDIA_METRIC_COUNT
+};
+
+/* Table of metrics exported by this PMDA */
+static pmdaMetric metrictab[] = {
+    { NULL, { PMDA_PMID(0, NVIDIA_NUMCARDS), PM_TYPE_U32, PM_INDOM_NULL,
+	PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_CARDID), PM_TYPE_U32, GCARD_INDOM,
+	PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_CARDNAME), PM_TYPE_STRING, GCARD_INDOM,
+	PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_BUSID), PM_TYPE_STRING, GCARD_INDOM,
+	PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_TEMP), PM_TYPE_U32, GCARD_INDOM,
+	PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_FANSPEED), PM_TYPE_U32, GCARD_INDOM,
+	PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_PERFSTATE), PM_TYPE_U32, GCARD_INDOM,
+	PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_GPUACTIVE), PM_TYPE_U32, GCARD_INDOM,
+	PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_MEMACTIVE), PM_TYPE_U32, GCARD_INDOM,
+        PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_MEMUSED), PM_TYPE_U64, GCARD_INDOM,
+	PM_SEM_INSTANT, PMDA_PMUNITS(1, 0, 0, PM_SPACE_BYTE, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_MEMTOTAL), PM_TYPE_U64, GCARD_INDOM,
+	PM_SEM_DISCRETE, PMDA_PMUNITS(1, 0, 0, PM_SPACE_BYTE, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_MEMFREE), PM_TYPE_U64, GCARD_INDOM,
+	PM_SEM_INSTANT, PMDA_PMUNITS(1, 0, 0, PM_SPACE_BYTE, 0, 0) } },
+};
+
+/* GCARD_INDOM struct, stats that are per card */
+typedef struct {
+    int			cardid;		/* index passed to GetHandleByIndex */
+    int			failed[NVIDIA_METRIC_COUNT]; /* by item: last query failed */
+    char		*name;		/* strdup'd by refresh() */
+    char		*busid;		/* strdup'd by refresh() */
+    int			temp;
+    int			fanspeed;
+    int			perfstate;
+    nvmlUtilization_t	active;
+    nvmlMemory_t	memory;
+} nvinfo_t;
+
+/* overall struct, holds instance values, indom and instance struct arrays */
+typedef struct {
+    int			numcards;	/* device count seen by the last refresh() */
+    int			 maxcards;	/* entries allocated in nvinfo[] */
+    nvinfo_t		*nvinfo;	/* per-card values, indexed by instance */
+    pmdaIndom		*nvindom;	/* points at indomtab[GCARD_INDOM] */
+} pcp_nvinfo_t;
+
+static pcp_nvinfo_t	pcp_nvinfo;
+static char		mypath[MAXPATHLEN];
+static int		isDSO = 1;
+static int		nvmlDSO_loaded;
+
+/*
+ * Build the card indom ("gpu0", "gpu1", ...) and zeroed per-card values from
+ * the NVML device count.  Returns 0, an NVML status, or -ENOMEM (all freed).
+ */
+static int
+setup_gcard_indom(void)
+{
+    unsigned int	device_count = 0;
+    pmdaIndom		*idp = &indomtab[GCARD_INDOM];
+    char		gpuname[32], *name;
+    size_t		size;
+    int			i, sts;
+
+    if ((sts = localNvmlDeviceGetCount(&device_count)) != NVML_SUCCESS) {
+	__pmNotifyErr(LOG_ERR, "nvmlDeviceGetCount: %s",
+			localNvmlErrStr(sts));
+	return sts;
+    }
+
+    pcp_nvinfo.nvindom = idp;
+    idp->it_numinst = 0;
+    size = device_count * sizeof(pmdaInstid);
+    if ((idp->it_set = (pmdaInstid *)malloc(size)) == NULL) {
+	__pmNoMem("gcard indom", size, PM_RECOV_ERR);
+	return -ENOMEM;
+    }
+    size = device_count * sizeof(nvinfo_t);
+    if ((pcp_nvinfo.nvinfo = (nvinfo_t *)calloc(1, size)) == NULL) {
+	__pmNoMem("gcard values", size, PM_RECOV_ERR);
+	free(idp->it_set);
+	return -ENOMEM;
+    }
+
+    for (i = 0; i < device_count; i++) {
+	idp->it_set[i].i_inst = i;
+	snprintf(gpuname, sizeof(gpuname), "gpu%d", i);
+	if ((name = strdup(gpuname)) == NULL) {
+	    __pmNoMem("gcard instname", strlen(gpuname), PM_RECOV_ERR);
+	    /* release names 0 .. i-1; slot i itself was never assigned */
+	    while (--i >= 0)
+		free(idp->it_set[i].i_name);
+	    free(idp->it_set);
+	    free(pcp_nvinfo.nvinfo);
+	    return -ENOMEM;
+	}
+	idp->it_set[i].i_name = name;
+    }
+
+    pcp_nvinfo.numcards = 0;
+    pcp_nvinfo.maxcards = device_count;
+    idp->it_numinst = device_count;
+    return 0;
+}
+
+/*
+ * Sample all cards via NVML into pcp_nvinfo for the fetch callback; failed[]
+ * records per-metric query failures for each card.  Returns 0, or the status
+ * of a failed indom setup or device count call.
+ */
+static int
+refresh(pcp_nvinfo_t *pcp_nvinfo)
+{
+    unsigned int	device_count, fanspeed, temperature;
+    nvmlDevice_t	device;
+    char		name[NVML_DEVICE_NAME_BUFFER_SIZE];
+    nvmlPciInfo_t	pci;
+    nvmlPstates_t	pstate;
+    nvinfo_t		*card;
+    int			i, sts;
+
+    if (!nvmlDSO_loaded) {	/* NVML may get installed after PMDA startup */
+	if (localNvmlInit() != NVML_SUCCESS)
+	    return 0;
+	if ((sts = setup_gcard_indom()) != 0)
+	    return sts;
+	nvmlDSO_loaded = 1;
+    }
+
+    if ((sts = localNvmlDeviceGetCount(&device_count)) != 0) {
+	__pmNotifyErr(LOG_ERR, "nvmlDeviceGetCount: %s",
+			localNvmlErrStr(sts));
+	return sts;
+    }
+    pcp_nvinfo->numcards = device_count;
+
+    for (i = 0; i < device_count && i < pcp_nvinfo->maxcards; i++) {
+	card = &pcp_nvinfo->nvinfo[i];
+	card->cardid = i;
+	if ((sts = localNvmlDeviceGetHandleByIndex(i, &device))) {
+	    __pmNotifyErr(LOG_ERR, "nvmlDeviceGetHandleByIndex: %s",
+			localNvmlErrStr(sts));
+	    memset(card->failed, 1, sizeof(card->failed));
+	    continue;
+	}
+	memset(card->failed, 0, sizeof(card->failed));
+	/* a value is cached only when its own query succeeded */
+	if (localNvmlDeviceGetName(device, name, sizeof(name)) ||
+	    (card->name == NULL && (card->name = strdup(name)) == NULL))
+	    card->failed[NVIDIA_CARDNAME] = 1;
+	if (localNvmlDeviceGetPciInfo(device, &pci) ||
+	    (card->busid == NULL && (card->busid = strdup(pci.busId)) == NULL))
+	    card->failed[NVIDIA_BUSID] = 1;
+	if (localNvmlDeviceGetFanSpeed(device, &fanspeed))
+	    card->failed[NVIDIA_FANSPEED] = 1;
+	else
+	    card->fanspeed = fanspeed;
+	if (localNvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature))
+	    card->failed[NVIDIA_TEMP] = 1;
+	else
+	    card->temp = temperature;
+	if (localNvmlDeviceGetPerformanceState(device, &pstate))
+	    card->failed[NVIDIA_PERFSTATE] = 1;
+	else
+	    card->perfstate = pstate;
+	/* struct results are written straight into the cache, guarded by failed[] */
+	if (localNvmlDeviceGetUtilizationRates(device, &card->active))
+	    card->failed[NVIDIA_GPUACTIVE] = card->failed[NVIDIA_MEMACTIVE] = 1;
+	if (localNvmlDeviceGetMemoryInfo(device, &card->memory))
+	    card->failed[NVIDIA_MEMUSED] = card->failed[NVIDIA_MEMTOTAL] =
+		card->failed[NVIDIA_MEMFREE] = 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Wrapper for pmdaFetch which refreshes the set of values once per fetch
+ * PDU (the refresh status is ignored).  The fetch callback is then called
+ * once per metric/instance pair to fill in the pmResult via a pmAtomValue.
+ */
+static int
+nvidia_fetch(int numpmid, pmID pmidlist[], pmResult **resp, pmdaExt *pmda)
+{
+    refresh(&pcp_nvinfo);
+    return pmdaFetch(numpmid, pmidlist, resp, pmda);
+}
+
+/*
+ * Fetch callback: supply the value cached by refresh() for one
+ * metric/instance pair.  Returns 0 with *atom filled in, PM_ERR_PMID for
+ * an unknown metric, PM_ERR_INST for an instance outside the card indom,
+ * or PM_ERR_VALUE when the most recent NVML query for the value failed.
+ */
+static int
+nvidia_fetchCallBack(pmdaMetric *mdesc, unsigned int inst, pmAtomValue *atom)
+{
+    __pmID_int	*idp = (__pmID_int *)&(mdesc->m_desc.pmid);
+    nvinfo_t	*card = NULL;
+
+    if (idp->cluster != 0)
+	return PM_ERR_PMID;
+    if (idp->item != NVIDIA_NUMCARDS) {
+	/*
+	 * Instances run from 0 to it_numinst-1, so inst == it_numinst is
+	 * already out of bounds; nvinfo stays NULL until NVML has loaded.
+	 */
+	if (pcp_nvinfo.nvinfo == NULL ||
+	    inst >= indomtab[GCARD_INDOM].it_numinst)
+	    return PM_ERR_INST;
+	card = &pcp_nvinfo.nvinfo[inst];
+	/* the card id is known without any per-metric NVML query */
+	if (idp->item != NVIDIA_CARDID && idp->item < NVIDIA_METRIC_COUNT &&
+	    card->failed[idp->item])
+	    return PM_ERR_VALUE;
+    }
+
+    /* everything below just copies the cached value into the atom */
+    switch (idp->item) {
+        case NVIDIA_NUMCARDS:
+            atom->ul = pcp_nvinfo.numcards;
+            break;
+        case NVIDIA_CARDID:
+            atom->ul = card->cardid;
+            break;
+        case NVIDIA_CARDNAME:
+            atom->cp = card->name;
+            break;
+        case NVIDIA_BUSID:
+            atom->cp = card->busid;
+            break;
+        case NVIDIA_TEMP:
+            atom->ul = card->temp;
+            break;
+        case NVIDIA_FANSPEED:
+            atom->ul = card->fanspeed;
+            break;
+        case NVIDIA_PERFSTATE:
+            atom->ul = card->perfstate;
+            break;
+        case NVIDIA_GPUACTIVE:
+            atom->ul = card->active.gpu;
+            break;
+        case NVIDIA_MEMACTIVE:
+            atom->ul = card->active.memory;
+            break;
+        case NVIDIA_MEMUSED:
+            atom->ull = card->memory.used;
+            break;
+        case NVIDIA_MEMTOTAL:
+            atom->ull = card->memory.total;
+            break;
+        case NVIDIA_MEMFREE:
+            atom->ull = card->memory.free;
+            break;
+        default:
+            return PM_ERR_PMID;
+    }
+
+    return 0;
+}
+
+/*
+ * Fill mypath with "<PCP_PMDAS_DIR><sep>nvidia<sep>help", the help file path.
+ */
+static void
+initializeHelpPath()
+{
+    const int c = __pmPathSeparator();
+    const char *pmdas_dir = pmGetConfig("PCP_PMDAS_DIR");
+    snprintf(mypath, sizeof(mypath), "%s%cnvidia%chelp", pmdas_dir, c, c);
+}
+
+/*
+ * PMDA initialisation, shared by the DSO and daemon (main) entry paths.
+ * A missing NVML library is not an error: refresh() tries again later.
+ */
+void
+__PMDA_INIT_CALL
+nvidia_init(pmdaInterface *dp)
+{
+    if (isDSO) {
+	initializeHelpPath();
+	pmdaDSO(dp, PMDA_INTERFACE_2, "nvidia DSO", mypath);
+    }
+
+    if (dp->status != 0)
+	return;
+
+    if (localNvmlInit() != NVML_SUCCESS) {
+	/*
+	 * This is OK, just continue on until it *is* installed;
+	 * until that time, simply report "no values available".
+	 */
+	__pmNotifyErr(LOG_INFO, "NVIDIA NVML library currently unavailable");
+    }
+    else if (setup_gcard_indom() == 0)	/* else leave the retry to refresh() */
+	nvmlDSO_loaded = 1;
+
+    dp->version.any.fetch = nvidia_fetch;
+    pmdaSetFetchCallBack(dp, nvidia_fetchCallBack);
+
+    pmdaInit(dp, indomtab, sizeof(indomtab)/sizeof(indomtab[0]),
+	     metrictab, sizeof(metrictab)/sizeof(metrictab[0]));
+}
+
+static pmLongOptions longopts[] = {
+    PMDA_OPTIONS_HEADER("Options"),
+    PMOPT_DEBUG,
+    PMDAOPT_DOMAIN,
+    PMDAOPT_LOGFILE,
+    PMOPT_HELP,
+    PMDA_OPTIONS_END
+};
+
+static pmdaOptions opts = {
+    .short_options = "D:d:l:?",
+    .long_options = longopts,
+};
+
+int
+main(int argc, char **argv)		/* entry point when run as a daemon PMDA */
+{
+    pmdaInterface	desc;
+
+    isDSO = 0;		/* stops nvidia_init() from calling pmdaDSO() */
+    __pmSetProgname(argv[0]);
+
+    initializeHelpPath();
+    pmdaDaemon(&desc, PMDA_INTERFACE_2, pmProgname, NVML,	/* NVML: presumably from domain.h */
+		"nvidia.log", mypath);
+
+    pmdaGetOptions(argc, argv, &opts, &desc);
+    if (opts.errors) {		/* bad command line: print usage and give up */
+	pmdaUsageMessage(&opts);
+	exit(1);
+    }
+
+    pmdaOpenLog(&desc);
+    pmdaConnect(&desc);
+    nvidia_init(&desc);		/* same initialisation as the DSO path */
+    pmdaMain(&desc);
+
+    exit(0);
+}
diff --git a/src/pmdas/nvidia/pmns b/src/pmdas/nvidia/pmns
new file mode 100644
index 0000000..b496cae
--- /dev/null
+++ b/src/pmdas/nvidia/pmns
@@ -0,0 +1,30 @@
+/*
+ * Metrics for nvidia GPU PMDA
+ *
+ * Copyright (c) 2014 Red Hat.
+ * 
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+nvidia {
+    numcards			NVML:0:0
+    gpuid			NVML:0:1
+    cardname			NVML:0:2
+    busid			NVML:0:3
+    temp			NVML:0:4
+    fanspeed			NVML:0:5
+    perfstate			NVML:0:6
+    gpuactive			NVML:0:7
+    memactive			NVML:0:8
+    memused			NVML:0:9
+    memtotal			NVML:0:10
+    memfree			NVML:0:11
+}
diff --git a/src/pmdas/nvidia/root b/src/pmdas/nvidia/root
new file mode 100644
index 0000000..fe12bc2
--- /dev/null
+++ b/src/pmdas/nvidia/root
@@ -0,0 +1,10 @@
+/*
+ * fake "root" for validating the local PMNS subtree
+ */
+
+#include <stdpmid>
+
+root { nvidia }
+
+#include "pmns"
+