diff options
Diffstat (limited to 'src/pmdas/nvidia')
-rw-r--r-- | src/pmdas/nvidia/GNUmakefile | 50 | ||||
-rwxr-xr-x | src/pmdas/nvidia/Install | 28 | ||||
-rwxr-xr-x | src/pmdas/nvidia/README | 7 | ||||
-rwxr-xr-x | src/pmdas/nvidia/Remove | 38 | ||||
-rw-r--r-- | src/pmdas/nvidia/help | 72 | ||||
-rw-r--r-- | src/pmdas/nvidia/localnvml.c | 270 | ||||
-rw-r--r-- | src/pmdas/nvidia/localnvml.h | 89 | ||||
-rw-r--r-- | src/pmdas/nvidia/nvidia.c | 391 | ||||
-rw-r--r-- | src/pmdas/nvidia/pmns | 30 | ||||
-rw-r--r-- | src/pmdas/nvidia/root | 10 |
10 files changed, 985 insertions, 0 deletions
diff --git a/src/pmdas/nvidia/GNUmakefile b/src/pmdas/nvidia/GNUmakefile new file mode 100644 index 0000000..622c726 --- /dev/null +++ b/src/pmdas/nvidia/GNUmakefile @@ -0,0 +1,50 @@ +# +# Copyright (c) 2014 Red Hat. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# + +TOPDIR = ../../.. +include $(TOPDIR)/src/include/builddefs + +IAM = nvidia +DOMAIN = NVML + +CMDTARGET = pmdanvidia$(EXECSUFFIX) +LIBTARGET = pmda_nvidia.$(DSOSUFFIX) +CFILES = localnvml.c nvidia.c +HFILES = localnvml.h +DFILES = README +LSRCFILES = Install Remove root help pmns $(DFILES) +LLDLIBS = $(PCP_PMDALIB) $(LIB_FOR_DLOPEN) +LCFLAGS += -DDSOSUFFIX=\"$(DSOSUFFIX)\" + +PMDADIR = $(PCP_PMDAS_DIR)/$(IAM) +LDIRT = domain.h *.log *.dir *.pag so_locations + +default: $(LIBTARGET) $(CMDTARGET) + +include $(BUILDRULES) + +install: default + $(INSTALL) -m 755 -d $(PMDADIR) + $(INSTALL) -m 755 Install Remove $(PMDADIR) + $(INSTALL) -m 755 $(LIBTARGET) $(CMDTARGET) $(PMDADIR) + $(INSTALL) -m 644 $(DFILES) root help pmns domain.h $(PMDADIR) + +nvidia.o: domain.h + +domain.h: ../../pmns/stdpmid + $(DOMAIN_MAKERULE) + +default_pcp: default + +install_pcp: install diff --git a/src/pmdas/nvidia/Install b/src/pmdas/nvidia/Install new file mode 100755 index 0000000..6fd401e --- /dev/null +++ b/src/pmdas/nvidia/Install @@ -0,0 +1,28 @@ +#! /bin/sh +# +# Copyright (c) 1997 Silicon Graphics, Inc. All Rights Reserved. 
+# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# Install the nvidia PMDA and/or PMNS +# + +. $PCP_DIR/etc/pcp.env +. $PCP_SHARE_DIR/lib/pmdaproc.sh + +iam=nvidia +pmda_interface=2 +dso_opt=true +forced_restart=false + +pmdaSetup +pmdaInstall +exit 0 diff --git a/src/pmdas/nvidia/README b/src/pmdas/nvidia/README new file mode 100755 index 0000000..114896d --- /dev/null +++ b/src/pmdas/nvidia/README @@ -0,0 +1,7 @@ +Readme +NVIDIA PMDA +=========== + +The NVIDIA PMDA is a PCP module for gathering metrics on the performance of +NVIDIA graphics cards. It uses the NVIDIA Management Library (NVML) to query +the states of attached cards. diff --git a/src/pmdas/nvidia/Remove b/src/pmdas/nvidia/Remove new file mode 100755 index 0000000..5e28c15 --- /dev/null +++ b/src/pmdas/nvidia/Remove @@ -0,0 +1,38 @@ +#! /bin/sh +# +# Copyright (c) 1997 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. 
+# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# +# Remove the nvidia PMDA +# + +# source the PCP configuration environment variables +. $PCP_DIR/etc/pcp.env + +# Get the common procedures and variable assignments +# +. $PCP_SHARE_DIR/lib/pmdaproc.sh + +# The name of the PMDA +# +iam=nvidia + +# Do it +# +pmdaSetup +pmdaRemove + +exit 0 diff --git a/src/pmdas/nvidia/help b/src/pmdas/nvidia/help new file mode 100644 index 0000000..38f7b4a --- /dev/null +++ b/src/pmdas/nvidia/help @@ -0,0 +1,72 @@ +# +# Copyright (c) 2014 Red Hat. +# Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# NVIDIA PMDA help file in the ASCII format +# +# lines beginning with a # are ignored +# lines beginning @ introduce a new entry of the form +# @ metric_name oneline-text +# help text goes +# here over multiple lines +# ... 
+# +# the metric_name is decoded against the default PMNS -- as a special case, +# a name of the form NNN.MM (for numeric NNN and MM) is interpreted as an +# instance domain identification, and the text describes the instance domain +# +# blank lines before the @ line are ignored +# + +@ nvidia.numcards Number of Graphics Cards +The number of NVIDIA Graphics cards installed in this system + +@ nvidia.gpuid GPU ID +Zero indexed id of this NVIDIA card + +@ nvidia.cardname GPU Name +The name of the graphics card + +@ nvidia.busid Card Bus ID +The Bus ID as reported by the NVIDIA tools, not lspci + +@ nvidia.temp The temperature of the card +The Temperature of the GPU on the NVIDIA card in degrees Celsius. + +@ nvidia.fanspeed Fanspeed +Speed of the GPU fan as a percentage of the maximum + +@ nvidia.perfstate NVIDIA performance state +The PX performance state as reported from NVML. Value is an integer +which should range from 0 (maximum performance) to 15 (minimum). If +the state is unknown the reported value will be 32, however. + +@ nvidia.gpuactive Percentage of GPU utilization +Percentage of time over the past sample period during which one or more +kernels was executing on the GPU. + +@ nvidia.memactive Percentage of time spent accessing memory +Percent of time over the past sample period during which global (device) +memory was being read or written. This metric shows if the memory is +actively being accessed, and is not correlated to storage amount used. + +@ nvidia.memused Allocated FB memory +Amount of GPU FB memory that has currently been allocated, in bytes. +Note that the driver/GPU always sets aside a small amount of memory +for bookkeeping. + +@ nvidia.memtotal Total FB memory available +The total amount of GPU FB memory available on the card, in bytes. + +@ nvidia.memfree Unallocated FB memory +Amount of GPU FB memory that is not currently allocated, in bytes. 
diff --git a/src/pmdas/nvidia/localnvml.c b/src/pmdas/nvidia/localnvml.c
new file mode 100644
index 0000000..2cadeb9
--- /dev/null
+++ b/src/pmdas/nvidia/localnvml.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2014 Red Hat.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+#include "pmapi.h"
+#include "impl.h"
+#if defined(HAVE_DLFCN_H)
+#include <dlfcn.h>
+#endif
+#include "localnvml.h"
+
+/*
+ * Implements NVML interfaces based on:
+ * http://docs.nvidia.com/deploy/nvml-api/index.html
+ * ... using either a dlopen'd 3rd party or "no values available".
+ */
+
+/*
+ * One identifier per NVML entry point used by this PMDA.  The values are
+ * the indices into nvml_symtab[] below; the designated initializers in
+ * that table keep the two in step regardless of declaration order.
+ */
+enum {
+    NVML_INIT,
+    NVML_SHUTDOWN,
+    NVML_DEVICE_GET_COUNT,
+    NVML_DEVICE_GET_HANDLEBYINDEX,
+    NVML_DEVICE_GET_NAME,
+    NVML_DEVICE_GET_PCIINFO,
+    NVML_DEVICE_GET_FANSPEED,
+    NVML_DEVICE_GET_TEMPERATURE,
+    NVML_DEVICE_GET_UTILIZATIONRATES,
+    NVML_DEVICE_GET_MEMORYINFO,
+    NVML_DEVICE_GET_PERFORMANCESTATE,
+    NVML_SYMBOL_COUNT
+};
+
+/*
+ * Symbol name and resolved address for each entry point.  A handle stays
+ * NULL until resolve_symbols() succeeds, and remains NULL afterwards for
+ * any symbol that dlsym() could not find.  Only used within this file.
+ */
+static struct {
+    const char *symbol;
+    void *handle;
+} nvml_symtab[NVML_SYMBOL_COUNT] = {
+    [NVML_INIT] = { .symbol = "nvmlInit" },
+    [NVML_SHUTDOWN] = { .symbol = "nvmlShutdown" },
+    [NVML_DEVICE_GET_COUNT] = { .symbol = "nvmlDeviceGetCount" },
+    [NVML_DEVICE_GET_HANDLEBYINDEX] = { .symbol = "nvmlDeviceGetHandleByIndex" },
+    [NVML_DEVICE_GET_NAME] = { .symbol = "nvmlDeviceGetName" },
+    [NVML_DEVICE_GET_PCIINFO] = { .symbol = "nvmlDeviceGetPciInfo" },
+    [NVML_DEVICE_GET_FANSPEED] = { .symbol = "nvmlDeviceGetFanSpeed" },
+    [NVML_DEVICE_GET_TEMPERATURE] = { .symbol = "nvmlDeviceGetTemperature" },
+    [NVML_DEVICE_GET_UTILIZATIONRATES] = { .symbol = "nvmlDeviceGetUtilizationRates" },
+    [NVML_DEVICE_GET_MEMORYINFO] = { .symbol = "nvmlDeviceGetMemoryInfo" },
+    [NVML_DEVICE_GET_PERFORMANCESTATE] = { .symbol = "nvmlDeviceGetPerformanceState" },
+};
+
+/* Function pointer types matching the NVML entry points listed above */
+typedef int (*local_init_t)(void);
+typedef int (*local_shutdown_t)(void);
+typedef int (*local_dev_get_count_t)(unsigned int *);
+typedef int (*local_dev_get_handlebyindex_t)(unsigned int, nvmlDevice_t *);
+typedef int (*local_dev_get_name_t)(nvmlDevice_t, char *, unsigned int);
+typedef int (*local_dev_get_pciinfo_t)(nvmlDevice_t, nvmlPciInfo_t *);
+typedef int (*local_dev_get_fanspeed_t)(nvmlDevice_t, unsigned int *);
+typedef int (*local_dev_get_temperature_t)(nvmlDevice_t, nvmlTemperatureSensors_t, unsigned int *);
+typedef int (*local_dev_get_utilizationrates_t)(nvmlDevice_t, nvmlUtilization_t *);
+typedef int (*local_dev_get_memoryinfo_t)(nvmlDevice_t, nvmlMemory_t *);
+typedef int (*local_dev_get_performancestate_t)(nvmlDevice_t, nvmlPstates_t *);
+
+/*
+ * Load the NVML shared library (once) and look up every symbol in
+ * nvml_symtab[].  Returns 0 when the library is (already) loaded and
+ * NVML_ERROR_LIBRARY_NOT_FOUND when it cannot be opened.  Individual
+ * symbols that are missing are left NULL and reported by the wrappers.
+ *
+ * No message is logged on failure: callers retry this on every fetch
+ * while the library is absent.
+ */
+static int
+resolve_symbols(void)
+{
+#if defined(HAVE_DLFCN_H)
+    static void *nvml_dso;
+    int i;
+
+    if (nvml_dso != NULL)
+        return 0;
+    if ((nvml_dso = dlopen("libnvidia-ml." DSOSUFFIX, RTLD_NOW)) == NULL)
+        return NVML_ERROR_LIBRARY_NOT_FOUND;
+    __pmNotifyErr(LOG_INFO, "Successfully loaded NVIDIA NVML library");
+    for (i = 0; i < NVML_SYMBOL_COUNT; i++)
+        nvml_symtab[i].handle = dlsym(nvml_dso, nvml_symtab[i].symbol);
+    return 0;
+#else
+    /* no dlopen(3) on this platform, so NVML can never be loaded */
+    return NVML_ERROR_LIBRARY_NOT_FOUND;
+#endif
+}
+
+/*
+ * Each wrapper below forwards to the dlsym'd NVML routine of the same
+ * name, or returns NVML_ERROR_FUNCTION_NOT_FOUND if it was not resolved.
+ */
+
+/* Resolve the NVML symbols if necessary, then call nvmlInit() */
+int
+localNvmlInit(void)
+{
+    local_init_t init;
+    void *func;
+    int sts = resolve_symbols();
+
+    if (sts != 0)
+        return sts;
+    if ((func = nvml_symtab[NVML_INIT].handle) == NULL)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    init = (local_init_t)func;
+    return init();
+}
+
+int
+localNvmlShutdown(void)
+{
+    local_shutdown_t shutdown;
+    void *func = nvml_symtab[NVML_SHUTDOWN].handle;
+
+    if (!func)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    shutdown = (local_shutdown_t)func;
+    return shutdown();
+}
+
+int
+localNvmlDeviceGetCount(unsigned int *count)
+{
+    local_dev_get_count_t dev_get_count;
+    void *func = nvml_symtab[NVML_DEVICE_GET_COUNT].handle;
+
+    if (!func)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    dev_get_count = (local_dev_get_count_t)func;
+    return dev_get_count(count);
+}
+
+int
+localNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device)
+{
+    local_dev_get_handlebyindex_t dev_get_handlebyindex;
+    void *func = nvml_symtab[NVML_DEVICE_GET_HANDLEBYINDEX].handle;
+
+    if (!func)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    dev_get_handlebyindex = (local_dev_get_handlebyindex_t)func;
+    return dev_get_handlebyindex(index, device);
+}
+
+int
+localNvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int size)
+{
+    local_dev_get_name_t dev_get_name;
+    void *func = nvml_symtab[NVML_DEVICE_GET_NAME].handle;
+
+    if (!func)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    dev_get_name = (local_dev_get_name_t)func;
+    return dev_get_name(device, name, size);
+}
+
+int
+localNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *info)
+{
+    local_dev_get_pciinfo_t dev_get_pciinfo;
+    void *func = nvml_symtab[NVML_DEVICE_GET_PCIINFO].handle;
+
+    if (!func)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    dev_get_pciinfo = (local_dev_get_pciinfo_t)func;
+    return dev_get_pciinfo(device, info);
+}
+
+int
+localNvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed)
+{
+    local_dev_get_fanspeed_t dev_get_fanspeed;
+    void *func = nvml_symtab[NVML_DEVICE_GET_FANSPEED].handle;
+
+    if (!func)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    dev_get_fanspeed = (local_dev_get_fanspeed_t)func;
+    return dev_get_fanspeed(device, speed);
+}
+
+int
+localNvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t code, unsigned int *temp)
+{
+    local_dev_get_temperature_t dev_get_temperature;
+    void *func = nvml_symtab[NVML_DEVICE_GET_TEMPERATURE].handle;
+
+    if (!func)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    dev_get_temperature = (local_dev_get_temperature_t)func;
+    return dev_get_temperature(device, code, temp);
+}
+
+int
+localNvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *util)
+{
+    local_dev_get_utilizationrates_t dev_get_utilizationrates;
+    void *func = nvml_symtab[NVML_DEVICE_GET_UTILIZATIONRATES].handle;
+
+    if (!func)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    dev_get_utilizationrates = (local_dev_get_utilizationrates_t)func;
+    return dev_get_utilizationrates(device, util);
+}
+
+int
+localNvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory)
+{
+    local_dev_get_memoryinfo_t dev_get_memoryinfo;
+    void *func = nvml_symtab[NVML_DEVICE_GET_MEMORYINFO].handle;
+
+    if (!func)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    dev_get_memoryinfo = (local_dev_get_memoryinfo_t)func;
+    return dev_get_memoryinfo(device, memory);
+}
+
+int
+localNvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *state)
+{
+    local_dev_get_performancestate_t dev_get_performancestate;
+    void *func = nvml_symtab[NVML_DEVICE_GET_PERFORMANCESTATE].handle;
+
+    if (!func)
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    dev_get_performancestate = (local_dev_get_performancestate_t)func;
+    return dev_get_performancestate(device, state);
+}
+
+/*
+ * Map an NVML return code onto a static, human-readable message.
+ * Codes not present in the table yield "No such error code".
+ */
+const char *
+localNvmlErrStr(nvmlReturn_t sts)
+{
+    size_t i;
+    static const char *unknown = "No such error code";
+    static const struct {
+        int code;
+        const char *msg;
+    } table[] = { {
+        NVML_SUCCESS,
+"The operation was successful" }, {
+        NVML_ERROR_UNINITIALIZED,
+"NVML was not first initialized with nvmlInit()" }, {
+        NVML_ERROR_INVALID_ARGUMENT,
+"A supplied argument is invalid" }, {
+        NVML_ERROR_NOT_SUPPORTED,
+"The requested operation is not available on target device" }, {
+        NVML_ERROR_NO_PERMISSION,
+"The current user does not have permission for operation" }, {
+        NVML_ERROR_ALREADY_INITIALIZED,
+"Deprecated error code (5)" }, {
+        NVML_ERROR_NOT_FOUND,
+"A query to find an object was unsuccessful" }, {
+        NVML_ERROR_INSUFFICIENT_SIZE,
+"An input argument is not large enough" }, {
+        NVML_ERROR_INSUFFICIENT_POWER,
+"A device's external power cables are not properly attached" }, {
+        NVML_ERROR_DRIVER_NOT_LOADED,
+"NVIDIA driver is not loaded" }, {
+        NVML_ERROR_TIMEOUT,
+"User provided timeout passed" }, {
+        NVML_ERROR_IRQ_ISSUE,
+"NVIDIA Kernel detected an interrupt issue with a GPU" }, {
+        NVML_ERROR_LIBRARY_NOT_FOUND,
+"NVML Shared Library couldn't be found or loaded" }, {
+        NVML_ERROR_FUNCTION_NOT_FOUND,
+"Local version of NVML doesn't implement this function" }, {
+        NVML_ERROR_CORRUPTED_INFOROM,
+"infoROM is corrupted" }, {
+        NVML_ERROR_GPU_IS_LOST,
+"The GPU has fallen off the bus or has otherwise become inaccessible" }, {
+        NVML_ERROR_UNKNOWN,
+"An internal driver error occurred"
+    } };
+
+    for (i = 0; i < (sizeof(table)/sizeof(table[0])); i++) {
+        if (table[i].code == (int)sts)
+            return table[i].msg;
+    }
+    return unknown;
+}
diff --git a/src/pmdas/nvidia/localnvml.h b/src/pmdas/nvidia/localnvml.h
new file mode 100644
index 0000000..3d108e5
--- /dev/null
+++ b/src/pmdas/nvidia/localnvml.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 Red Hat.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+#ifndef _LOCAL_NVML_H
+#define _LOCAL_NVML_H
+
+/*
+ * NVML interfaces and data structures, based on:
+ * http://docs.nvidia.com/deploy/nvml-api/index.html
+ */
+
+#define NVML_DEVICE_NAME_BUFFER_SIZE 64
+#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
+
+typedef void *nvmlDevice_t; /* used as an opaque handle */
+typedef int nvmlPstates_t; /* performance state (0-15) */
+
+/* Error codes */
+typedef enum {
+    NVML_SUCCESS = 0,
+    NVML_ERROR_UNINITIALIZED = 1,
+    NVML_ERROR_INVALID_ARGUMENT = 2,
+    NVML_ERROR_NOT_SUPPORTED = 3,
+    NVML_ERROR_NO_PERMISSION = 4,
+    NVML_ERROR_ALREADY_INITIALIZED = 5,
+    NVML_ERROR_NOT_FOUND = 6,
+    NVML_ERROR_INSUFFICIENT_SIZE = 7,
+    NVML_ERROR_INSUFFICIENT_POWER = 8,
+    NVML_ERROR_DRIVER_NOT_LOADED = 9,
+    NVML_ERROR_TIMEOUT = 10,
+    NVML_ERROR_IRQ_ISSUE = 11,
+    NVML_ERROR_LIBRARY_NOT_FOUND = 12,
+    NVML_ERROR_FUNCTION_NOT_FOUND = 13,
+    NVML_ERROR_CORRUPTED_INFOROM = 14,
+    NVML_ERROR_GPU_IS_LOST = 15,
+    NVML_ERROR_UNKNOWN = 999
+} nvmlReturn_t;
+
+typedef enum {
+    NVML_TEMPERATURE_GPU = 0,
+    NVML_TEMPERATURE_COUNT
+} nvmlTemperatureSensors_t;
+
+typedef struct {
+    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+    unsigned int domain;
+    unsigned int bus;
+    unsigned int device;
+    unsigned int pciDeviceId;
+    unsigned int pciSubSystemId;
+    unsigned int reserved[4];
+} nvmlPciInfo_t;
+
+typedef struct {
+    unsigned int gpu;
+    unsigned int memory;
+} nvmlUtilization_t;
+
+typedef struct {
+    unsigned long long total;
+    unsigned long long free;
+    unsigned long long used;
+} nvmlMemory_t;
+
+extern int localNvmlInit(void);
+extern int localNvmlShutdown(void);
+extern const char *localNvmlErrStr(nvmlReturn_t);
+
+extern int localNvmlDeviceGetCount(unsigned int *);
+extern int localNvmlDeviceGetHandleByIndex(unsigned int, nvmlDevice_t *);
+extern int localNvmlDeviceGetName(nvmlDevice_t, char *, unsigned int);
+extern int localNvmlDeviceGetPciInfo(nvmlDevice_t, nvmlPciInfo_t *);
+extern int localNvmlDeviceGetFanSpeed(nvmlDevice_t, unsigned int *);
+extern int localNvmlDeviceGetTemperature(nvmlDevice_t, nvmlTemperatureSensors_t, unsigned int *);
+extern int localNvmlDeviceGetUtilizationRates(nvmlDevice_t, nvmlUtilization_t *);
+extern int localNvmlDeviceGetMemoryInfo(nvmlDevice_t, nvmlMemory_t *);
+extern int localNvmlDeviceGetPerformanceState(nvmlDevice_t, nvmlPstates_t *);
+
+#endif /* _LOCAL_NVML_H */
diff --git a/src/pmdas/nvidia/nvidia.c b/src/pmdas/nvidia/nvidia.c
new file mode 100644
index 0000000..849f51f
--- /dev/null
+++ b/src/pmdas/nvidia/nvidia.c
@@ -0,0 +1,391 @@
+/*
+ * Copyright (c) 2014 Red Hat.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details. 
+ */
+#include "pmapi.h"
+#include "impl.h"
+#include "pmda.h"
+#include "domain.h"
+#include "localnvml.h"
+
+/* InDom table (just one row - corresponding to the set of graphics cards) */
+enum { GCARD_INDOM = 0 };
+pmdaIndom indomtab[] = {
+    { GCARD_INDOM, 0, NULL },
+};
+
+/* List of metric item numbers - increasing from zero, no holes */
+enum {
+    NVIDIA_NUMCARDS = 0,
+    NVIDIA_CARDID,
+    NVIDIA_CARDNAME,
+    NVIDIA_BUSID,
+    NVIDIA_TEMP,
+    NVIDIA_FANSPEED,
+    NVIDIA_PERFSTATE,
+    NVIDIA_GPUACTIVE,
+    NVIDIA_MEMACTIVE,
+    NVIDIA_MEMUSED,
+    NVIDIA_MEMTOTAL,
+    NVIDIA_MEMFREE,
+
+    NVIDIA_METRIC_COUNT
+};
+
+/* Table of metrics exported by this PMDA */
+static pmdaMetric metrictab[] = {
+    { NULL, { PMDA_PMID(0, NVIDIA_NUMCARDS), PM_TYPE_U32, PM_INDOM_NULL,
+        PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_CARDID), PM_TYPE_U32, GCARD_INDOM,
+        PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_CARDNAME), PM_TYPE_STRING, GCARD_INDOM,
+        PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_BUSID), PM_TYPE_STRING, GCARD_INDOM,
+        PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_TEMP), PM_TYPE_U32, GCARD_INDOM,
+        PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_FANSPEED), PM_TYPE_U32, GCARD_INDOM,
+        PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_PERFSTATE), PM_TYPE_U32, GCARD_INDOM,
+        PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_GPUACTIVE), PM_TYPE_U32, GCARD_INDOM,
+        PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_MEMACTIVE), PM_TYPE_U32, GCARD_INDOM,
+        PM_SEM_INSTANT, PMDA_PMUNITS(0, 0, 0, 0, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_MEMUSED), PM_TYPE_U64, GCARD_INDOM,
+        PM_SEM_INSTANT, PMDA_PMUNITS(1, 0, 0, PM_SPACE_BYTE, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_MEMTOTAL), PM_TYPE_U64, GCARD_INDOM,
+        PM_SEM_DISCRETE, PMDA_PMUNITS(1, 0, 0, PM_SPACE_BYTE, 0, 0) } },
+    { NULL, { PMDA_PMID(0, NVIDIA_MEMFREE), PM_TYPE_U64, GCARD_INDOM,
+        PM_SEM_INSTANT, PMDA_PMUNITS(1, 0, 0, PM_SPACE_BYTE, 0, 0) } },
+};
+
+/* GCARD_INDOM struct, stats that are per card */
+typedef struct {
+    int cardid;
+    int failed[NVIDIA_METRIC_COUNT];   /* non-zero: no value this sample */
+    char *name;                        /* strdup'd once, then cached */
+    char *busid;                       /* strdup'd once, then cached */
+    int temp;
+    int fanspeed;
+    int perfstate;
+    nvmlUtilization_t active;
+    nvmlMemory_t memory;
+} nvinfo_t;
+
+/* overall struct, holds instance values, indom and instance struct arrays */
+typedef struct {
+    int numcards;       /* device count seen by the latest refresh */
+    int maxcards;       /* number of entries allocated in nvinfo[] */
+    nvinfo_t *nvinfo;
+    pmdaIndom *nvindom;
+} pcp_nvinfo_t;
+
+static pcp_nvinfo_t pcp_nvinfo;
+static char mypath[MAXPATHLEN];
+static int isDSO = 1;
+static int nvmlDSO_loaded;
+
+/*
+ * Build the graphics card instance domain ("gpu0", "gpu1", ...) and the
+ * matching per-card value array, both sized by the NVML device count.
+ *
+ * Returns 0 on success, the NVML error code if the device count cannot
+ * be obtained, or -ENOMEM on allocation failure.  On failure the indom
+ * is left empty and nothing allocated here is retained, so the call can
+ * safely be retried later.
+ */
+static int
+setup_gcard_indom(void)
+{
+    unsigned int device_count = 0;
+    pmdaIndom *idp = &indomtab[GCARD_INDOM];
+    pmdaInstid *instids;
+    nvinfo_t *values;
+    char gpuname[32], *name;
+    size_t size;
+    unsigned int i;
+    int sts;
+
+    /* Initialize instance domain and instances. */
+    if ((sts = localNvmlDeviceGetCount(&device_count)) != NVML_SUCCESS) {
+        __pmNotifyErr(LOG_ERR, "nvmlDeviceGetCount: %s",
+                localNvmlErrStr(sts));
+        return sts;
+    }
+
+    pcp_nvinfo.nvindom = idp;
+    pcp_nvinfo.nvindom->it_numinst = 0;
+    pcp_nvinfo.numcards = 0;
+    pcp_nvinfo.maxcards = 0;
+
+    /* no cards is not an error; avoid malloc(0), which may return NULL */
+    if (device_count == 0)
+        return 0;
+
+    size = device_count * sizeof(pmdaInstid);
+    if ((instids = (pmdaInstid *)malloc(size)) == NULL) {
+        __pmNoMem("gcard indom", size, PM_RECOV_ERR);
+        return -ENOMEM;
+    }
+
+    size = device_count * sizeof(nvinfo_t);
+    if ((values = (nvinfo_t *)malloc(size)) == NULL) {
+        __pmNoMem("gcard values", size, PM_RECOV_ERR);
+        free(instids);
+        return -ENOMEM;
+    }
+    memset(values, 0, size);
+
+    for (i = 0; i < device_count; i++) {
+        instids[i].i_inst = i;
+        snprintf(gpuname, sizeof(gpuname), "gpu%u", i);
+        if ((name = strdup(gpuname)) == NULL) {
+            __pmNoMem("gcard instname", strlen(gpuname), PM_RECOV_ERR);
+            /* release the names already duplicated: entries 0 .. i-1 */
+            while (i-- > 0)
+                free(instids[i].i_name);
+            free(instids);
+            free(values);
+            return -ENOMEM;
+        }
+        instids[i].i_name = name;
+    }
+
+    /* publish only once everything has been allocated successfully */
+    pcp_nvinfo.nvindom->it_set = instids;
+    pcp_nvinfo.nvindom->it_numinst = device_count;
+    pcp_nvinfo.nvinfo = values;
+    pcp_nvinfo.maxcards = device_count;
+    return 0;
+}
+
+/*
+ * Sample every card through NVML and store the results in pcp_nvinfo.
+ *
+ * If NVML has not been brought up yet this first tries to initialise it
+ * and build the instance domain; while that keeps failing the function
+ * returns 0 without sampling, so the attempt is repeated on the next
+ * fetch.  For each card, failed[] records which metrics could not be
+ * obtained; values are only stored for calls that succeeded.
+ *
+ * Returns 0, or the NVML error code if the device count query fails.
+ */
+static int
+refresh(pcp_nvinfo_t *pcp_nvinfo)
+{
+    unsigned int device_count;
+    nvmlDevice_t device;
+    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
+    nvmlPciInfo_t pci;
+    unsigned int fanspeed;
+    unsigned int temperature;
+    nvmlUtilization_t utilization;
+    nvmlMemory_t memory;
+    nvmlPstates_t pstate;
+    nvinfo_t *info;
+    unsigned int i;
+    int j, sts;
+
+    if (!nvmlDSO_loaded) {
+        /* same success criteria as nvidia_init() */
+        if (localNvmlInit() != NVML_SUCCESS)
+            return 0;
+        if (setup_gcard_indom() != 0)
+            return 0;
+        nvmlDSO_loaded = 1;
+    }
+
+    if ((sts = localNvmlDeviceGetCount(&device_count)) != 0) {
+        __pmNotifyErr(LOG_ERR, "nvmlDeviceGetCount: %s",
+                localNvmlErrStr(sts));
+        return sts;
+    }
+    pcp_nvinfo->numcards = device_count;
+
+    for (i = 0; i < device_count && i < (unsigned int)pcp_nvinfo->maxcards; i++) {
+        info = &pcp_nvinfo->nvinfo[i];
+        info->cardid = i;
+        if ((sts = localNvmlDeviceGetHandleByIndex(i, &device))) {
+            __pmNotifyErr(LOG_ERR, "nvmlDeviceGetHandleByIndex: %s",
+                    localNvmlErrStr(sts));
+            /* failed[] is an int array: flag each element individually */
+            for (j = 0; j < NVIDIA_METRIC_COUNT; j++)
+                info->failed[j] = 1;
+            continue;
+        }
+        memset(info->failed, 0, sizeof(info->failed));
+
+        /* name and bus id are cached; a failed strdup counts as no value */
+        if (localNvmlDeviceGetName(device, name, sizeof(name)) != NVML_SUCCESS)
+            info->failed[NVIDIA_CARDNAME] = 1;
+        else if (info->name == NULL && (info->name = strdup(name)) == NULL)
+            info->failed[NVIDIA_CARDNAME] = 1;
+
+        if (localNvmlDeviceGetPciInfo(device, &pci) != NVML_SUCCESS)
+            info->failed[NVIDIA_BUSID] = 1;
+        else if (info->busid == NULL && (info->busid = strdup(pci.busId)) == NULL)
+            info->failed[NVIDIA_BUSID] = 1;
+
+        if (localNvmlDeviceGetFanSpeed(device, &fanspeed) != NVML_SUCCESS)
+            info->failed[NVIDIA_FANSPEED] = 1;
+        else
+            info->fanspeed = fanspeed;
+
+        if (localNvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature) != NVML_SUCCESS)
+            info->failed[NVIDIA_TEMP] = 1;
+        else
+            info->temp = temperature;
+
+        if (localNvmlDeviceGetUtilizationRates(device, &utilization) != NVML_SUCCESS) {
+            info->failed[NVIDIA_GPUACTIVE] = 1;
+            info->failed[NVIDIA_MEMACTIVE] = 1;
+        }
+        else
+            info->active = utilization;     /* struct copy */
+
+        if (localNvmlDeviceGetMemoryInfo(device, &memory) != NVML_SUCCESS) {
+            info->failed[NVIDIA_MEMUSED] = 1;
+            info->failed[NVIDIA_MEMTOTAL] = 1;
+            info->failed[NVIDIA_MEMFREE] = 1;
+        }
+        else
+            info->memory = memory;          /* struct copy */
+
+        if (localNvmlDeviceGetPerformanceState(device, &pstate) != NVML_SUCCESS)
+            info->failed[NVIDIA_PERFSTATE] = 1;
+        else
+            info->perfstate = pstate;
+    }
+
+    return 0;
+}
+
+/*
+ * Wrapper for pmdaFetch which refresh the set of values once per fetch
+ * PDU. The fetchCallback is then called once per-metric/instance pair
+ * to perform the actual filling of the pmResult (via each pmAtomValue).
+ */
+static int
+nvidia_fetch(int numpmid, pmID pmidlist[], pmResult **resp, pmdaExt *pmda)
+{
+    refresh(&pcp_nvinfo);
+    return pmdaFetch(numpmid, pmidlist, resp, pmda);
+}
+
+/*
+ * Fill in one metric value from the data gathered by refresh().
+ *
+ * Returns 0 with *atom set on success, PM_ERR_PMID for an unknown
+ * cluster or item, PM_ERR_INST for an instance outside the card indom,
+ * and PM_ERR_VALUE when the latest NVML query for that metric failed.
+ */
+static int
+nvidia_fetchCallBack(pmdaMetric *mdesc, unsigned int inst, pmAtomValue *atom)
+{
+    __pmID_int *idp = (__pmID_int *)&(mdesc->m_desc.pmid);
+    nvinfo_t *card;
+
+    if (idp->cluster != 0)
+        return PM_ERR_PMID;
+
+    /* the only metric without an instance domain */
+    if (idp->item == NVIDIA_NUMCARDS) {
+        atom->ul = pcp_nvinfo.numcards;
+        return 0;
+    }
+
+    /* valid instances are 0 .. it_numinst-1 */
+    if (inst >= (unsigned int)indomtab[GCARD_INDOM].it_numinst)
+        return PM_ERR_INST;
+    card = &pcp_nvinfo.nvinfo[inst];
+
+    switch (idp->item) {
+    case NVIDIA_CARDID:
+        atom->ul = card->cardid;
+        break;
+    case NVIDIA_CARDNAME:
+        if (card->failed[NVIDIA_CARDNAME])
+            return PM_ERR_VALUE;
+        atom->cp = card->name;
+        break;
+    case NVIDIA_BUSID:
+        if (card->failed[NVIDIA_BUSID])
+            return PM_ERR_VALUE;
+        atom->cp = card->busid;
+        break;
+    case NVIDIA_TEMP:
+        if (card->failed[NVIDIA_TEMP])
+            return PM_ERR_VALUE;
+        atom->ul = card->temp;
+        break;
+    case NVIDIA_FANSPEED:
+        if (card->failed[NVIDIA_FANSPEED])
+            return PM_ERR_VALUE;
+        atom->ul = card->fanspeed;
+        break;
+    case NVIDIA_PERFSTATE:
+        if (card->failed[NVIDIA_PERFSTATE])
+            return PM_ERR_VALUE;
+        atom->ul = card->perfstate;
+        break;
+    case NVIDIA_GPUACTIVE:
+        if (card->failed[NVIDIA_GPUACTIVE])
+            return PM_ERR_VALUE;
+        atom->ul = card->active.gpu;
+        break;
+    case NVIDIA_MEMACTIVE:
+        if (card->failed[NVIDIA_MEMACTIVE])
+            return PM_ERR_VALUE;
+        atom->ul = card->active.memory;
+        break;
+    case NVIDIA_MEMUSED:
+        if (card->failed[NVIDIA_MEMUSED])
+            return PM_ERR_VALUE;
+        atom->ull = card->memory.used;
+        break;
+    case NVIDIA_MEMTOTAL:
+        if (card->failed[NVIDIA_MEMTOTAL])
+            return PM_ERR_VALUE;
+        atom->ull = card->memory.total;
+        break;
+    case NVIDIA_MEMFREE:
+        if (card->failed[NVIDIA_MEMFREE])
+            return PM_ERR_VALUE;
+        atom->ull = card->memory.free;
+        break;
+    default:
+        return PM_ERR_PMID;
+    }
+
+    return 0;
+}
+
+/**
+ * Initializes the path to the help file for this PMDA.
+ */
+static void
+initializeHelpPath(void)
+{
+    int sep = __pmPathSeparator();
+    snprintf(mypath, sizeof(mypath), "%s%c" "nvidia" "%c" "help",
+            pmGetConfig("PCP_PMDAS_DIR"), sep, sep);
+}
+
+/*
+ * PMDA initialisation, shared by the DSO and daemon forms.  Tries to
+ * bring up NVML and build the card instance domain; if either step
+ * fails the PMDA still starts and refresh() retries on later fetches.
+ */
+void
+__PMDA_INIT_CALL
+nvidia_init(pmdaInterface *dp)
+{
+    int sts;
+
+    if (isDSO) {
+        initializeHelpPath();
+        pmdaDSO(dp, PMDA_INTERFACE_2, "nvidia DSO", mypath);
+    }
+
+    if (dp->status != 0)
+        return;
+
+    if ((sts = localNvmlInit()) == NVML_SUCCESS &&
+        (sts = setup_gcard_indom()) == 0) {
+        nvmlDSO_loaded = 1;
+    }
+    else {
+        /*
+         * This is OK, just continue on until it *is* installed;
+         * until that time, simply report "no values available".
+         */
+        __pmNotifyErr(LOG_INFO, "NVIDIA NVML library currently unavailable");
+    }
+
+    dp->version.any.fetch = nvidia_fetch;
+    pmdaSetFetchCallBack(dp, nvidia_fetchCallBack);
+
+    pmdaInit(dp, indomtab, sizeof(indomtab)/sizeof(indomtab[0]),
+            metrictab, sizeof(metrictab)/sizeof(metrictab[0]));
+}
+
+static pmLongOptions longopts[] = {
+    PMDA_OPTIONS_HEADER("Options"),
+    PMOPT_DEBUG,
+    PMDAOPT_DOMAIN,
+    PMDAOPT_LOGFILE,
+    PMOPT_HELP,
+    PMDA_OPTIONS_END
+};
+
+static pmdaOptions opts = {
+    .short_options = "D:d:l:?",
+    .long_options = longopts,
+};
+
+/* Entry point for the daemon form of the PMDA */
+int
+main(int argc, char **argv)
+{
+    pmdaInterface desc;
+
+    isDSO = 0;
+    __pmSetProgname(argv[0]);
+
+    initializeHelpPath();
+    pmdaDaemon(&desc, PMDA_INTERFACE_2, pmProgname, NVML,
+            "nvidia.log", mypath);
+
+    pmdaGetOptions(argc, argv, &opts, &desc);
+    if (opts.errors) {
+        pmdaUsageMessage(&opts);
+        exit(1);
+    }
+
+    pmdaOpenLog(&desc);
+    pmdaConnect(&desc);
+    nvidia_init(&desc);
+    pmdaMain(&desc);
+
+    exit(0);
+}
diff --git a/src/pmdas/nvidia/pmns b/src/pmdas/nvidia/pmns
new file mode 
100644 index 0000000..b496cae --- /dev/null +++ b/src/pmdas/nvidia/pmns @@ -0,0 +1,30 @@ +/* + * Metrics for nvidia GPU PMDA + * + * Copyright (c) 2014 Red Hat. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ + +nvidia { + numcards NVML:0:0 + gpuid NVML:0:1 + cardname NVML:0:2 + busid NVML:0:3 + temp NVML:0:4 + fanspeed NVML:0:5 + perfstate NVML:0:6 + gpuactive NVML:0:7 + memactive NVML:0:8 + memused NVML:0:9 + memtotal NVML:0:10 + memfree NVML:0:11 +} diff --git a/src/pmdas/nvidia/root b/src/pmdas/nvidia/root new file mode 100644 index 0000000..fe12bc2 --- /dev/null +++ b/src/pmdas/nvidia/root @@ -0,0 +1,10 @@ +/* + * fake "root" for validating the local PMNS subtree + */ + +#include <stdpmid> + +root { nvidia } + +#include "pmns" + |