summaryrefslogtreecommitdiff
path: root/src/pmdas/nvidia/localnvml.c
blob: 2cadeb958081ba180a09feb5184ab166227903b9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
/*
 * Copyright (c) 2014 Red Hat.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 */
#include "pmapi.h"
#include "impl.h"
#if defined(HAVE_DLFCN_H)
#include <dlfcn.h>
#endif
#include "localnvml.h"

/*
 * Implements NVML interfaces based on:
 * http://docs.nvidia.com/deploy/nvml-api/index.html
 * ... using either a dlopen'd third-party library or, when that is not
 * available, "no values available" error returns.
 */

/*
 * Indices into nvml_symtab, one per NVML entry point that is looked up.
 * NVML_SYMBOL_COUNT is the number of table entries.
 */
enum {
    NVML_INIT,
    NVML_SHUTDOWN,
    NVML_DEVICE_GET_COUNT,
    NVML_DEVICE_GET_HANDLEBYINDEX,
    NVML_DEVICE_GET_NAME,
    NVML_DEVICE_GET_PCIINFO,
    NVML_DEVICE_GET_FANSPEED,
    NVML_DEVICE_GET_TEMPERATURE,
    NVML_DEVICE_GET_UTILIZATIONRATES,
    NVML_DEVICE_GET_MEMORYINFO,
    NVML_DEVICE_GET_PERFORMANCESTATE,
    NVML_SYMBOL_COUNT
};

/*
 * Symbol table: the name handed to dlsym() and the address it returned.
 * A handle stays NULL until the symbol has been resolved.  Each entry is
 * keyed by its enum constant so table and enum cannot drift apart.
 */
struct {
    const char	*symbol;
    void	*handle;
} nvml_symtab[] = {
    [NVML_INIT]				= { .symbol = "nvmlInit" },
    [NVML_SHUTDOWN]			= { .symbol = "nvmlShutdown" },
    [NVML_DEVICE_GET_COUNT]		= { .symbol = "nvmlDeviceGetCount" },
    [NVML_DEVICE_GET_HANDLEBYINDEX]	= { .symbol = "nvmlDeviceGetHandleByIndex" },
    [NVML_DEVICE_GET_NAME]		= { .symbol = "nvmlDeviceGetName" },
    [NVML_DEVICE_GET_PCIINFO]		= { .symbol = "nvmlDeviceGetPciInfo" },
    [NVML_DEVICE_GET_FANSPEED]		= { .symbol = "nvmlDeviceGetFanSpeed" },
    [NVML_DEVICE_GET_TEMPERATURE]	= { .symbol = "nvmlDeviceGetTemperature" },
    [NVML_DEVICE_GET_UTILIZATIONRATES]	= { .symbol = "nvmlDeviceGetUtilizationRates" },
    [NVML_DEVICE_GET_MEMORYINFO]	= { .symbol = "nvmlDeviceGetMemoryInfo" },
    [NVML_DEVICE_GET_PERFORMANCESTATE]	= { .symbol = "nvmlDeviceGetPerformanceState" },
};
typedef int (*local_init_t)(void);
typedef int (*local_shutdown_t)(void);
typedef int (*local_dev_get_count_t)(unsigned int *);
typedef int (*local_dev_get_handlebyindex_t)(unsigned int, nvmlDevice_t *);
typedef int (*local_dev_get_name_t)(nvmlDevice_t, char *, unsigned int);
typedef int (*local_dev_get_pciinfo_t)(nvmlDevice_t, nvmlPciInfo_t *);
typedef int (*local_dev_get_fanspeed_t)(nvmlDevice_t, unsigned int *);
typedef int (*local_dev_get_temperature_t)(nvmlDevice_t, nvmlTemperatureSensors_t, unsigned int *);
typedef int (*local_dev_get_utilizationrates_t)(nvmlDevice_t, nvmlUtilization_t *);
typedef int (*local_dev_get_memoryinfo_t)(nvmlDevice_t, nvmlMemory_t *);
typedef int (*local_dev_get_performancestate_t)(nvmlDevice_t, nvmlPstates_t *);

/*
 * Load the NVML shared library (at most once) and look up every entry
 * point named in nvml_symtab.
 *
 * Returns 0 when the library is loaded (by this call or an earlier one),
 * or NVML_ERROR_LIBRARY_NOT_FOUND when it cannot be opened.  A symbol
 * that dlsym() cannot find is left with a NULL handle; the individual
 * wrappers check for that before calling through.
 */
static int
resolve_symbols(void)
{
    static void *nvml_dso;
    int i;

    if (nvml_dso != NULL)
	return 0;

    nvml_dso = dlopen("libnvidia-ml." DSOSUFFIX, RTLD_NOW);
    if (nvml_dso == NULL) {
	/*
	 * Fall back to the versioned name (e.g. libnvidia-ml.so.1).
	 * NOTE(review): driver runtime packages commonly install only
	 * the versioned file - confirm for the platforms being targeted.
	 */
	nvml_dso = dlopen("libnvidia-ml." DSOSUFFIX ".1", RTLD_NOW);
    }
    if (nvml_dso == NULL)
	return NVML_ERROR_LIBRARY_NOT_FOUND;

    __pmNotifyErr(LOG_INFO, "Successfully loaded NVIDIA NVML library");
    for (i = 0; i < NVML_SYMBOL_COUNT; i++)
	nvml_symtab[i].handle = dlsym(nvml_dso, nvml_symtab[i].symbol);
    return 0;
}

int
localNvmlInit(void)
{
    local_init_t init;
    void *func;
    int sts = resolve_symbols();

    if (sts != 0)
	return sts;
    if ((func = nvml_symtab[NVML_INIT].handle) == NULL)
	return NVML_ERROR_FUNCTION_NOT_FOUND;
    init = (local_init_t)func;
    return init();
}

int
localNvmlShutdown(void)
{
    local_shutdown_t shutdown;
    void *func = nvml_symtab[NVML_SHUTDOWN].handle;

    if (!func)
	return NVML_ERROR_FUNCTION_NOT_FOUND;
    shutdown = (local_shutdown_t)func;
    return shutdown();
}

int
localNvmlDeviceGetCount(unsigned int *count)
{
    local_dev_get_count_t dev_get_count;
    void *func = nvml_symtab[NVML_DEVICE_GET_COUNT].handle;

    if (!func)
	return NVML_ERROR_FUNCTION_NOT_FOUND;
    dev_get_count = (local_dev_get_count_t)func;
    return dev_get_count(count);
}

/*
 * Forward to the library's nvmlDeviceGetHandleByIndex, passing index and
 * device through unchanged.  Returns NVML_ERROR_FUNCTION_NOT_FOUND when
 * no handle is available, otherwise the library call's return value.
 */
int
localNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device)
{
    void *sym = nvml_symtab[NVML_DEVICE_GET_HANDLEBYINDEX].handle;

    if (sym == NULL)
	return NVML_ERROR_FUNCTION_NOT_FOUND;
    return ((local_dev_get_handlebyindex_t)sym)(index, device);
}

/*
 * Forward to the library's nvmlDeviceGetName, passing device, name and
 * size through unchanged.  Returns NVML_ERROR_FUNCTION_NOT_FOUND when
 * no handle is available, otherwise the library call's return value.
 */
int
localNvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int size)
{
    void *sym = nvml_symtab[NVML_DEVICE_GET_NAME].handle;

    if (sym == NULL)
	return NVML_ERROR_FUNCTION_NOT_FOUND;
    return ((local_dev_get_name_t)sym)(device, name, size);
}

/*
 * Forward to the library's nvmlDeviceGetPciInfo, passing device and info
 * through unchanged.  Returns NVML_ERROR_FUNCTION_NOT_FOUND when no
 * handle is available, otherwise the library call's return value.
 */
int
localNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *info)
{
    void *sym = nvml_symtab[NVML_DEVICE_GET_PCIINFO].handle;

    if (sym == NULL)
	return NVML_ERROR_FUNCTION_NOT_FOUND;
    return ((local_dev_get_pciinfo_t)sym)(device, info);
}

/*
 * Forward to the library's nvmlDeviceGetFanSpeed, passing device and
 * speed through unchanged.  Returns NVML_ERROR_FUNCTION_NOT_FOUND when
 * no handle is available, otherwise the library call's return value.
 */
int
localNvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed)
{
    void *sym = nvml_symtab[NVML_DEVICE_GET_FANSPEED].handle;

    if (sym == NULL)
	return NVML_ERROR_FUNCTION_NOT_FOUND;
    return ((local_dev_get_fanspeed_t)sym)(device, speed);
}

/*
 * Forward to the library's nvmlDeviceGetTemperature, passing device,
 * code and temp through unchanged.  Returns
 * NVML_ERROR_FUNCTION_NOT_FOUND when no handle is available, otherwise
 * the library call's return value.
 */
int
localNvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t code, unsigned int *temp)
{
    void *sym = nvml_symtab[NVML_DEVICE_GET_TEMPERATURE].handle;

    if (sym == NULL)
	return NVML_ERROR_FUNCTION_NOT_FOUND;
    return ((local_dev_get_temperature_t)sym)(device, code, temp);
}

/*
 * Forward to the library's nvmlDeviceGetUtilizationRates, passing device
 * and util through unchanged.  Returns NVML_ERROR_FUNCTION_NOT_FOUND
 * when no handle is available, otherwise the library call's return
 * value.
 */
int
localNvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *util)
{
    void *sym = nvml_symtab[NVML_DEVICE_GET_UTILIZATIONRATES].handle;

    if (sym == NULL)
	return NVML_ERROR_FUNCTION_NOT_FOUND;
    return ((local_dev_get_utilizationrates_t)sym)(device, util);
}

/*
 * Forward to the library's nvmlDeviceGetMemoryInfo, passing device and
 * memory through unchanged.  Returns NVML_ERROR_FUNCTION_NOT_FOUND when
 * no handle is available, otherwise the library call's return value.
 */
int
localNvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory)
{
    void *sym = nvml_symtab[NVML_DEVICE_GET_MEMORYINFO].handle;

    if (sym == NULL)
	return NVML_ERROR_FUNCTION_NOT_FOUND;
    return ((local_dev_get_memoryinfo_t)sym)(device, memory);
}

/*
 * Forward to the library's nvmlDeviceGetPerformanceState, passing device
 * and state through unchanged.  Returns NVML_ERROR_FUNCTION_NOT_FOUND
 * when no handle is available, otherwise the library call's return
 * value.
 */
int
localNvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *state)
{
    void *sym = nvml_symtab[NVML_DEVICE_GET_PERFORMANCESTATE].handle;

    if (sym == NULL)
	return NVML_ERROR_FUNCTION_NOT_FOUND;
    return ((local_dev_get_performancestate_t)sym)(device, state);
}

/*
 * Map an NVML status code to a human-readable description.
 *
 * Returns a pointer to a statically allocated string, which the caller
 * must not modify or free.  A code that is not in the table yields
 * "No such error code".
 */
const char *
localNvmlErrStr(nvmlReturn_t sts)
{
    static const char *unknown = "No such error code";
    /* read-only lookup table, searched linearly */
    static const struct {
	int		code;
	const char	*msg;
    } table[] = {
	{ NVML_SUCCESS,
	  "The operation was successful" },
	{ NVML_ERROR_UNINITIALIZED,
	  "NVML was not first initialized with nvmlInit()" },
	{ NVML_ERROR_INVALID_ARGUMENT,
	  "A supplied argument is invalid" },
	{ NVML_ERROR_NOT_SUPPORTED,
	  "The requested operation is not available on target device" },
	{ NVML_ERROR_NO_PERMISSION,
	  "The current user does not have permission for operation" },
	{ NVML_ERROR_ALREADY_INITIALIZED,
	  "Deprecated error code (5)" },
	{ NVML_ERROR_NOT_FOUND,
	  "A query to find an object was unsuccessful" },
	{ NVML_ERROR_INSUFFICIENT_SIZE,
	  "An input argument is not large enough" },
	{ NVML_ERROR_INSUFFICIENT_POWER,
	  "A device's external power cables are not properly attached" },
	{ NVML_ERROR_DRIVER_NOT_LOADED,
	  "NVIDIA driver is not loaded" },
	{ NVML_ERROR_TIMEOUT,
	  "User provided timeout passed" },
	{ NVML_ERROR_IRQ_ISSUE,
	  "NVIDIA Kernel detected an interrupt issue with a GPU" },
	{ NVML_ERROR_LIBRARY_NOT_FOUND,
	  "NVML Shared Library couldn't be found or loaded" },
	{ NVML_ERROR_FUNCTION_NOT_FOUND,
	  "Local version of NVML doesn't implement this function" },
	{ NVML_ERROR_CORRUPTED_INFOROM,
	  "infoROM is corrupted" },
	{ NVML_ERROR_GPU_IS_LOST,
	  "The GPU has fallen off the bus or has otherwise become inaccessible" },
	{ NVML_ERROR_UNKNOWN,
	  "An internal driver error occurred" },
    };
    /* unsigned index: avoids a signed/unsigned comparison with sizeof */
    unsigned int i;

    for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
	if (table[i].code == (int)sts)
	    return table[i].msg;
    }
    return unknown;
}