-
floreal.risso authoredfloreal.risso authored
nvidia_gpu.c 16.77 KiB
/*******************************************************
Copyright (C) 2023-2023 Georges Da Costa <georges.da-costa@irit.fr>
This file is part of Mojitos.
Mojitos is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Mojitos is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with MojitO/S. If not, see <https://www.gnu.org/licenses/>.
*******************************************************/
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
// Pedantic throws a warning in the nvml library
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
#include <nvml.h>
#pragma GCC diagnostic pop
#include "util.h"
// -----------------------------SENSOR_KIND
typedef enum {
CLOCK_SENSOR = 0,
MEMORY_SENSOR = 1,
UTILIZATION_SENSOR = 2,
POWER_SENSOR = 3,
TEMPERATURE_SENSOR = 4,
COUNT_SENSOR = 5,
} SENSOR_KIND;
typedef struct Device Device;
typedef struct NvidiaGpu NvidiaGpu;
typedef struct ISensor ISensor;
typedef struct Sensor Sensor;
// -- Sensor interface
typedef unsigned int (Initializer) (const Device *, void **);
typedef unsigned int (Getter) (uint64_t *, const Device *, void *);
typedef unsigned int (Labeller) (char **, void *);
typedef void (Cleaner) (void *);
struct ISensor {
Initializer *init;
Getter *get;
Labeller *label;
Cleaner *clean;
};
// -- Sensor
struct Sensor {
void *data;
const ISensor *fun;
};
// -- Device: represents a gpu
struct Device {
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
nvmlDevice_t device;
unsigned int idx;
Sensor sensors[COUNT_SENSOR];
unsigned int count;
};
// -- NvidiaGpu: represents the devices
struct NvidiaGpu {
Device *devices;
unsigned int count;
};
// -- Label template
static const char *label_template = "gpu%u_%s_%s";
static const char *short_label_template = "gpu%u_%s";
// ----------------------------CLOCK_SENSOR
#define CLOCK_LABEL_SIZE 25
// -- All existing clocks
// -- SM : Streaming Multiprocessor
static const nvmlClockType_t clocks[NVML_CLOCK_COUNT] = {NVML_CLOCK_GRAPHICS, NVML_CLOCK_SM, NVML_CLOCK_MEM, NVML_CLOCK_VIDEO};
static const char *clock_names[NVML_CLOCK_COUNT] = {"graphics", "sm", "memory", "video"};
static const char *clock_base_name = "clk";
// -- Must contain the clocks compatible with the device
typedef struct {
nvmlClockType_t clocks[NVML_CLOCK_COUNT];
char labels[NVML_CLOCK_COUNT][CLOCK_LABEL_SIZE];
unsigned int count;
} ClockData;
unsigned int init_clock_sensor(const Device *device, void **data)
{
const nvmlDevice_t nvml_device = device->device;
const unsigned int device_idx = device->idx;
ClockData tmp = {0};
nvmlReturn_t err;
unsigned int clock;
// -- Test all clocks
for (unsigned int i = 0; i < NVML_CLOCK_COUNT; i++) {
if ((err = nvmlDeviceGetClockInfo(nvml_device, clocks[i], &clock)) == NVML_SUCCESS) {
snprintf(tmp.labels[tmp.count], CLOCK_LABEL_SIZE, label_template, device_idx, clock_base_name, clock_names[i]);
tmp.clocks[tmp.count] = clocks[i];
tmp.count += 1;
} else {
fprintf(stderr, "Failed to get %s clock : %s\n", clock_names[i], nvmlErrorString(err));
}
}
// -- No clock avaible
if (tmp.count == 0) {
return 0;
}
*data = calloc(1, sizeof(ClockData));
memcpy(*data, &tmp, sizeof (ClockData));
return tmp.count;
}
unsigned int get_clock_sensor(uint64_t *results, const Device *device, void *data)
{
const nvmlDevice_t nvml_device = device->device;
ClockData *clock_data = (ClockData *) data;
nvmlReturn_t err;
unsigned int clock;
for (unsigned int i = 0; i < clock_data->count; i++) {
nvmlClockType_t clock_type = clock_data->clocks[i];
if((err = nvmlDeviceGetClockInfo(nvml_device, clock_type, &clock)) != NVML_SUCCESS) {
fprintf(stderr, "Failed to get %s clock : %s\n", clock_names[clock_type], nvmlErrorString(err));
exit(99);
}
results[i] = clock;
}
return clock_data->count;
}
unsigned int label_clock_sensor(char **labels, void *data)
{
ClockData *clock_data = (ClockData *) data;
for (unsigned int i = 0; i < clock_data->count; i++) {
labels[i] = clock_data->labels[i];
}
return clock_data->count;
}
void clean_clock_sensor(void *data)
{
free(data);
}
// ---------------------------MEMORY_SENSOR
#define MEMORY_LABEL_SIZE 25
typedef enum {
FREE_MEMORY = 0U,
USED_MEMORY = 1U,
TOTAL_MEMORY = 2U,
COUNT_MEMORY = 3U,
} MemoryKind;
static const char *memory_names[COUNT_MEMORY] = {"free", "used", "total"};
static const char *memory_base_name = "mem";
typedef struct {
char labels[COUNT_MEMORY][MEMORY_LABEL_SIZE];
} MemoryData;
unsigned int init_memory_sensor(const Device *device, void **data)
{
const nvmlDevice_t nvml_device = device->device;
const unsigned int device_idx = device->idx;
nvmlMemory_t memory;
nvmlReturn_t err;
if ((err = nvmlDeviceGetMemoryInfo(nvml_device, &memory)) != NVML_SUCCESS) {
fprintf(stderr, "Failed to get device memory : %s\n", nvmlErrorString(err));
return 0;
}
MemoryData *memory_data = (MemoryData *) calloc(1, sizeof(MemoryData));
for (unsigned int i = 0; i < COUNT_MEMORY; i++) {
snprintf(memory_data->labels[i], MEMORY_LABEL_SIZE, label_template, device_idx, memory_base_name, memory_names[i]);
}
*data = (void *) memory_data;
return COUNT_MEMORY;
}
unsigned int get_memory_sensor(uint64_t *results, const Device *device, void *none)
{
UNUSED(none);
const nvmlDevice_t nvml_device = device->device;
nvmlMemory_t memory;
nvmlReturn_t err;
if ((err = nvmlDeviceGetMemoryInfo(nvml_device, &memory)) != NVML_SUCCESS) {
fprintf(stderr, "Failed to get device memory : %s\n", nvmlErrorString(err));
exit(99);
}
results[FREE_MEMORY] = memory.free;
results[USED_MEMORY] = memory.used;
results[TOTAL_MEMORY] = memory.total;
return COUNT_MEMORY;
}
unsigned int label_memory_sensor(char **labels, void *data)
{
MemoryData *memory_data = (MemoryData *) data;
for (unsigned int i = 0; i < COUNT_MEMORY; i++) {
labels[i] = memory_data->labels[i];
}
return COUNT_MEMORY;
}
void clean_memory_sensor(void *data)
{
free(data);
}
// ----------------------UTILIZATION_SENSOR
#define UTILIZATION_LABEL_SIZE 35
typedef enum {
GPU_UTILIZATION = 0U,
MEMORY_UTILIZATION = 1U,
COUNT_UTILIZATION = 2U,
} UtilizationKind;
typedef struct {
char labels[COUNT_UTILIZATION][UTILIZATION_LABEL_SIZE];
} UtilizationData;
static const char *utilization_names[COUNT_UTILIZATION] = {"gpu", "memory"};
static const char *utilization_base_name = "utilization";
unsigned int init_utilization_sensor(const Device *device, void **data)
{
const nvmlDevice_t nvml_device = device->device;
const unsigned int device_idx = device->idx;
nvmlReturn_t err;
nvmlUtilization_t utilization;
if ((err = nvmlDeviceGetUtilizationRates(nvml_device, &utilization)) != NVML_SUCCESS) {
fprintf(stderr, "Failed to get device utilization: %s\n", nvmlErrorString(err));
return 0;
}
UtilizationData *utilization_data = (UtilizationData *) calloc(1, sizeof(UtilizationData));
for (unsigned int i = 0; i < COUNT_UTILIZATION; i++) {
snprintf(utilization_data->labels[i], UTILIZATION_LABEL_SIZE, label_template, device_idx, utilization_base_name, utilization_names[i]);
}
*data = (void *) utilization_data;
return COUNT_UTILIZATION;
}
unsigned int get_utilization_sensor(uint64_t *results, const Device *device, void *none)
{
UNUSED(none);
const nvmlDevice_t nvml_device = device->device;
nvmlReturn_t err;
nvmlUtilization_t utilization;
if ((err = nvmlDeviceGetUtilizationRates(nvml_device, &utilization)) != NVML_SUCCESS) {
fprintf(stderr, "Failed to get device utilization: %s\n", nvmlErrorString(err));
exit(99);
}
results[GPU_UTILIZATION] = utilization.gpu;
results[MEMORY_UTILIZATION] = utilization.memory;
return COUNT_UTILIZATION;
}
unsigned int label_utilization_sensor(char **labels, void *data)
{
UtilizationData *utilization_data = (UtilizationData *) data;
for (unsigned int i = 0; i < COUNT_UTILIZATION; i++) {
labels[i] = utilization_data->labels[i];
}
return COUNT_UTILIZATION;
}
void clean_utilization_sensor(void *data)
{
free(data);
}
// ----------------------------POWER_SENSOR
#define POWER_LABEL_SIZE 25
#define COUNT_POWER 1
static const char *power_base_name = "power";
typedef struct {
char label[POWER_LABEL_SIZE];
} PowerData;
unsigned int init_power_sensor(const Device *device, void **data)
{
const nvmlDevice_t nvml_device = device->device;
const unsigned int device_idx = device->idx;
unsigned int power;
nvmlReturn_t err;
if ((err = nvmlDeviceGetPowerUsage(nvml_device, &power)) != NVML_SUCCESS) {
printf("Failed to get the device power consumption: %s\n", nvmlErrorString(err));
return 0;
}
PowerData *power_data = (PowerData *) calloc(1, sizeof(PowerData));
snprintf(power_data->label, POWER_LABEL_SIZE, short_label_template, device_idx, power_base_name);
*data = (void *) power_data;
return COUNT_POWER;
}
unsigned int get_power_sensor(uint64_t *results, const Device *device, void *none)
{
UNUSED(none);
const nvmlDevice_t nvml_device = device->device;
unsigned int power;
nvmlReturn_t err;
if ((err = nvmlDeviceGetPowerUsage(nvml_device, &power)) != NVML_SUCCESS) {
printf("Failed to get the device power consumption: %s\n", nvmlErrorString(err));
exit(99);
}
*results = power;
return COUNT_POWER;
}
unsigned int label_power_sensor(char **labels, void *data)
{
PowerData *power_data = (PowerData *) data;
*labels = power_data->label;
return COUNT_POWER;
}
void clean_power_sensor(void *data)
{
free(data);
}
// ----------------------TEMPERATURE_SENSOR
#define TEMPERATURE_LABEL_SIZE 35
#define COUNT_TEMPERATURE 1
static const char *temperature_base_name = "temperature";
typedef struct {
char label[TEMPERATURE_LABEL_SIZE];
} TemperatureData;
unsigned int init_temperature_sensor(const Device *device, void **data)
{
const nvmlDevice_t nvml_device = device->device;
const unsigned int device_idx = device->idx;
unsigned int temperature;
nvmlReturn_t err;
if ((err = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temperature)) != NVML_SUCCESS) {
printf("Failed to get the device temperature: %s\n", nvmlErrorString(err));
return 0;
}
TemperatureData *temperature_data = (TemperatureData *) calloc(1, sizeof(TemperatureData));
snprintf(temperature_data->label, TEMPERATURE_LABEL_SIZE, short_label_template, device_idx, temperature_base_name);
*data = (void *) temperature_data;
return COUNT_TEMPERATURE;
}
unsigned int get_temperature_sensor(uint64_t *results, const Device *device, void *none)
{
UNUSED(none);
const nvmlDevice_t nvml_device = device->device;
unsigned int temperature;
nvmlReturn_t err;
if ((err = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temperature)) != NVML_SUCCESS) {
printf("Failed to get the device temperature: %s\n", nvmlErrorString(err));
exit(99);
}
*results = temperature;
return COUNT_TEMPERATURE;
}
unsigned int label_temperature_sensor(char **labels, void *data)
{
TemperatureData *temperature_data = (TemperatureData *) data;
*labels = temperature_data->label;
return COUNT_TEMPERATURE;
}
void clean_temperature_sensor(void *data)
{
free(data);
}
// -------------------------AVAIBLE_SENSORS
static const ISensor avaible_sensors[COUNT_SENSOR] = {
{.init = init_clock_sensor, .get = get_clock_sensor, .label = label_clock_sensor, .clean = clean_clock_sensor},
{.init = init_memory_sensor, .get = get_memory_sensor, .label = label_memory_sensor, .clean = clean_memory_sensor},
{.init = init_utilization_sensor, .get = get_utilization_sensor, .label = label_utilization_sensor, .clean = clean_utilization_sensor},
{.init = init_power_sensor, .get = get_power_sensor, .label = label_power_sensor, .clean = clean_power_sensor},
{.init = init_temperature_sensor, .get = get_temperature_sensor, .label = label_temperature_sensor, .clean = clean_temperature_sensor},
};
// ------------------------DEVICE_FUNCTIONS
unsigned int init_device(unsigned int device_idx, Device *device)
{
nvmlReturn_t result;
nvmlDevice_t nvml_device;
if ((result = nvmlDeviceGetHandleByIndex(device_idx, &nvml_device)) != NVML_SUCCESS) {
fprintf(stderr, "Failed to get device handle for device %d: %s\n", device_idx, nvmlErrorString(result));
return 0;
}
if ((result = nvmlDeviceGetName(nvml_device, device->name, NVML_DEVICE_NAME_BUFFER_SIZE))) {
fprintf(stderr, "Failed to get device name for device %d: %s\n", device_idx, nvmlErrorString(result));
return 0;
}
device->device = nvml_device;
device->idx = device_idx;
unsigned int sensor_count = 0;
unsigned int total_count = 0;
for (unsigned int i = 0; i < COUNT_SENSOR; i++) {
Sensor *sensor = &device->sensors[sensor_count];
sensor->fun = &avaible_sensors[i];
unsigned int count;
if ((count = sensor->fun->init(device, &sensor->data)) != 0) {
sensor_count += 1;
total_count += count;
}
}
device->count = sensor_count;
return total_count;
}
unsigned int get_device(uint64_t *results, Device *device)
{
unsigned int count = 0;
for (unsigned int i = 0; i < device->count; i++) {
Sensor *sensor = &device->sensors[i];
unsigned int result = sensor->fun->get(results, device, sensor->data);
count += result;
results += result;
}
return count;
}
unsigned int label_device(char **labels, Device *device)
{
unsigned int count = 0;
for (unsigned int i = 0; i < device->count; i++) {
Sensor *sensor = &device->sensors[i];
unsigned int result = sensor->fun->label(labels, sensor->data);
labels += result;
count += result;
}
return count;
}
void clean_device(Device *device)
{
for (unsigned int i = 0; i < device->count; i++) {
Sensor *sensor = &device->sensors[i];
sensor->fun->clean(sensor->data);
}
}
// ------------------------NVIDIA_INTERFACE
unsigned int init_nvidia_gpu(char *none, void **ptr)
{
UNUSED(none);
UNUSED(ptr);
nvmlReturn_t result;
if ((result = nvmlInit()) != NVML_SUCCESS) {
fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
exit(1);
}
unsigned int avaible_device_count;
if ((result = nvmlDeviceGetCount(&avaible_device_count)) != NVML_SUCCESS) {
fprintf(stderr, "Failed to get device count : %s\n", nvmlErrorString(result));
nvmlShutdown();
exit(1);
}
Device *devices = calloc(avaible_device_count, sizeof(Device));
unsigned int sensor_count = 0;
unsigned int device_count = 0;
for (unsigned int i = 0; i < avaible_device_count; i++) {
unsigned int initialized_count;
if ((initialized_count = init_device(i, &devices[device_count])) != 0) {
sensor_count += initialized_count;
device_count += 1;
}
}
NvidiaGpu *nvidia = (NvidiaGpu *) calloc(1, sizeof(NvidiaGpu));
nvidia->devices = devices;
nvidia->count = device_count;
*ptr = (void *) nvidia;
return sensor_count;
}
unsigned int get_nvidia_gpu(uint64_t *results, void *ptr)
{
NvidiaGpu *nvidia = (NvidiaGpu *) ptr;
unsigned count = 0;
for (unsigned int i = 0; i < nvidia->count; i++) {
unsigned int result = get_device(results, &nvidia->devices[i]);
results += result;
count += result;
}
return count;
}
unsigned int label_nvidia_gpu(char **labels, void *ptr)
{
NvidiaGpu *nvidia = (NvidiaGpu *) ptr;
unsigned count = 0;
for (unsigned int i = 0; i < nvidia->count; i++) {
unsigned int result = label_device(labels, &nvidia->devices[i]);
labels += result;
count += result;
}
return count;
}
void clean_nvidia_gpu(void *ptr)
{
NvidiaGpu *nvidia = (NvidiaGpu *) ptr;
for (unsigned int i = 0; i < nvidia->count; i++) {
clean_device(&nvidia->devices[i]);
}
free(nvidia->devices);
free(nvidia);
nvmlShutdown();
}