Skip to content
Snippets Groups Projects
Commit 0f4f64ba authored by floreal.risso's avatar floreal.risso
Browse files

add nvidia sensor

parents 40e443b2 82224cf5
Branches
Tags
2 merge requests!9fix sensor example (doc),!5Add dev name to labels
...@@ -4,12 +4,15 @@ ...@@ -4,12 +4,15 @@
# Copyright (C) 2023-2023 Georges Da Costa <georges.da-costa@irit.fr> # Copyright (C) 2023-2023 Georges Da Costa <georges.da-costa@irit.fr>
try() { "$@" || die "cannot $*"; } try() { "$@" || die "cannot $*"; }
die() { yell "$*"; exit 111; } die() {
yell "$*"
exit 111
}
yell() { echo "$0: $*" >&2; } yell() { echo "$0: $*" >&2; }
echo() { printf '%s\n' "$*"; } echo() { printf '%s\n' "$*"; }
isnum() { isnum() {
case "${1#[+-]}" in case "${1#[+-]}" in
*[!0-9]*|'') return 1 ;; *[!0-9]* | '') return 1 ;;
*) return 0 ;; *) return 0 ;;
esac esac
} }
...@@ -86,10 +89,10 @@ gen_sensors_h() { ...@@ -86,10 +89,10 @@ gen_sensors_h() {
printf ' int opt_idx = offset;\n' printf ' int opt_idx = offset;\n'
for sensor in $sensors; do for sensor in $sensors; do
cat <<-! cat <<-!
for (int i = 0; i < ${sensor}.nb_opt; i++) { for (int i = 0; i < ${sensor}.nb_opt; i++) {
opts[opt_idx++] = ${sensor}_opt[i]; opts[opt_idx++] = ${sensor}_opt[i];
} }
sensors[(*nb_defined)++] = ${sensor}; sensors[(*nb_defined)++] = ${sensor};
! !
done done
printf ' assert((offset + *nb_defined) <= len);\n' printf ' assert((offset + *nb_defined) <= len);\n'
...@@ -120,6 +123,12 @@ detect_caps() { ...@@ -120,6 +123,12 @@ detect_caps() {
[ -e "/sys/class/net/$dev" ] && hdr_whitelist="${hdr_whitelist}|network" [ -e "/sys/class/net/$dev" ] && hdr_whitelist="${hdr_whitelist}|network"
fi fi
if [ -e /usr/local/cuda/lib64 ] && [ -e /usr/local/cuda/include ]; then
hdr_whitelist="${hdr_whitelist}|nvidia_gpu"
NVML_LDFLAGS="-L/usr/local/cuda/lib64 -lnvidia-ml"
NVML_IFLAGS="-I/usr/local/cuda/include"
fi
vendor=$(awk '/vendor_id/ {print $3; exit}' /proc/cpuinfo) vendor=$(awk '/vendor_id/ {print $3; exit}' /proc/cpuinfo)
vendor_lc=$(echo "$vendor" | tr 'A-Z' 'a-z') vendor_lc=$(echo "$vendor" | tr 'A-Z' 'a-z')
case $vendor_lc in case $vendor_lc in
...@@ -141,7 +150,7 @@ detect_caps() { ...@@ -141,7 +150,7 @@ detect_caps() {
} }
case $1 in case $1 in
--all|-a) --all | -a)
all=1 all=1
;; ;;
esac esac
...@@ -149,30 +158,33 @@ esac ...@@ -149,30 +158,33 @@ esac
[ "$all" ] || detect_caps [ "$all" ] || detect_caps
[ "$all" ] || [ "$all" ] ||
while [ "$1" ]; do while [ "$1" ]; do
case $1 in case $1 in
--include|-i) --include | -i)
shift; [ "$1" ] || usage shift
hdr_whitelist="${hdr_whitelist}|${1}" [ "$1" ] || usage
;; hdr_whitelist="${hdr_whitelist}|${1}"
--exclude|-e) ;;
shift; [ "$1" ] || usage --exclude | -e)
hdr_blacklist="${hdr_blacklist}|${1}" shift
;; [ "$1" ] || usage
--list-sensors|-l) hdr_blacklist="${hdr_blacklist}|${1}"
ls_sensors ;;
exit 0 --list-sensors | -l)
;; ls_sensors
--unique|-u) exit 0
shift; [ "$1" ] || usage ;;
hdr_whitelist=$1 --unique | -u)
;; shift
--help|-h) [ "$1" ] || usage
usage hdr_whitelist=$1
;; ;;
esac --help | -h)
shift usage
done ;;
esac
shift
done
sensors=$(ls_sensors) sensors=$(ls_sensors)
nb_sensors=$(echo "$sensors" | sed '/^$/d' | wc -l) nb_sensors=$(echo "$sensors" | sed '/^$/d' | wc -l)
...@@ -182,12 +194,14 @@ if [ "$nb_sensors" -eq 0 ]; then ...@@ -182,12 +194,14 @@ if [ "$nb_sensors" -eq 0 ]; then
exit 1 exit 1
fi fi
try gen_sensors_h "$sensors" "$nb_sensors" > "$target_hdr" try gen_sensors_h "$sensors" "$nb_sensors" >"$target_hdr"
try gen_sensors_mk "$sensors" > "$target_mk" try gen_sensors_mk "$sensors" >"$target_mk"
try printf "NVML_LDFLAGS = %s\n" "$NVML_LDFLAGS" >>"$target_mk"
try printf "NVML_IFLAGS = %s\n" "$NVML_IFLAGS" >>"$target_mk"
printf -- 'Run `make` to build `bin/mojitos`.\n' >&2 printf -- 'Run `make` to build `bin/mojitos`.\n' >&2
printf -- 'The resulting binary will have the %d following sensors:\n' "$nb_sensors" >&2 printf -- 'The resulting binary will have the %d following sensors:\n' "$nb_sensors" >&2
echo "$sensors" >&2 echo "$sensors" >&2
make clean >/dev/null make clean >/dev/null
# Nvidia Gpu
The `nvidia_gpu` sensor provides basic information about the gpu. Depending on
the driver version it is possible that not all sensors are supported, so an
error message will be written to `stderr` but the execution will continue.
For more information you can consult the [nvidia nvml api](https://docs.nvidia.com/deploy/index.html).
## [Clock](https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g805c0647be9996589fc5e3f6ff680c64)
All speeds are in Mhz.
|Output |Description |
|--------|-------------------------------|
|graphics|Graphics clock |
|sm |Streaming Multiprocessor clock |
|memory |Memory clock |
|video |Video encoder/decoder clock |
## [Memory](https://docs.nvidia.com/deploy/nvml-api/structnvmlMemory__t.html#structnvmlMemory__t)
All values are in bytes.
|Output |Description |
|--------|-------------------------------------|
|free |Unallocated device memory |
|used |Sum of Reserved and Allocated memory |
|total |Total physical device memory |
## [Utilization](https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html#structnvmlUtilization__t)
Utilization information for a device. Each sample period may be between 1
second and 1/6 second, depending on the product being queried.
All values are a percent of time over the past sample period.
|Output |Description |
|--------|---------------------|
|gpu | Usage of the GPU |
|memory | Usage of the Memory |
## [Power](https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7ef7dff0ff14238d08a19ad7fb23fc87)
Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
|Output |Description |
|--------|-------------------------|
|power | Power consumption in mW |
## [Temperature](https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g2650b526841fa38b8f293c2d509a1de0)
Temperature of the GPU.
|Output |Description |
|------------|----------------------------|
|temperature | Temperature of the GPU die |
...@@ -10,15 +10,17 @@ BIN = mojitos ...@@ -10,15 +10,17 @@ BIN = mojitos
PREFIX = /usr/local PREFIX = /usr/local
CC = gcc CC = gcc
CPPFLAGS = -std=gnu99 -Wall -Wextra -Wpedantic -Wno-unused-function -I./lib CPPFLAGS = -std=gnu99 -Wall -Wextra -Wpedantic -Wno-unused-function -I./lib $(NVML_IFLAGS)
CFLAGS = $(CPPFLAGS) -O3 -Werror CFLAGS = $(CPPFLAGS) -O3 -Werror
LDFLAGS = LDFLAGS = $(NVML_LDFLAGS)
ASTYLE = astyle --style=kr -xf -s4 -k3 -n -Z -Q ASTYLE = astyle --style=kr -xf -s4 -k3 -n -Z -Q
all: $(BIN) man all: $(BIN) man
CAPTOR_OBJ = CAPTOR_OBJ =
NVML_LDFLAGS =
NVML_IFLAGS =
include ./sensors.mk include ./sensors.mk
...@@ -34,7 +36,7 @@ options: ...@@ -34,7 +36,7 @@ options:
@echo OBJ: $(OBJ) @echo OBJ: $(OBJ)
$(BIN): $(BIN_DIR) $(OBJ) $(OBJ_DIR)/$(BIN).o $(BIN): $(BIN_DIR) $(OBJ) $(OBJ_DIR)/$(BIN).o
$(CC) $(LDFLAGS) -o $(BIN_DIR)/$(BIN) $(OBJ) $(OBJ_DIR)/$(BIN).o $(CC) -o $(BIN_DIR)/$(BIN) $(OBJ) $(OBJ_DIR)/$(BIN).o $(LDFLAGS)
$(OBJ): $(OBJ_DIR) $(OBJ): $(OBJ_DIR)
$(OBJ_DIR)/counters.o: $(SRC_DIR)/counters_option.h $(OBJ_DIR)/counters.o: $(SRC_DIR)/counters_option.h
......
/*******************************************************
Copyright (C) 2023-2023 Georges Da Costa <georges.da-costa@irit.fr>
This file is part of Mojitos.
Mojitos is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Mojitos is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with MojitO/S. If not, see <https://www.gnu.org/licenses/>.
*******************************************************/
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
// Pedantic throws a warning in the nvml library
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
#include <nvml.h>
#pragma GCC diagnostic pop
#include "util.h"
// -----------------------------SENSOR_KIND
typedef enum {
CLOCK_SENSOR = 0,
MEMORY_SENSOR = 1,
UTILIZATION_SENSOR = 2,
POWER_SENSOR = 3,
TEMPERATURE_SENSOR = 4,
COUNT_SENSOR = 5,
} SENSOR_KIND;
typedef struct Device Device;
typedef struct NvidiaGpu NvidiaGpu;
typedef struct ISensor ISensor;
typedef struct Sensor Sensor;
// -- Sensor interface
typedef unsigned int (Initializer) (const Device *, void **);
typedef unsigned int (Getter) (uint64_t *, const Device *, void *);
typedef unsigned int (Labeller) (char **, void *);
typedef void (Cleaner) (void *);
struct ISensor {
Initializer *init;
Getter *get;
Labeller *label;
Cleaner *clean;
};
// -- Sensor
struct Sensor {
void *data;
const ISensor *fun;
};
// -- Device: represents a gpu
struct Device {
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
nvmlDevice_t device;
unsigned int idx;
Sensor sensors[COUNT_SENSOR];
unsigned int count;
};
// -- NvidiaGpu: represents the devices
struct NvidiaGpu {
Device *devices;
unsigned int count;
};
// -- Label template
static const char *label_template = "gpu%u_%s_%s";
static const char *short_label_template = "gpu%u_%s";
// ----------------------------CLOCK_SENSOR
#define CLOCK_LABEL_SIZE 25
// -- All existing clocks
// -- SM : Streaming Multiprocessor
static const nvmlClockType_t clocks[NVML_CLOCK_COUNT] = {NVML_CLOCK_GRAPHICS, NVML_CLOCK_SM, NVML_CLOCK_MEM, NVML_CLOCK_VIDEO};
static const char *clock_names[NVML_CLOCK_COUNT] = {"graphics", "sm", "memory", "video"};
static const char *clock_base_name = "clk";
// -- Must contain the clocks compatible with the device
typedef struct {
nvmlClockType_t clocks[NVML_CLOCK_COUNT];
char labels[NVML_CLOCK_COUNT][CLOCK_LABEL_SIZE];
unsigned int count;
} ClockData;
unsigned int init_clock_sensor(const Device *device, void **data)
{
const nvmlDevice_t nvml_device = device->device;
const unsigned int device_idx = device->idx;
ClockData tmp = {0};
nvmlReturn_t err;
unsigned int clock;
// -- Test all clocks
for (unsigned int i = 0; i < NVML_CLOCK_COUNT; i++) {
if ((err = nvmlDeviceGetClockInfo(nvml_device, clocks[i], &clock)) == NVML_SUCCESS) {
snprintf(tmp.labels[tmp.count], CLOCK_LABEL_SIZE, label_template, device_idx, clock_base_name, clock_names[i]);
tmp.clocks[tmp.count] = clocks[i];
tmp.count += 1;
} else {
fprintf(stderr, "Failed to get %s clock : %s\n", clock_names[i], nvmlErrorString(err));
}
}
// -- No clock avaible
if (tmp.count == 0) {
return 0;
}
*data = calloc(1, sizeof(ClockData));
memcpy(*data, &tmp, sizeof (ClockData));
return tmp.count;
}
unsigned int get_clock_sensor(uint64_t *results, const Device *device, void *data)
{
const nvmlDevice_t nvml_device = device->device;
ClockData *clock_data = (ClockData *) data;
nvmlReturn_t err;
unsigned int clock;
for (unsigned int i = 0; i < clock_data->count; i++) {
nvmlClockType_t clock_type = clock_data->clocks[i];
if((err = nvmlDeviceGetClockInfo(nvml_device, clock_type, &clock)) != NVML_SUCCESS) {
fprintf(stderr, "Failed to get %s clock : %s\n", clock_names[clock_type], nvmlErrorString(err));
exit(99);
}
results[i] = clock;
}
return clock_data->count;
}
unsigned int label_clock_sensor(char **labels, void *data)
{
ClockData *clock_data = (ClockData *) data;
for (unsigned int i = 0; i < clock_data->count; i++) {
labels[i] = clock_data->labels[i];
}
return clock_data->count;
}
void clean_clock_sensor(void *data)
{
free(data);
}
// ---------------------------MEMORY_SENSOR
#define MEMORY_LABEL_SIZE 25
typedef enum {
FREE_MEMORY = 0U,
USED_MEMORY = 1U,
TOTAL_MEMORY = 2U,
COUNT_MEMORY = 3U,
} MemoryKind;
static const char *memory_names[COUNT_MEMORY] = {"free", "used", "total"};
static const char *memory_base_name = "mem";
typedef struct {
char labels[COUNT_MEMORY][MEMORY_LABEL_SIZE];
} MemoryData;
unsigned int init_memory_sensor(const Device *device, void **data)
{
const nvmlDevice_t nvml_device = device->device;
const unsigned int device_idx = device->idx;
nvmlMemory_t memory;
nvmlReturn_t err;
if ((err = nvmlDeviceGetMemoryInfo(nvml_device, &memory)) != NVML_SUCCESS) {
fprintf(stderr, "Failed to get device memory : %s\n", nvmlErrorString(err));
return 0;
}
MemoryData *memory_data = (MemoryData *) calloc(1, sizeof(MemoryData));
for (unsigned int i = 0; i < COUNT_MEMORY; i++) {
snprintf(memory_data->labels[i], MEMORY_LABEL_SIZE, label_template, device_idx, memory_base_name, memory_names[i]);
}
*data = (void *) memory_data;
return COUNT_MEMORY;
}
unsigned int get_memory_sensor(uint64_t *results, const Device *device, void *none)
{
UNUSED(none);
const nvmlDevice_t nvml_device = device->device;
nvmlMemory_t memory;
nvmlReturn_t err;
if ((err = nvmlDeviceGetMemoryInfo(nvml_device, &memory)) != NVML_SUCCESS) {
fprintf(stderr, "Failed to get device memory : %s\n", nvmlErrorString(err));
exit(99);
}
results[FREE_MEMORY] = memory.free;
results[USED_MEMORY] = memory.used;
results[TOTAL_MEMORY] = memory.total;
return COUNT_MEMORY;
}
unsigned int label_memory_sensor(char **labels, void *data)
{
MemoryData *memory_data = (MemoryData *) data;
for (unsigned int i = 0; i < COUNT_MEMORY; i++) {
labels[i] = memory_data->labels[i];
}
return COUNT_MEMORY;
}
void clean_memory_sensor(void *data)
{
free(data);
}
// ----------------------UTILIZATION_SENSOR
#define UTILIZATION_LABEL_SIZE 35
typedef enum {
GPU_UTILIZATION = 0U,
MEMORY_UTILIZATION = 1U,
COUNT_UTILIZATION = 2U,
} UtilizationKind;
typedef struct {
char labels[COUNT_UTILIZATION][UTILIZATION_LABEL_SIZE];
} UtilizationData;
static const char *utilization_names[COUNT_UTILIZATION] = {"gpu", "memory"};
static const char *utilization_base_name = "utilization";
unsigned int init_utilization_sensor(const Device *device, void **data)
{
const nvmlDevice_t nvml_device = device->device;
const unsigned int device_idx = device->idx;
nvmlReturn_t err;
nvmlUtilization_t utilization;
if ((err = nvmlDeviceGetUtilizationRates(nvml_device, &utilization)) != NVML_SUCCESS) {
fprintf(stderr, "Failed to get device utilization: %s\n", nvmlErrorString(err));
return 0;
}
UtilizationData *utilization_data = (UtilizationData *) calloc(1, sizeof(UtilizationData));
for (unsigned int i = 0; i < COUNT_UTILIZATION; i++) {
snprintf(utilization_data->labels[i], UTILIZATION_LABEL_SIZE, label_template, device_idx, utilization_base_name, utilization_names[i]);
}
*data = (void *) utilization_data;
return COUNT_UTILIZATION;
}
unsigned int get_utilization_sensor(uint64_t *results, const Device *device, void *none)
{
UNUSED(none);
const nvmlDevice_t nvml_device = device->device;
nvmlReturn_t err;
nvmlUtilization_t utilization;
if ((err = nvmlDeviceGetUtilizationRates(nvml_device, &utilization)) != NVML_SUCCESS) {
fprintf(stderr, "Failed to get device utilization: %s\n", nvmlErrorString(err));
exit(99);
}
results[GPU_UTILIZATION] = utilization.gpu;
results[MEMORY_UTILIZATION] = utilization.memory;
return COUNT_UTILIZATION;
}
unsigned int label_utilization_sensor(char **labels, void *data)
{
UtilizationData *utilization_data = (UtilizationData *) data;
for (unsigned int i = 0; i < COUNT_UTILIZATION; i++) {
labels[i] = utilization_data->labels[i];
}
return COUNT_UTILIZATION;
}
void clean_utilization_sensor(void *data)
{
free(data);
}
// ----------------------------POWER_SENSOR
#define POWER_LABEL_SIZE 25
#define COUNT_POWER 1
static const char *power_base_name = "power";
typedef struct {
char label[POWER_LABEL_SIZE];
} PowerData;
unsigned int init_power_sensor(const Device *device, void **data)
{
const nvmlDevice_t nvml_device = device->device;
const unsigned int device_idx = device->idx;
unsigned int power;
nvmlReturn_t err;
if ((err = nvmlDeviceGetPowerUsage(nvml_device, &power)) != NVML_SUCCESS) {
printf("Failed to get the device power consumption: %s\n", nvmlErrorString(err));
return 0;
}
PowerData *power_data = (PowerData *) calloc(1, sizeof(PowerData));
snprintf(power_data->label, POWER_LABEL_SIZE, short_label_template, device_idx, power_base_name);
*data = (void *) power_data;
return COUNT_POWER;
}
unsigned int get_power_sensor(uint64_t *results, const Device *device, void *none)
{
UNUSED(none);
const nvmlDevice_t nvml_device = device->device;
unsigned int power;
nvmlReturn_t err;
if ((err = nvmlDeviceGetPowerUsage(nvml_device, &power)) != NVML_SUCCESS) {
printf("Failed to get the device power consumption: %s\n", nvmlErrorString(err));
exit(99);
}
*results = power;
return COUNT_POWER;
}
unsigned int label_power_sensor(char **labels, void *data)
{
PowerData *power_data = (PowerData *) data;
*labels = power_data->label;
return COUNT_POWER;
}
void clean_power_sensor(void *data)
{
free(data);
}
// ----------------------TEMPERATURE_SENSOR
#define TEMPERATURE_LABEL_SIZE 35
#define COUNT_TEMPERATURE 1
static const char *temperature_base_name = "temperature";
typedef struct {
char label[TEMPERATURE_LABEL_SIZE];
} TemperatureData;
unsigned int init_temperature_sensor(const Device *device, void **data)
{
const nvmlDevice_t nvml_device = device->device;
const unsigned int device_idx = device->idx;
unsigned int temperature;
nvmlReturn_t err;
if ((err = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temperature)) != NVML_SUCCESS) {
printf("Failed to get the device temperature: %s\n", nvmlErrorString(err));
return 0;
}
TemperatureData *temperature_data = (TemperatureData *) calloc(1, sizeof(TemperatureData));
snprintf(temperature_data->label, TEMPERATURE_LABEL_SIZE, short_label_template, device_idx, temperature_base_name);
*data = (void *) temperature_data;
return COUNT_TEMPERATURE;
}
unsigned int get_temperature_sensor(uint64_t *results, const Device *device, void *none)
{
UNUSED(none);
const nvmlDevice_t nvml_device = device->device;
unsigned int temperature;
nvmlReturn_t err;
if ((err = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temperature)) != NVML_SUCCESS) {
printf("Failed to get the device temperature: %s\n", nvmlErrorString(err));
exit(99);
}
*results = temperature;
return COUNT_TEMPERATURE;
}
unsigned int label_temperature_sensor(char **labels, void *data)
{
TemperatureData *temperature_data = (TemperatureData *) data;
*labels = temperature_data->label;
return COUNT_TEMPERATURE;
}
void clean_temperature_sensor(void *data)
{
free(data);
}
// -------------------------AVAIBLE_SENSORS
static const ISensor avaible_sensors[COUNT_SENSOR] = {
{.init = init_clock_sensor, .get = get_clock_sensor, .label = label_clock_sensor, .clean = clean_clock_sensor},
{.init = init_memory_sensor, .get = get_memory_sensor, .label = label_memory_sensor, .clean = clean_memory_sensor},
{.init = init_utilization_sensor, .get = get_utilization_sensor, .label = label_utilization_sensor, .clean = clean_utilization_sensor},
{.init = init_power_sensor, .get = get_power_sensor, .label = label_power_sensor, .clean = clean_power_sensor},
{.init = init_temperature_sensor, .get = get_temperature_sensor, .label = label_temperature_sensor, .clean = clean_temperature_sensor},
};
// ------------------------DEVICE_FUNCTIONS
unsigned int init_device(unsigned int device_idx, Device *device)
{
nvmlReturn_t result;
nvmlDevice_t nvml_device;
if ((result = nvmlDeviceGetHandleByIndex(device_idx, &nvml_device)) != NVML_SUCCESS) {
fprintf(stderr, "Failed to get device handle for device %d: %s\n", device_idx, nvmlErrorString(result));
return 0;
}
if ((result = nvmlDeviceGetName(nvml_device, device->name, NVML_DEVICE_NAME_BUFFER_SIZE))) {
fprintf(stderr, "Failed to get device name for device %d: %s\n", device_idx, nvmlErrorString(result));
return 0;
}
device->device = nvml_device;
device->idx = device_idx;
unsigned int sensor_count = 0;
unsigned int total_count = 0;
for (unsigned int i = 0; i < COUNT_SENSOR; i++) {
Sensor *sensor = &device->sensors[sensor_count];
sensor->fun = &avaible_sensors[i];
unsigned int count;
if ((count = sensor->fun->init(device, &sensor->data)) != 0) {
sensor_count += 1;
total_count += count;
}
}
device->count = sensor_count;
return total_count;
}
unsigned int get_device(uint64_t *results, Device *device)
{
unsigned int count = 0;
for (unsigned int i = 0; i < device->count; i++) {
Sensor *sensor = &device->sensors[i];
unsigned int result = sensor->fun->get(results, device, sensor->data);
count += result;
results += result;
}
return count;
}
unsigned int label_device(char **labels, Device *device)
{
unsigned int count = 0;
for (unsigned int i = 0; i < device->count; i++) {
Sensor *sensor = &device->sensors[i];
unsigned int result = sensor->fun->label(labels, sensor->data);
labels += result;
count += result;
}
return count;
}
void clean_device(Device *device)
{
for (unsigned int i = 0; i < device->count; i++) {
Sensor *sensor = &device->sensors[i];
sensor->fun->clean(sensor->data);
}
}
// ------------------------NVIDIA_INTERFACE
unsigned int init_nvidia_gpu(char *none, void **ptr)
{
UNUSED(none);
UNUSED(ptr);
nvmlReturn_t result;
if ((result = nvmlInit()) != NVML_SUCCESS) {
fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
exit(1);
}
unsigned int avaible_device_count;
if ((result = nvmlDeviceGetCount(&avaible_device_count)) != NVML_SUCCESS) {
fprintf(stderr, "Failed to get device count : %s\n", nvmlErrorString(result));
nvmlShutdown();
exit(1);
}
Device *devices = calloc(avaible_device_count, sizeof(Device));
unsigned int sensor_count = 0;
unsigned int device_count = 0;
for (unsigned int i = 0; i < avaible_device_count; i++) {
unsigned int initialized_count;
if ((initialized_count = init_device(i, &devices[device_count])) != 0) {
sensor_count += initialized_count;
device_count += 1;
}
}
NvidiaGpu *nvidia = (NvidiaGpu *) calloc(1, sizeof(NvidiaGpu));
nvidia->devices = devices;
nvidia->count = device_count;
*ptr = (void *) nvidia;
return sensor_count;
}
unsigned int get_nvidia_gpu(uint64_t *results, void *ptr)
{
NvidiaGpu *nvidia = (NvidiaGpu *) ptr;
unsigned count = 0;
for (unsigned int i = 0; i < nvidia->count; i++) {
unsigned int result = get_device(results, &nvidia->devices[i]);
results += result;
count += result;
}
return count;
}
unsigned int label_nvidia_gpu(char **labels, void *ptr)
{
NvidiaGpu *nvidia = (NvidiaGpu *) ptr;
unsigned count = 0;
for (unsigned int i = 0; i < nvidia->count; i++) {
unsigned int result = label_device(labels, &nvidia->devices[i]);
labels += result;
count += result;
}
return count;
}
void clean_nvidia_gpu(void *ptr)
{
NvidiaGpu *nvidia = (NvidiaGpu *) ptr;
for (unsigned int i = 0; i < nvidia->count; i++) {
clean_device(&nvidia->devices[i]);
}
free(nvidia->devices);
free(nvidia);
nvmlShutdown();
}
/*******************************************************
Copyright (C) 2023-2023 Georges Da Costa <georges.da-costa@irit.fr>
This file is part of Mojitos.
Mojitos is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Mojitos is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with MojitO/S. If not, see <https://www.gnu.org/licenses/>.
*******************************************************/
unsigned int init_nvidia_gpu(char *, void **);
unsigned int get_nvidia_gpu(uint64_t *results, void *);
void clean_nvidia_gpu(void *);
void label_nvidia_gpu(char **labels, void *);
Sensor nvidia_gpu = {
.init = init_nvidia_gpu,
.get = get_nvidia_gpu,
.clean = clean_nvidia_gpu,
.label = label_nvidia_gpu,
.nb_opt = 1,
};
Optparse nvidia_gpu_opt[1] = {
{
.longname = "nvidia-gpu",
.shortname = 'n',
.argtype = OPTPARSE_NONE,
.usage_arg = NULL,
.usage_msg = "provides basic gpu information [clocks, memory, utilization, power, temperature].",
},
};
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment