diff --git a/configure.sh b/configure.sh index 203fd04a0e6663c096db02d5001d8192bf47f564..740e87a4b6daf33ec094a3441e048d696962057d 100755 --- a/configure.sh +++ b/configure.sh @@ -4,12 +4,15 @@ # Copyright (C) 2023-2023 Georges Da Costa <georges.da-costa@irit.fr> try() { "$@" || die "cannot $*"; } -die() { yell "$*"; exit 111; } +die() { + yell "$*" + exit 111 +} yell() { echo "$0: $*" >&2; } echo() { printf '%s\n' "$*"; } isnum() { case "${1#[+-]}" in - *[!0-9]*|'') return 1 ;; + *[!0-9]* | '') return 1 ;; *) return 0 ;; esac } @@ -86,10 +89,10 @@ gen_sensors_h() { printf ' int opt_idx = offset;\n' for sensor in $sensors; do cat <<-! - for (int i = 0; i < ${sensor}.nb_opt; i++) { - opts[opt_idx++] = ${sensor}_opt[i]; - } - sensors[(*nb_defined)++] = ${sensor}; + for (int i = 0; i < ${sensor}.nb_opt; i++) { + opts[opt_idx++] = ${sensor}_opt[i]; + } + sensors[(*nb_defined)++] = ${sensor}; ! done printf ' assert((offset + *nb_defined) <= len);\n' @@ -120,6 +123,12 @@ detect_caps() { [ -e "/sys/class/net/$dev" ] && hdr_whitelist="${hdr_whitelist}|network" fi + if [ -e /usr/local/cuda/lib64 ] && [ -e /usr/local/cuda/include ]; then + hdr_whitelist="${hdr_whitelist}|nvidia_gpu" + NVML_LDFLAGS="-L/usr/local/cuda/lib64 -lnvidia-ml" + NVML_IFLAGS="-I/usr/local/cuda/include" + fi + vendor=$(awk '/vendor_id/ {print $3; exit}' /proc/cpuinfo) vendor_lc=$(echo "$vendor" | tr 'A-Z' 'a-z') case $vendor_lc in @@ -141,7 +150,7 @@ detect_caps() { } case $1 in ---all|-a) +--all | -a) all=1 ;; esac @@ -149,30 +158,33 @@ esac [ "$all" ] || detect_caps [ "$all" ] || -while [ "$1" ]; do - case $1 in - --include|-i) - shift; [ "$1" ] || usage - hdr_whitelist="${hdr_whitelist}|${1}" - ;; - --exclude|-e) - shift; [ "$1" ] || usage - hdr_blacklist="${hdr_blacklist}|${1}" - ;; - --list-sensors|-l) - ls_sensors - exit 0 - ;; - --unique|-u) - shift; [ "$1" ] || usage - hdr_whitelist=$1 - ;; - --help|-h) - usage - ;; - esac - shift -done + while [ "$1" ]; do + case $1 in + --include | -i) + shift + [ "$1" ] || usage + hdr_whitelist="${hdr_whitelist}|${1}" + ;; + --exclude | -e) + shift + [ "$1" ] || usage + hdr_blacklist="${hdr_blacklist}|${1}" + ;; + --list-sensors | -l) + ls_sensors + exit 0 + ;; + --unique | -u) + shift + [ "$1" ] || usage + hdr_whitelist=$1 + ;; + --help | -h) + usage + ;; + esac + shift + done sensors=$(ls_sensors) nb_sensors=$(echo "$sensors" | sed '/^$/d' | wc -l) @@ -182,12 +194,14 @@ if [ "$nb_sensors" -eq 0 ]; then exit 1 fi -try gen_sensors_h "$sensors" "$nb_sensors" > "$target_hdr" -try gen_sensors_mk "$sensors" > "$target_mk" +try gen_sensors_h "$sensors" "$nb_sensors" >"$target_hdr" +try gen_sensors_mk "$sensors" >"$target_mk" + +try printf "NVML_LDFLAGS = %s\n" "$NVML_LDFLAGS" >>"$target_mk" +try printf "NVML_IFLAGS = %s\n" "$NVML_IFLAGS" >>"$target_mk" printf -- 'Run `make` to build `bin/mojitos`.\n' >&2 printf -- 'The resulting binary will have the %d following sensors:\n' "$nb_sensors" >&2 echo "$sensors" >&2 make clean >/dev/null - diff --git a/doc/nvidia_gpu.md b/doc/nvidia_gpu.md new file mode 100644 index 0000000000000000000000000000000000000000..15cc8aa073dfe6cf49de4e820061793d24b55331 --- /dev/null +++ b/doc/nvidia_gpu.md @@ -0,0 +1,57 @@ +# Nvidia Gpu + +The `nvidia_gpu` sensor provides basic information about the gpu. Depending on +the driver version it is possible that not all sensors are supported, so an +error message will be written to `stderr` but the execution will continue. + +For more information you can consult the [nvidia nvml api](https://docs.nvidia.com/deploy/index.html). + +## [Clock](https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g805c0647be9996589fc5e3f6ff680c64) + +All speeds are in Mhz. + +|Output |Description | +|--------|-------------------------------| +|graphics|Graphics clock | +|sm |Streaming Multiprocessor clock | +|memory |Memory clock | +|video |Video encoder/decoder clock | + +## [Memory](https://docs.nvidia.com/deploy/nvml-api/structnvmlMemory__t.html#structnvmlMemory__t) + +All values are in bytes. + +|Output |Description | +|--------|-------------------------------------| +|free |Unallocated device memory | +|used |Sum of Reserved and Allocated memory | +|total |Total physical device memory | + + +## [Utilization](https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html#structnvmlUtilization__t) + +Utilization information for a device. Each sample period may be between 1 +second and 1/6 second, depending on the product being queried. + +All values are a percent of time over the past sample period. + +|Output |Description | +|--------|---------------------| +|gpu | Usage of the GPU | +|memory | Usage of the Memory | + +## [Power](https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7ef7dff0ff14238d08a19ad7fb23fc87) + +Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) + +|Output |Description | +|--------|-------------------------| +|power | Power consumption in mW | + +## [Temperature](https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1g2650b526841fa38b8f293c2d509a1de0) +Temperature of the GPU. + +|Output |Description | +|------------|----------------------------| +|temperature | Temperature of the GPU die | + diff --git a/makefile b/makefile index bd2630800992e58989b38ad0ee8464a4ed838095..f25d5630f9f65be71fb7220785bc97a30ea6030a 100644 --- a/makefile +++ b/makefile @@ -10,15 +10,17 @@ BIN = mojitos PREFIX = /usr/local CC = gcc -CPPFLAGS = -std=gnu99 -Wall -Wextra -Wpedantic -Wno-unused-function -I./lib +CPPFLAGS = -std=gnu99 -Wall -Wextra -Wpedantic -Wno-unused-function -I./lib $(NVML_IFLAGS) CFLAGS = $(CPPFLAGS) -O3 -Werror -LDFLAGS = +LDFLAGS = $(NVML_LDFLAGS) ASTYLE = astyle --style=kr -xf -s4 -k3 -n -Z -Q all: $(BIN) man CAPTOR_OBJ = +NVML_LDFLAGS = +NVML_IFLAGS = include ./sensors.mk @@ -34,7 +36,7 @@ options: @echo OBJ: $(OBJ) $(BIN): $(BIN_DIR) $(OBJ) $(OBJ_DIR)/$(BIN).o - $(CC) $(LDFLAGS) -o $(BIN_DIR)/$(BIN) $(OBJ) $(OBJ_DIR)/$(BIN).o + $(CC) -o $(BIN_DIR)/$(BIN) $(OBJ) $(OBJ_DIR)/$(BIN).o $(LDFLAGS) $(OBJ): $(OBJ_DIR) $(OBJ_DIR)/counters.o: $(SRC_DIR)/counters_option.h diff --git a/src/nvidia_gpu.c b/src/nvidia_gpu.c new file mode 100644 index 0000000000000000000000000000000000000000..407800271a09ab0546017018f78f8fed999602bd --- /dev/null +++ b/src/nvidia_gpu.c @@ -0,0 +1,593 @@ +/******************************************************* + Copyright (C) 2023-2023 Georges Da Costa <georges.da-costa@irit.fr> + + This file is part of Mojitos. + + Mojitos is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Mojitos is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with MojitO/S. If not, see <https://www.gnu.org/licenses/>. + + *******************************************************/ + +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +// Pedantic throws a warning in the nvml library +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#include <nvml.h> +#pragma GCC diagnostic pop + +#include "util.h" + +// -----------------------------SENSOR_KIND +typedef enum { + CLOCK_SENSOR = 0, + MEMORY_SENSOR = 1, + UTILIZATION_SENSOR = 2, + POWER_SENSOR = 3, + TEMPERATURE_SENSOR = 4, + + COUNT_SENSOR = 5, +} SENSOR_KIND; + +typedef struct Device Device; +typedef struct NvidiaGpu NvidiaGpu; +typedef struct ISensor ISensor; +typedef struct Sensor Sensor; + +// -- Sensor interface +typedef unsigned int (Initializer) (const Device *, void **); +typedef unsigned int (Getter) (uint64_t *, const Device *, void *); +typedef unsigned int (Labeller) (char **, void *); +typedef void (Cleaner) (void *); + +struct ISensor { + Initializer *init; + Getter *get; + Labeller *label; + Cleaner *clean; +}; + +// -- Sensor +struct Sensor { + void *data; + const ISensor *fun; +}; + +// -- Device: represents a gpu +struct Device { + char name[NVML_DEVICE_NAME_BUFFER_SIZE]; + nvmlDevice_t device; + unsigned int idx; + + Sensor sensors[COUNT_SENSOR]; + unsigned int count; +}; + +// -- NvidiaGpu: represents the devices +struct NvidiaGpu { + Device *devices; + unsigned int count; +}; + +// -- Label template +static const char *label_template = "gpu%u_%s_%s"; +static const char *short_label_template = "gpu%u_%s"; + +// ----------------------------CLOCK_SENSOR + +#define CLOCK_LABEL_SIZE 25 + +// -- All existing clocks +// -- SM : Streaming Multiprocessor +static const nvmlClockType_t clocks[NVML_CLOCK_COUNT] = {NVML_CLOCK_GRAPHICS, NVML_CLOCK_SM, NVML_CLOCK_MEM, NVML_CLOCK_VIDEO}; +static const char *clock_names[NVML_CLOCK_COUNT] = {"graphics", "sm", "memory", "video"}; +static const char *clock_base_name = "clk"; + +// -- Must contain the clocks compatible with the device +typedef struct { + nvmlClockType_t clocks[NVML_CLOCK_COUNT]; + char labels[NVML_CLOCK_COUNT][CLOCK_LABEL_SIZE]; + unsigned int count; +} ClockData; + +unsigned int init_clock_sensor(const Device *device, void **data) +{ + const nvmlDevice_t nvml_device = device->device; + const unsigned int device_idx = device->idx; + ClockData tmp = {0}; + nvmlReturn_t err; + unsigned int clock; + + // -- Test all clocks + for (unsigned int i = 0; i < NVML_CLOCK_COUNT; i++) { + if ((err = nvmlDeviceGetClockInfo(nvml_device, clocks[i], &clock)) == NVML_SUCCESS) { + snprintf(tmp.labels[tmp.count], CLOCK_LABEL_SIZE, label_template, device_idx, clock_base_name, clock_names[i]); + tmp.clocks[tmp.count] = clocks[i]; + tmp.count += 1; + } else { + fprintf(stderr, "Failed to get %s clock : %s\n", clock_names[i], nvmlErrorString(err)); + } + } + + // -- No clock avaible + if (tmp.count == 0) { + return 0; + } + + *data = calloc(1, sizeof(ClockData)); + memcpy(*data, &tmp, sizeof (ClockData)); + return tmp.count; +} + +unsigned int get_clock_sensor(uint64_t *results, const Device *device, void *data) +{ + const nvmlDevice_t nvml_device = device->device; + ClockData *clock_data = (ClockData *) data; + nvmlReturn_t err; + unsigned int clock; + + for (unsigned int i = 0; i < clock_data->count; i++) { + nvmlClockType_t clock_type = clock_data->clocks[i]; + + if((err = nvmlDeviceGetClockInfo(nvml_device, clock_type, &clock)) != NVML_SUCCESS) { + fprintf(stderr, "Failed to get %s clock : %s\n", clock_names[clock_type], nvmlErrorString(err)); + exit(99); + } + results[i] = clock; + } + return clock_data->count; +} + +unsigned int label_clock_sensor(char **labels, void *data) +{ + ClockData *clock_data = (ClockData *) data; + + for (unsigned int i = 0; i < clock_data->count; i++) { + labels[i] = clock_data->labels[i]; + } + + return clock_data->count; +} + +void clean_clock_sensor(void *data) +{ + free(data); +} + +// ---------------------------MEMORY_SENSOR +#define MEMORY_LABEL_SIZE 25 + +typedef enum { + FREE_MEMORY = 0U, + USED_MEMORY = 1U, + TOTAL_MEMORY = 2U, + + COUNT_MEMORY = 3U, +} MemoryKind; + +static const char *memory_names[COUNT_MEMORY] = {"free", "used", "total"}; +static const char *memory_base_name = "mem"; + +typedef struct { + char labels[COUNT_MEMORY][MEMORY_LABEL_SIZE]; +} MemoryData; + +unsigned int init_memory_sensor(const Device *device, void **data) +{ + const nvmlDevice_t nvml_device = device->device; + const unsigned int device_idx = device->idx; + + nvmlMemory_t memory; + nvmlReturn_t err; + if ((err = nvmlDeviceGetMemoryInfo(nvml_device, &memory)) != NVML_SUCCESS) { + fprintf(stderr, "Failed to get device memory : %s\n", nvmlErrorString(err)); + return 0; + } + + MemoryData *memory_data = (MemoryData *) calloc(1, sizeof(MemoryData)); + for (unsigned int i = 0; i < COUNT_MEMORY; i++) { + snprintf(memory_data->labels[i], MEMORY_LABEL_SIZE, label_template, device_idx, memory_base_name, memory_names[i]); + } + + *data = (void *) memory_data; + return COUNT_MEMORY; +} + +unsigned int get_memory_sensor(uint64_t *results, const Device *device, void *none) +{ + UNUSED(none); + const nvmlDevice_t nvml_device = device->device; + + nvmlMemory_t memory; + nvmlReturn_t err; + if ((err = nvmlDeviceGetMemoryInfo(nvml_device, &memory)) != NVML_SUCCESS) { + fprintf(stderr, "Failed to get device memory : %s\n", nvmlErrorString(err)); + exit(99); + } + + results[FREE_MEMORY] = memory.free; + results[USED_MEMORY] = memory.used; + results[TOTAL_MEMORY] = memory.total; + return COUNT_MEMORY; +} + + +unsigned int label_memory_sensor(char **labels, void *data) +{ + MemoryData *memory_data = (MemoryData *) data; + + for (unsigned int i = 0; i < COUNT_MEMORY; i++) { + labels[i] = memory_data->labels[i]; + } + + return COUNT_MEMORY; +} +void clean_memory_sensor(void *data) +{ + free(data); +} + +// ----------------------UTILIZATION_SENSOR +#define UTILIZATION_LABEL_SIZE 35 +typedef enum { + GPU_UTILIZATION = 0U, + MEMORY_UTILIZATION = 1U, + + COUNT_UTILIZATION = 2U, +} UtilizationKind; + +typedef struct { + char labels[COUNT_UTILIZATION][UTILIZATION_LABEL_SIZE]; +} UtilizationData; + +static const char *utilization_names[COUNT_UTILIZATION] = {"gpu", "memory"}; +static const char *utilization_base_name = "utilization"; + +unsigned int init_utilization_sensor(const Device *device, void **data) +{ + const nvmlDevice_t nvml_device = device->device; + const unsigned int device_idx = device->idx; + + nvmlReturn_t err; + nvmlUtilization_t utilization; + if ((err = nvmlDeviceGetUtilizationRates(nvml_device, &utilization)) != NVML_SUCCESS) { + fprintf(stderr, "Failed to get device utilization: %s\n", nvmlErrorString(err)); + return 0; + } + + UtilizationData *utilization_data = (UtilizationData *) calloc(1, sizeof(UtilizationData)); + for (unsigned int i = 0; i < COUNT_UTILIZATION; i++) { + snprintf(utilization_data->labels[i], UTILIZATION_LABEL_SIZE, label_template, device_idx, utilization_base_name, utilization_names[i]); + } + + *data = (void *) utilization_data; + return COUNT_UTILIZATION; +} + +unsigned int get_utilization_sensor(uint64_t *results, const Device *device, void *none) +{ + UNUSED(none); + const nvmlDevice_t nvml_device = device->device; + + nvmlReturn_t err; + nvmlUtilization_t utilization; + if ((err = nvmlDeviceGetUtilizationRates(nvml_device, &utilization)) != NVML_SUCCESS) { + fprintf(stderr, "Failed to get device utilization: %s\n", nvmlErrorString(err)); + exit(99); + } + + results[GPU_UTILIZATION] = utilization.gpu; + results[MEMORY_UTILIZATION] = utilization.memory; + return COUNT_UTILIZATION; +} + +unsigned int label_utilization_sensor(char **labels, void *data) +{ + UtilizationData *utilization_data = (UtilizationData *) data; + + for (unsigned int i = 0; i < COUNT_UTILIZATION; i++) { + labels[i] = utilization_data->labels[i]; + } + + return COUNT_UTILIZATION; +} + +void clean_utilization_sensor(void *data) +{ + free(data); +} + +// ----------------------------POWER_SENSOR + +#define POWER_LABEL_SIZE 25 +#define COUNT_POWER 1 + +static const char *power_base_name = "power"; + +typedef struct { + char label[POWER_LABEL_SIZE]; +} PowerData; + + +unsigned int init_power_sensor(const Device *device, void **data) +{ + const nvmlDevice_t nvml_device = device->device; + const unsigned int device_idx = device->idx; + + unsigned int power; + nvmlReturn_t err; + if ((err = nvmlDeviceGetPowerUsage(nvml_device, &power)) != NVML_SUCCESS) { + printf("Failed to get the device power consumption: %s\n", nvmlErrorString(err)); + return 0; + } + + PowerData *power_data = (PowerData *) calloc(1, sizeof(PowerData)); + snprintf(power_data->label, POWER_LABEL_SIZE, short_label_template, device_idx, power_base_name); + + *data = (void *) power_data; + return COUNT_POWER; +} + +unsigned int get_power_sensor(uint64_t *results, const Device *device, void *none) +{ + UNUSED(none); + const nvmlDevice_t nvml_device = device->device; + + unsigned int power; + nvmlReturn_t err; + if ((err = nvmlDeviceGetPowerUsage(nvml_device, &power)) != NVML_SUCCESS) { + printf("Failed to get the device power consumption: %s\n", nvmlErrorString(err)); + exit(99); + } + + *results = power; + return COUNT_POWER; +} + +unsigned int label_power_sensor(char **labels, void *data) +{ + PowerData *power_data = (PowerData *) data; + *labels = power_data->label; + return COUNT_POWER; +} + +void clean_power_sensor(void *data) +{ + free(data); +} + +// ----------------------TEMPERATURE_SENSOR + + +#define TEMPERATURE_LABEL_SIZE 35 +#define COUNT_TEMPERATURE 1 + +static const char *temperature_base_name = "temperature"; + +typedef struct { + char label[TEMPERATURE_LABEL_SIZE]; +} TemperatureData; + +unsigned int init_temperature_sensor(const Device *device, void **data) +{ + const nvmlDevice_t nvml_device = device->device; + const unsigned int device_idx = device->idx; + + unsigned int temperature; + nvmlReturn_t err; + if ((err = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temperature)) != NVML_SUCCESS) { + printf("Failed to get the device temperature: %s\n", nvmlErrorString(err)); + return 0; + } + + TemperatureData *temperature_data = (TemperatureData *) calloc(1, sizeof(TemperatureData)); + snprintf(temperature_data->label, TEMPERATURE_LABEL_SIZE, short_label_template, device_idx, temperature_base_name); + + *data = (void *) temperature_data; + return COUNT_TEMPERATURE; +} + +unsigned int get_temperature_sensor(uint64_t *results, const Device *device, void *none) +{ + UNUSED(none); + const nvmlDevice_t nvml_device = device->device; + + unsigned int temperature; + nvmlReturn_t err; + if ((err = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temperature)) != NVML_SUCCESS) { + printf("Failed to get the device temperature: %s\n", nvmlErrorString(err)); + exit(99); + } + + *results = temperature; + return COUNT_TEMPERATURE; +} + +unsigned int label_temperature_sensor(char **labels, void *data) +{ + TemperatureData *temperature_data = (TemperatureData *) data; + *labels = temperature_data->label; + return COUNT_TEMPERATURE; +} + +void clean_temperature_sensor(void *data) +{ + free(data); +} + +// -------------------------AVAIBLE_SENSORS +static const ISensor avaible_sensors[COUNT_SENSOR] = { + {.init = init_clock_sensor, .get = get_clock_sensor, .label = label_clock_sensor, .clean = clean_clock_sensor}, + {.init = init_memory_sensor, .get = get_memory_sensor, .label = label_memory_sensor, .clean = clean_memory_sensor}, + {.init = init_utilization_sensor, .get = get_utilization_sensor, .label = label_utilization_sensor, .clean = clean_utilization_sensor}, + {.init = init_power_sensor, .get = get_power_sensor, .label = label_power_sensor, .clean = clean_power_sensor}, + {.init = init_temperature_sensor, .get = get_temperature_sensor, .label = label_temperature_sensor, .clean = clean_temperature_sensor}, +}; + +// ------------------------DEVICE_FUNCTIONS + +unsigned int init_device(unsigned int device_idx, Device *device) +{ + nvmlReturn_t result; + nvmlDevice_t nvml_device; + if ((result = nvmlDeviceGetHandleByIndex(device_idx, &nvml_device)) != NVML_SUCCESS) { + fprintf(stderr, "Failed to get device handle for device %d: %s\n", device_idx, nvmlErrorString(result)); + return 0; + } + + if ((result = nvmlDeviceGetName(nvml_device, device->name, NVML_DEVICE_NAME_BUFFER_SIZE))) { + fprintf(stderr, "Failed to get device name for device %d: %s\n", device_idx, nvmlErrorString(result)); + return 0; + } + + device->device = nvml_device; + device->idx = device_idx; + + unsigned int sensor_count = 0; + unsigned int total_count = 0; + + for (unsigned int i = 0; i < COUNT_SENSOR; i++) { + Sensor *sensor = &device->sensors[sensor_count]; + sensor->fun = &avaible_sensors[i]; + unsigned int count; + + if ((count = sensor->fun->init(device, &sensor->data)) != 0) { + sensor_count += 1; + total_count += count; + } + } + + device->count = sensor_count; + return total_count; +} + +unsigned int get_device(uint64_t *results, Device *device) +{ + unsigned int count = 0; + for (unsigned int i = 0; i < device->count; i++) { + Sensor *sensor = &device->sensors[i]; + unsigned int result = sensor->fun->get(results, device, sensor->data); + count += result; + results += result; + } + + return count; +} + +unsigned int label_device(char **labels, Device *device) +{ + unsigned int count = 0; + for (unsigned int i = 0; i < device->count; i++) { + Sensor *sensor = &device->sensors[i]; + unsigned int result = sensor->fun->label(labels, sensor->data); + labels += result; + count += result; + } + + return count; +} + +void clean_device(Device *device) +{ + for (unsigned int i = 0; i < device->count; i++) { + Sensor *sensor = &device->sensors[i]; + sensor->fun->clean(sensor->data); + } +} + + +// ------------------------NVIDIA_INTERFACE + +unsigned int init_nvidia_gpu(char *none, void **ptr) +{ + UNUSED(none); + UNUSED(ptr); + + nvmlReturn_t result; + if ((result = nvmlInit()) != NVML_SUCCESS) { + fprintf(stderr, "Failed to initialize NVML: %s\n", nvmlErrorString(result)); + exit(1); + } + + unsigned int avaible_device_count; + if ((result = nvmlDeviceGetCount(&avaible_device_count)) != NVML_SUCCESS) { + fprintf(stderr, "Failed to get device count : %s\n", nvmlErrorString(result)); + nvmlShutdown(); + exit(1); + } + + Device *devices = calloc(avaible_device_count, sizeof(Device)); + + unsigned int sensor_count = 0; + unsigned int device_count = 0; + for (unsigned int i = 0; i < avaible_device_count; i++) { + unsigned int initialized_count; + if ((initialized_count = init_device(i, &devices[device_count])) != 0) { + sensor_count += initialized_count; + device_count += 1; + } + } + + NvidiaGpu *nvidia = (NvidiaGpu *) calloc(1, sizeof(NvidiaGpu)); + nvidia->devices = devices; + nvidia->count = device_count; + + *ptr = (void *) nvidia; + return sensor_count; +} + + +unsigned int get_nvidia_gpu(uint64_t *results, void *ptr) +{ + NvidiaGpu *nvidia = (NvidiaGpu *) ptr; + unsigned count = 0; + + for (unsigned int i = 0; i < nvidia->count; i++) { + unsigned int result = get_device(results, &nvidia->devices[i]); + results += result; + count += result; + } + + return count; +} + +unsigned int label_nvidia_gpu(char **labels, void *ptr) +{ + NvidiaGpu *nvidia = (NvidiaGpu *) ptr; + unsigned count = 0; + + for (unsigned int i = 0; i < nvidia->count; i++) { + unsigned int result = label_device(labels, &nvidia->devices[i]); + labels += result; + count += result; + } + + return count; +} + +void clean_nvidia_gpu(void *ptr) +{ + NvidiaGpu *nvidia = (NvidiaGpu *) ptr; + + for (unsigned int i = 0; i < nvidia->count; i++) { + clean_device(&nvidia->devices[i]); + } + + free(nvidia->devices); + free(nvidia); + nvmlShutdown(); +} + diff --git a/src/nvidia_gpu.h b/src/nvidia_gpu.h new file mode 100644 index 0000000000000000000000000000000000000000..1f3d053d0e4513ee707d7e4658e8a0a7774fe54c --- /dev/null +++ b/src/nvidia_gpu.h @@ -0,0 +1,43 @@ +/******************************************************* + Copyright (C) 2023-2023 Georges Da Costa <georges.da-costa@irit.fr> + + This file is part of Mojitos. + + Mojitos is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Mojitos is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with MojitO/S. If not, see <https://www.gnu.org/licenses/>. + + *******************************************************/ + +unsigned int init_nvidia_gpu(char *, void **); +unsigned int get_nvidia_gpu(uint64_t *results, void *); +void clean_nvidia_gpu(void *); +void label_nvidia_gpu(char **labels, void *); + + +Sensor nvidia_gpu = { + .init = init_nvidia_gpu, + .get = get_nvidia_gpu, + .clean = clean_nvidia_gpu, + .label = label_nvidia_gpu, + .nb_opt = 1, +}; + +Optparse nvidia_gpu_opt[1] = { + { + .longname = "nvidia-gpu", + .shortname = 'n', + .argtype = OPTPARSE_NONE, + .usage_arg = NULL, + .usage_msg = "provides basic gpu information [clocks, memory, utilization, power, temperature].", + }, +};