Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
MojitOS
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
sepia-pub
MojitOS
Commits
1a5443e3
Commit
1a5443e3
authored
2 years ago
by
floreal.risso
Browse files
Options
Downloads
Patches
Plain Diff
temperature/power
parent
bfb4c16b
No related branches found
No related tags found
2 merge requests
!9
fix sensor example (doc)
,
!5
Add dev name to labels
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/nvidia_gpu.c
+137
-18
137 additions, 18 deletions
src/nvidia_gpu.c
with
137 additions
and
18 deletions
src/nvidia_gpu.c
+
137
−
18
View file @
1a5443e3
...
@@ -37,8 +37,10 @@ typedef enum {
...
@@ -37,8 +37,10 @@ typedef enum {
CLOCK_SENSOR
=
0
,
CLOCK_SENSOR
=
0
,
MEMORY_SENSOR
=
1
,
MEMORY_SENSOR
=
1
,
UTILIZATION_SENSOR
=
2
,
UTILIZATION_SENSOR
=
2
,
POWER_SENSOR
=
3
,
TEMPERATURE_SENSOR
=
4
,
COUNT_SENSOR
=
3
,
COUNT_SENSOR
=
5
,
}
SENSOR_KIND
;
}
SENSOR_KIND
;
typedef
struct
Device
Device
;
typedef
struct
Device
Device
;
...
@@ -83,6 +85,7 @@ struct NvidiaGpu {
...
@@ -83,6 +85,7 @@ struct NvidiaGpu {
// -- Label template
// -- Label template
static
const
char
*
label_template
=
"gpu%u_%s_%s"
;
static
const
char
*
label_template
=
"gpu%u_%s_%s"
;
static
const
char
*
short_label_template
=
"gpu%u_%s"
;
// ----------------------------CLOCK_SENSOR
// ----------------------------CLOCK_SENSOR
...
@@ -106,17 +109,17 @@ unsigned int init_clock_sensor(const Device *device, void **data)
...
@@ -106,17 +109,17 @@ unsigned int init_clock_sensor(const Device *device, void **data)
const
nvmlDevice_t
nvml_device
=
device
->
device
;
const
nvmlDevice_t
nvml_device
=
device
->
device
;
const
unsigned
int
device_idx
=
device
->
idx
;
const
unsigned
int
device_idx
=
device
->
idx
;
ClockData
tmp
=
{
0
};
ClockData
tmp
=
{
0
};
nvmlReturn_t
result
;
nvmlReturn_t
err
;
unsigned
int
clock
;
unsigned
int
clock
;
// -- Test all clocks
// -- Test all clocks
for
(
unsigned
int
i
=
0
;
i
<
NVML_CLOCK_COUNT
;
i
++
)
{
for
(
unsigned
int
i
=
0
;
i
<
NVML_CLOCK_COUNT
;
i
++
)
{
if
((
result
=
nvmlDeviceGetClockInfo
(
nvml_device
,
clocks
[
i
],
&
clock
))
==
NVML_SUCCESS
)
{
if
((
err
=
nvmlDeviceGetClockInfo
(
nvml_device
,
clocks
[
i
],
&
clock
))
==
NVML_SUCCESS
)
{
snprintf
(
tmp
.
labels
[
tmp
.
count
],
CLOCK_LABEL_SIZE
,
label_template
,
device_idx
,
clock_base_name
,
clock_names
[
i
]);
snprintf
(
tmp
.
labels
[
tmp
.
count
],
CLOCK_LABEL_SIZE
,
label_template
,
device_idx
,
clock_base_name
,
clock_names
[
i
]);
tmp
.
clocks
[
tmp
.
count
]
=
clocks
[
i
];
tmp
.
clocks
[
tmp
.
count
]
=
clocks
[
i
];
tmp
.
count
+=
1
;
tmp
.
count
+=
1
;
}
else
{
}
else
{
fprintf
(
stderr
,
"Failed to get %s clock : %s
\n
"
,
clock_names
[
i
],
nvmlErrorString
(
result
));
fprintf
(
stderr
,
"Failed to get %s clock : %s
\n
"
,
clock_names
[
i
],
nvmlErrorString
(
err
));
}
}
}
}
...
@@ -189,9 +192,9 @@ unsigned int init_memory_sensor(const Device *device, void **data)
...
@@ -189,9 +192,9 @@ unsigned int init_memory_sensor(const Device *device, void **data)
const
unsigned
int
device_idx
=
device
->
idx
;
const
unsigned
int
device_idx
=
device
->
idx
;
nvmlMemory_t
memory
;
nvmlMemory_t
memory
;
nvmlReturn_t
result
;
nvmlReturn_t
err
;
if
((
result
=
nvmlDeviceGetMemoryInfo
(
nvml_device
,
&
memory
))
!=
NVML_SUCCESS
)
{
if
((
err
=
nvmlDeviceGetMemoryInfo
(
nvml_device
,
&
memory
))
!=
NVML_SUCCESS
)
{
fprintf
(
stderr
,
"Failed to get device memory : %s
\n
"
,
nvmlErrorString
(
result
));
fprintf
(
stderr
,
"Failed to get device memory : %s
\n
"
,
nvmlErrorString
(
err
));
return
0
;
return
0
;
}
}
...
@@ -210,9 +213,9 @@ unsigned int get_memory_sensor(uint64_t *results, const Device *device, void *no
...
@@ -210,9 +213,9 @@ unsigned int get_memory_sensor(uint64_t *results, const Device *device, void *no
const
nvmlDevice_t
nvml_device
=
device
->
device
;
const
nvmlDevice_t
nvml_device
=
device
->
device
;
nvmlMemory_t
memory
;
nvmlMemory_t
memory
;
nvmlReturn_t
result
;
nvmlReturn_t
err
;
if
((
result
=
nvmlDeviceGetMemoryInfo
(
nvml_device
,
&
memory
))
!=
NVML_SUCCESS
)
{
if
((
err
=
nvmlDeviceGetMemoryInfo
(
nvml_device
,
&
memory
))
!=
NVML_SUCCESS
)
{
fprintf
(
stderr
,
"Failed to get device memory : %s
\n
"
,
nvmlErrorString
(
result
));
fprintf
(
stderr
,
"Failed to get device memory : %s
\n
"
,
nvmlErrorString
(
err
));
exit
(
99
);
exit
(
99
);
}
}
...
@@ -259,10 +262,10 @@ unsigned int init_utilization_sensor(const Device *device, void **data)
...
@@ -259,10 +262,10 @@ unsigned int init_utilization_sensor(const Device *device, void **data)
const
nvmlDevice_t
nvml_device
=
device
->
device
;
const
nvmlDevice_t
nvml_device
=
device
->
device
;
const
unsigned
int
device_idx
=
device
->
idx
;
const
unsigned
int
device_idx
=
device
->
idx
;
nvmlReturn_t
result
;
nvmlReturn_t
err
;
nvmlUtilization_t
utilization
;
nvmlUtilization_t
utilization
;
if
((
result
=
nvmlDeviceGetUtilizationRates
(
nvml_device
,
&
utilization
))
!=
NVML_SUCCESS
)
{
if
((
err
=
nvmlDeviceGetUtilizationRates
(
nvml_device
,
&
utilization
))
!=
NVML_SUCCESS
)
{
fprintf
(
stderr
,
"Failed to get device utilization: %s
\n
"
,
nvmlErrorString
(
result
));
fprintf
(
stderr
,
"Failed to get device utilization: %s
\n
"
,
nvmlErrorString
(
err
));
return
0
;
return
0
;
}
}
...
@@ -280,10 +283,10 @@ unsigned int get_utilization_sensor(uint64_t *results, const Device *device, voi
...
@@ -280,10 +283,10 @@ unsigned int get_utilization_sensor(uint64_t *results, const Device *device, voi
UNUSED
(
none
);
UNUSED
(
none
);
const
nvmlDevice_t
nvml_device
=
device
->
device
;
const
nvmlDevice_t
nvml_device
=
device
->
device
;
nvmlReturn_t
result
;
nvmlReturn_t
err
;
nvmlUtilization_t
utilization
;
nvmlUtilization_t
utilization
;
if
((
result
=
nvmlDeviceGetUtilizationRates
(
nvml_device
,
&
utilization
))
!=
NVML_SUCCESS
)
{
if
((
err
=
nvmlDeviceGetUtilizationRates
(
nvml_device
,
&
utilization
))
!=
NVML_SUCCESS
)
{
fprintf
(
stderr
,
"Failed to get device utilization: %s
\n
"
,
nvmlErrorString
(
result
));
fprintf
(
stderr
,
"Failed to get device utilization: %s
\n
"
,
nvmlErrorString
(
err
));
exit
(
99
);
exit
(
99
);
}
}
...
@@ -308,8 +311,122 @@ void clean_utilization_sensor(void *data)
...
@@ -308,8 +311,122 @@ void clean_utilization_sensor(void *data)
free
(
data
);
free
(
data
);
}
}
// ----------------------------ERROR_SENSOR
// ----------------------------POWER_SENSOR
// TODO
/* Capacity, in bytes, of the buffer holding the power sensor label. */
#define POWER_LABEL_SIZE 25
/* Number of values (and labels) the power sensor reports per device. */
#define COUNT_POWER 1

/* Sensor name inserted into the generated power label. */
static const char *power_base_name = "power";
/* Per-device private state of the power sensor: only the label text. */
typedef struct {
    char label[POWER_LABEL_SIZE]; /* NUL-terminated label written by snprintf */
} PowerData;
/**
 * Probe the power sensor of a device and prepare its label.
 *
 * A trial read is issued with nvmlDeviceGetPowerUsage(); the value itself is
 * discarded, only the status matters. On success a PowerData is allocated and
 * its label is formatted from short_label_template, the device index and
 * power_base_name.
 *
 * @param device  Device to probe; its NVML handle and index are read.
 * @param data    Out: receives the heap-allocated PowerData. Written only on
 *                success; the block is released with free()
 *                (see clean_power_sensor()).
 * @return COUNT_POWER on success, 0 when the trial read or the allocation
 *         fails (nothing is stored in *data in that case).
 */
unsigned int init_power_sensor(const Device *device, void **data)
{
    const nvmlDevice_t nvml_device = device->device;
    const unsigned int device_idx = device->idx;
    unsigned int power;
    nvmlReturn_t err;

    if ((err = nvmlDeviceGetPowerUsage(nvml_device, &power)) != NVML_SUCCESS) {
        /* Diagnostics go to stderr, like the other sensors in this file. */
        fprintf(stderr, "Failed to get the device power consumption: %s\n", nvmlErrorString(err));
        return 0;
    }

    PowerData *power_data = calloc(1, sizeof *power_data);
    if (power_data == NULL) {
        /* Without this check snprintf() below would write through NULL. */
        fprintf(stderr, "Failed to allocate the power sensor data\n");
        return 0;
    }

    snprintf(power_data->label, POWER_LABEL_SIZE, short_label_template, device_idx, power_base_name);

    *data = power_data;
    return COUNT_POWER;
}
/**
 * Read the current power usage of a device.
 *
 * @param results  Out: results[0] receives the value returned by
 *                 nvmlDeviceGetPowerUsage() (milliwatts according to the
 *                 NVML API reference).
 * @param device   Device to query; only its NVML handle is read.
 * @param none     Unused private data.
 * @return COUNT_POWER, the number of values written to results.
 *
 * A failed read is fatal: the error is reported and the process exits with
 * status 99.
 */
unsigned int get_power_sensor(uint64_t *results, const Device *device, void *none)
{
    UNUSED(none);
    const nvmlDevice_t nvml_device = device->device;
    unsigned int power;
    nvmlReturn_t err;

    if ((err = nvmlDeviceGetPowerUsage(nvml_device, &power)) != NVML_SUCCESS) {
        /* Diagnostics go to stderr, like the other sensors in this file. */
        fprintf(stderr, "Failed to get the device power consumption: %s\n", nvmlErrorString(err));
        exit(99);
    }

    results[0] = power;
    return COUNT_POWER;
}
/**
 * Expose the label held in the power sensor's private data.
 *
 * @param labels  Out: labels[0] is set to point at the label buffer inside
 *                data; the string is not copied.
 * @param data    PowerData pointer (as stored by init_power_sensor()).
 * @return COUNT_POWER, the number of labels written.
 */
unsigned int label_power_sensor(char **labels, void *data)
{
    PowerData *stored = data;

    labels[0] = stored->label;
    return COUNT_POWER;
}
/**
 * Release the power sensor's private data.
 *
 * @param data  Heap block to free; presumably the PowerData handed out by
 *              init_power_sensor(). NULL is accepted, as with free().
 */
void clean_power_sensor(void *data)
{
    free(data);
}
// ----------------------TEMPERATURE_SENSOR
/* Capacity, in bytes, of the buffer holding the temperature sensor label. */
#define TEMPERATURE_LABEL_SIZE 35
/* Number of values (and labels) the temperature sensor reports per device. */
#define COUNT_TEMPERATURE 1

/* Sensor name inserted into the generated temperature label. */
static const char *temperature_base_name = "temperature";
/* Per-device private state of the temperature sensor: only the label text. */
typedef struct {
    char label[TEMPERATURE_LABEL_SIZE]; /* NUL-terminated label written by snprintf */
} TemperatureData;
/**
 * Probe the temperature sensor of a device and prepare its label.
 *
 * A trial read of NVML_TEMPERATURE_GPU is issued with
 * nvmlDeviceGetTemperature(); the value itself is discarded, only the status
 * matters. On success a TemperatureData is allocated and its label is
 * formatted from short_label_template, the device index and
 * temperature_base_name.
 *
 * @param device  Device to probe; its NVML handle and index are read.
 * @param data    Out: receives the heap-allocated TemperatureData. Written
 *                only on success; the block is released with free()
 *                (see clean_temperature_sensor()).
 * @return COUNT_TEMPERATURE on success, 0 when the trial read or the
 *         allocation fails (nothing is stored in *data in that case).
 */
unsigned int init_temperature_sensor(const Device *device, void **data)
{
    const nvmlDevice_t nvml_device = device->device;
    const unsigned int device_idx = device->idx;
    unsigned int temperature;
    nvmlReturn_t err;

    if ((err = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temperature)) != NVML_SUCCESS) {
        /* Diagnostics go to stderr, like the other sensors in this file. */
        fprintf(stderr, "Failed to get the device temperature: %s\n", nvmlErrorString(err));
        return 0;
    }

    TemperatureData *temperature_data = calloc(1, sizeof *temperature_data);
    if (temperature_data == NULL) {
        /* Without this check snprintf() below would write through NULL. */
        fprintf(stderr, "Failed to allocate the temperature sensor data\n");
        return 0;
    }

    snprintf(temperature_data->label, TEMPERATURE_LABEL_SIZE, short_label_template, device_idx, temperature_base_name);

    *data = temperature_data;
    return COUNT_TEMPERATURE;
}
/**
 * Read the current GPU temperature of a device.
 *
 * @param results  Out: results[0] receives the NVML_TEMPERATURE_GPU reading
 *                 returned by nvmlDeviceGetTemperature() (degrees Celsius
 *                 according to the NVML API reference).
 * @param device   Device to query; only its NVML handle is read.
 * @param none     Unused private data.
 * @return COUNT_TEMPERATURE, the number of values written to results.
 *
 * A failed read is fatal: the error is reported and the process exits with
 * status 99.
 */
unsigned int get_temperature_sensor(uint64_t *results, const Device *device, void *none)
{
    UNUSED(none);
    const nvmlDevice_t nvml_device = device->device;
    unsigned int temperature;
    nvmlReturn_t err;

    if ((err = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temperature)) != NVML_SUCCESS) {
        /* Diagnostics go to stderr, like the other sensors in this file. */
        fprintf(stderr, "Failed to get the device temperature: %s\n", nvmlErrorString(err));
        exit(99);
    }

    results[0] = temperature;
    return COUNT_TEMPERATURE;
}
/**
 * Expose the label held in the temperature sensor's private data.
 *
 * @param labels  Out: labels[0] is set to point at the label buffer inside
 *                data; the string is not copied.
 * @param data    TemperatureData pointer (as stored by
 *                init_temperature_sensor()).
 * @return COUNT_TEMPERATURE, the number of labels written.
 */
unsigned int label_temperature_sensor(char **labels, void *data)
{
    TemperatureData *stored = data;

    labels[0] = stored->label;
    return COUNT_TEMPERATURE;
}
/**
 * Release the temperature sensor's private data.
 *
 * @param data  Heap block to free; presumably the TemperatureData handed out
 *              by init_temperature_sensor(). NULL is accepted, as with free().
 */
void clean_temperature_sensor(void *data)
{
    free(data);
}
// // Get the temperature
// result = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
// if (NVML_SUCCESS != result) {
// printf("Failed to get temperature for device %d: %s\n", i, nvmlErrorString(result));
// continue;
// }
// printf("\t - temperature: %u\n", temperature);
// ----------------------------------------
// ----------------------------------------
...
@@ -319,6 +436,8 @@ static const ISensor avaible_sensors[COUNT_SENSOR] = {
...
@@ -319,6 +436,8 @@ static const ISensor avaible_sensors[COUNT_SENSOR] = {
{.
init
=
init_clock_sensor
,
.
get
=
get_clock_sensor
,
.
label
=
label_clock_sensor
,
.
clean
=
clean_clock_sensor
},
{.
init
=
init_clock_sensor
,
.
get
=
get_clock_sensor
,
.
label
=
label_clock_sensor
,
.
clean
=
clean_clock_sensor
},
{.
init
=
init_memory_sensor
,
.
get
=
get_memory_sensor
,
.
label
=
label_memory_sensor
,
.
clean
=
clean_memory_sensor
},
{.
init
=
init_memory_sensor
,
.
get
=
get_memory_sensor
,
.
label
=
label_memory_sensor
,
.
clean
=
clean_memory_sensor
},
{.
init
=
init_utilization_sensor
,
.
get
=
get_utilization_sensor
,
.
label
=
label_utilization_sensor
,
.
clean
=
clean_utilization_sensor
},
{.
init
=
init_utilization_sensor
,
.
get
=
get_utilization_sensor
,
.
label
=
label_utilization_sensor
,
.
clean
=
clean_utilization_sensor
},
{.
init
=
init_power_sensor
,
.
get
=
get_power_sensor
,
.
label
=
label_power_sensor
,
.
clean
=
clean_power_sensor
},
{.
init
=
init_temperature_sensor
,
.
get
=
get_temperature_sensor
,
.
label
=
label_temperature_sensor
,
.
clean
=
clean_temperature_sensor
},
};
};
// ------------------------DEVICE_FUNCTIONS
// ------------------------DEVICE_FUNCTIONS
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment