Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
Batmen Tools
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
sepia-pub
mael
Batmen Tools
Commits
221240c1
Commit
221240c1
authored
2 years ago
by
Maël Madon
Browse files
Options
Downloads
Patches
Plain Diff
progress on distance tool. should maybe add a 'similarity' test as well
parent
c12f9c44
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
distance_batsim_output.py
+112
-5
112 additions, 5 deletions
distance_batsim_output.py
with
112 additions
and
5 deletions
distance_batsim_output.py
100644 → 100755
+
112
−
5
View file @
221240c1
...
...
@@ -3,24 +3,131 @@
"""
Compute a set of distances between two batsim outputs (_jobs.csv)
"""
import
pandas
as
pd
import
numpy
as
np
import
argparse
import
json
import
warnings
def clean_and_select(df):
    """
    Select only the desired columns from the dataframe and clean the job_ids.

    Args:
        df: dataframe holding at least the columns ``job_id``,
            ``submission_time``, ``starting_time``, ``finish_time`` and
            ``success``.

    Returns:
        A new dataframe restricted to those columns, with ``job_id``
        converted to int, sorted by ``job_id`` and re-indexed from 0.
        ``df`` itself is left untouched.

    Raises:
        KeyError: if one of the desired columns is missing from ``df``.
        ValueError: if a cleaned job_id cannot be converted to an integer.
    """
    desired_cols = [
        "job_id",
        "submission_time",
        "starting_time",
        "finish_time",
        "success",
    ]
    # Explicit copy: the job_id column is rewritten below, and that must
    # neither touch the caller's dataframe nor trigger a chained-assignment
    # warning.
    select = df.loc[:, desired_cols].copy()

    # Clean job_id (remove the sessions, if present): keep only what
    # precedes the first ':'.
    select["job_id"] = (
        select["job_id"].astype(str).str.split(":", n=1).str[0].astype(int)
    )

    # Re-index after sorting so that two cleaned outputs listing the same
    # jobs in a different order end up with identical indexes; index-aware
    # operations such as Series.equals or boolean masks then line up.
    return select.sort_values(by="job_id").reset_index(drop=True)
def open_and_compare(file1, file2):
    """
    Open file1 and file2, two _jobs.csv files. Checks if the job_id columns
    match and return their cleaned pandas Dataframe representation.

    Only the jobs that were successful (``success == 1``) in both files are
    kept in the returned dataframes.

    Args:
        file1: path of the first _jobs.csv file.
        file2: path of the second _jobs.csv file.

    Returns:
        A tuple ``(out1, out2)`` of cleaned dataframes with the same jobs,
        in the same order.

    Raises:
        KeyError: if the two files do not contain the same job_ids.
    """
    cleaned = []
    for path in (file1, file2):
        jobs = clean_and_select(pd.read_csv(path))
        # Sort and re-index here as well: Series.equals and the boolean
        # mask below are index-sensitive, so both frames must share the
        # same 0..n-1 index whatever the row order in the CSV files.
        cleaned.append(jobs.sort_values(by="job_id").reset_index(drop=True))
    out1, out2 = cleaned

    if not out1.job_id.equals(out2.job_id):
        raise KeyError(
            f"{file1} and {file2} cannot be compared: they don't have the same job_ids"
        )

    if not out1.success.equals(out2.success):
        warnings.warn(
            f"Some jobs in {file1} and {file2} don't have the same success status. "
            "Comparing only the jobs that were successful in both."
        )

    # Compare the two success columns with each other (not a column against
    # the whole dataframe) and keep the jobs that succeeded on both sides.
    both_successful = (out1.success == 1) & (out2.success == 1)
    return out1.loc[both_successful, :], out2.loc[both_successful, :]
def euclidean_distance(s1, s2):
    """
    Returns the Euclidean distance between two series s1 and s2.

    Values are paired by position (not by pandas index label).

    Args:
        s1: first sequence of numbers (list, numpy array, pandas Series...).
        s2: second sequence of numbers, of the same length as ``s1``.

    Returns:
        The distance as a float (0.0 for two empty sequences).

    Raises:
        ValueError: if s1 and s2 do not have the same length.
    """
    # np.asarray drops any pandas index, so the subtraction below is purely
    # positional instead of being aligned on index labels.
    a = np.asarray(s1, dtype=float)
    b = np.asarray(s2, dtype=float)
    if a.shape != b.shape:
        # Fail loudly rather than silently truncating (or broadcasting).
        raise ValueError(
            f"cannot compare series of different shapes: {a.shape} and {b.shape}"
        )
    delta = a - b
    return np.sqrt(np.sum(delta * delta))
def lateness_distance(s1, s2):
    """
    Returns the 'lateness' of s2 compared to s1.

    The lateness is the sum of ``y - x`` over the values x of s1 and y of s2
    taken pairwise by position: it is positive when s2 is globally later
    than s1 and negative when it is earlier.

    Args:
        s1: reference sequence of numbers (list, numpy array, Series...).
        s2: sequence of numbers compared to ``s1``, of the same length.

    Returns:
        The summed difference as a float (0.0 for two empty sequences).

    Raises:
        ValueError: if s1 and s2 do not have the same length.
    """
    # Work on float arrays: the pairing is positional (pandas index labels
    # are dropped) and the result is always a float, which json can
    # serialise, unlike numpy integer scalars.
    reference = np.asarray(s1, dtype=float)
    compared = np.asarray(s2, dtype=float)
    if reference.shape != compared.shape:
        # Fail loudly rather than silently truncating (or broadcasting).
        raise ValueError(
            "cannot compare series of different shapes: "
            f"{reference.shape} and {compared.shape}"
        )
    return np.sum(compared - reference)
def distances(file1, file2, euclidean=True, lateness=False,
              field=("finish_time",)):
    """
    Computes and returns a set of distances between two batsim outputs, if
    they have the same job_ids.

    Args:
        file1: path of the first _jobs.csv file.
        file2: path of the second _jobs.csv file.
        euclidean: whether to compute the euclidean distance.
        lateness: whether to compute the lateness of file2 compared to file1.
        field: name(s) of the column(s) on which the distances are computed.
            Either an iterable of names or a single name.

    Returns:
        A dict of the form ``{field_name: {distance_name: value}}``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(field, str):
        field = [field]

    out1, out2 = open_and_compare(file1, file2)

    dist = {}
    for name in field:
        per_field = {}
        if euclidean:
            per_field["euclidean"] = euclidean_distance(out1[name], out2[name])
        if lateness:
            per_field["lateness"] = lateness_distance(out1[name], out2[name])
        dist[name] = per_field
    return dist
def pretty_print(dist):
    """
    Nice printing of the dictionary dist.

    Args:
        dist: dict of the form ``{field_name: {distance_name: value}}``.

    When ``dist`` holds exactly one distance on exactly one field, only the
    bare value is printed. In every other case the whole dictionary is
    printed as indented JSON.
    """
    if len(dist) == 1:
        (per_field,) = dist.values()
        if len(per_field) == 1:
            (value,) = per_field.values()
            print(value)
            return
    # Reached for several fields AND for a single field holding several (or
    # no) distances, so that no input shape is silently left unprinted.
    print(json.dumps(dist, indent=4, default=_json_default))


def _json_default(obj):
    """Convert scalar wrappers exposing ``item()`` (e.g. numpy) for json."""
    if hasattr(obj, "item"):
        return obj.item()
    raise TypeError(
        f"Object of type {type(obj).__name__} is not JSON serializable"
    )
def main(argv=None):
    """
    Program entry point if called with CLI.

    Args:
        argv: list of command-line arguments to parse. Defaults to None, in
            which case argparse reads them from ``sys.argv``.
    """
    distance_types = ["euclidean", "lateness"]
    time_fields = ["submission_time", "starting_time", "finish_time"]

    parser = argparse.ArgumentParser(
        description="Computes and prints a set of distances between two batsim "
                    "outputs, if they have the same job_ids. "
                    "Default: euclidean distance on finish_time.")
    parser.add_argument('file1', type=str, help='The first _jobs.csv file')
    parser.add_argument('file2', type=str, help='The second _jobs.csv file')
    # `choices` makes argparse reject unknown values up front instead of
    # silently printing an empty result (--type) or failing later with a
    # KeyError on the dataframe (--field).
    parser.add_argument("--type", nargs='+', default=['euclidean'],
                        choices=distance_types,
                        help="Type of distance to use. Available values are "
                             "{euclidean, lateness}")
    parser.add_argument("--field", nargs='+', default=['finish_time'],
                        choices=time_fields,
                        help="The field to use to compute the distance. "
                             "Available values are {submission_time, starting_time, "
                             "finish_time}")
    parser.add_argument("--all", action="store_true",
                        help="Print all available distances on all available fields")

    args = parser.parse_args(argv)

    # --all overrides whatever was given through --type and --field.
    if args.all:
        args.type = distance_types
        args.field = time_fields

    dist = distances(file1=args.file1,
                     file2=args.file2,
                     euclidean="euclidean" in args.type,
                     lateness="lateness" in args.type,
                     field=args.field)

    pretty_print(dist)
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment