diff --git a/.gitignore b/.gitignore index a24ad23fcdab78e75eaae97de8d594531250fa68..c2538ab893d352032f5ed22c291d7c052f5c6a9a 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ build test-out test-instances cover +result diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ea252d29ddb5b2315f9735c1002e9a156d924056..ebd2e1498e2494880aa3a26e125e3fb904043119 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,86 +4,33 @@ variables: GIT_SUBMODULE_STRATEGY: none stages: - - build - - update_dependencies_cache - - test - - coverage + - big_stage - deploy ############################################################################### -# Build stage +# Do most tasks ############################################################################### -build: - stage: build +build_and_test: + stage: big_stage script: - - ./ci/build.bash + # Build batsched + - nix-shell --pure ./release.nix -A batsched --command ${CI_PROJECT_DIR}/ci/list-store-paths-for-cachix.bash | cachix push batsim + - nix-build ./release.nix -A batsched && cp -rL result ./batsched + # Test against pinned batsim + - nix-shell --pure ./release.nix -A integration_tests --command ${CI_PROJECT_DIR}/ci/list-store-paths-for-cachix.bash | cachix push batsim + - nix-build ./release.nix -A integration_tests && cp -rL result ./integration_tests + # Test against up-to-date batsim + - nix-shell --pure ./release.nix -A integration_tests_batlatest --command ${CI_PROJECT_DIR}/ci/list-store-paths-for-cachix.bash | cachix push batsim + - nix-build ./release.nix -A integration_tests_batlatest && rm result + # Fail job if tests failed + - if [[ "$(cat ./integration_tests/pytest_returncode)" -ne 0 ]] ; then echo "pytest returned non-zero (against pinned batsim), aborting" ; exit 1 ; fi + - if [[ "$(cat ./integration_tests_batlatest/pytest_returncode)" -ne 0 ]] ; then echo "pytest returned non-zero (against latest batsim), aborting" ; exit 1 ; fi + # Send coverage results to codecov.io + - nix-env -i gcc + - mkdir -p merged + - cp ./batsched/gcno/* ./integration_tests/gcda/* merged/ + - bash <(curl -s https://codecov.io/bash) artifacts: + when: always paths: - - /builds/batsim/batsched/build - -############################################################################### -# Dependencies cache stage -############################################################################### -update_dependencies_cache: - stage: update_dependencies_cache - script: - - ./ci/update-dependencies-cache.bash - -############################################################################### -# Test stage -############################################################################### -test_pinned_batsim: - stage: test - script: - - nix-shell ./ci -A test_pinned --command 'bash ./ci/run-tests.bash' - dependencies: - - build - artifacts: - paths: - - /builds/batsim/batsched/build - -test_dev_batsim: - stage: test - script: - - nix-shell ./ci -A test_dev --command 'bash ./ci/run-tests.bash' - dependencies: - - build - -############################################################################### -# Coverage stage -############################################################################### -coverage: - stage: coverage - script: - - nix-shell ./ci -A test_deps_pinned --command 'bash ./ci/analyze-coverage.bash' - dependencies: - - test_pinned_batsim - artifacts: - paths: - - /builds/batsim/batsched/cover - -############################################################################### -# Deploy stage 
-############################################################################### -deploy_coverage: - stage: deploy - script: # Pushes Batsim's code doc (doxygen) onto the gforge website. # SSH setup (do NOT run these commands on your machine) - eval $(ssh-agent -s) - ssh-add <(echo "$SSH_PRIVATE_KEY") - mkdir -p ~/.ssh - '[[ -f /.dockerenv ]] && echo -e "Host *\n\tStrictHostKeyChecking no\n\n" >> ~/.ssh/config' # Finally push the code documentation on the gforge website - rsync -rlgoDz --delete cover/html/ mpoquet@scm.gforge.inria.fr:/home/groups/batsim/htdocs/batsched/coverage - dependencies: - coverage - only: - master -deploy_batsched_dev_cachix: - stage: deploy - script: - ./ci/update-batsched_dev-cache.bash - only: - master + - /builds/batsim/batsched/integration_tests diff --git a/CHANGELOG.md b/CHANGELOG.md index 9355a943f8fb79b08f9127c11873990550a120ea..4fe9fc7c2d33a28466cb082c94cd56be0a9bc263 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,16 @@ Batsched adheres to [Semantic Versioning][semver] and its public API is the foll [//]: ========================================================================= ## [Unreleased] +[//]: ========================================================================= +## [1.4.0] - 2020-07-29 - For [Batsim v4.0.0][Batsim v4.0.0] +### Added +- New `fcfs` algorithm (copied from `fcfs_fast`) that takes into account + the resource selector given as its parameter. + +### Fixed +- The `easy_bf_fast` algorithm did not try to backfill previously submitted jobs on many + events (when the priority job could not be executed). + [//]: ========================================================================= ## [1.3.0] - 2019-01-15 - For [Batsim v3.0.0][Batsim v3.0.0] ### Added @@ -87,8 +97,10 @@ Initial release. [Batsim v2.0.0]: https://batsim.readthedocs.io/en/latest/changelog.html#v2-0-0 [Batsim v3.0.0]: https://batsim.readthedocs.io/en/latest/changelog.html#v3-0-0 +[Batsim v4.0.0]: https://batsim.readthedocs.io/en/latest/changelog.html#v4-0-0 -[Unreleased]: https://gitlab.inria.fr/batsim/batsched/compare/v1.3.0...master +[Unreleased]: https://gitlab.inria.fr/batsim/batsched/compare/v1.4.0...master +[1.4.0]: https://gitlab.inria.fr/batsim/batsched/compare/v1.3.0...v1.4.0 [1.3.0]: https://gitlab.inria.fr/batsim/batsched/compare/v1.2.1...v1.3.0 [1.2.1]: https://gitlab.inria.fr/batsim/batsched/compare/v1.2.0...v1.2.1 [1.2.0]: https://gitlab.inria.fr/batsim/batsched/compare/v1.1.0...v1.2.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index 84048413b7a10e7baefae843c8916b1906bc4da3..592713aa196b56f7bc91acc92b472f098af68fc1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -122,7 +122,7 @@ endif() #################### # Batsched version # #################### -set(default_batsched_version "v1.3.0") +set(default_batsched_version "v1.4.0") include(GetGitRevisionDescription) git_describe(batsched_version) message(STATUS "Batsched version from git: ${batsched_version}") diff --git a/README.md b/README.md index 66b5aa6867bb792b7433a344704412af08dbcc8f..8c92051664d8be7d2f1c4cf1f6f3af1f15669f1f 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ [](https://framagit.org/batsim/batsched/pipelines) -[](http://batsim.gforge.inria.fr/batsched/coverage/) +[](https://codecov.io/gh/oar-team/batsched) [](./CHANGELOG.md) **batsched** is a set of [Batsim]-compatible algorithms implemented in C++.
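The changelog entry above introduces the new `fcfs` algorithm variant. As a minimal usage sketch, assuming batsched's usual variant and resource-selection-policy flags (the flag spellings below are assumptions, not confirmed by this diff; `batsched --help` lists the real interface, and `fcfs`/`contiguous` come from the `variants_set`/`policies_set` visible in `src/main.cpp` further down):

``` bash
# Hypothetical invocation: run batsched with the new fcfs algorithm and
# the contiguous resource selector. Flag names are assumptions; only the
# variant name "fcfs" and policy name "contiguous" are taken from src/main.cpp.
batsched --variant fcfs --policy contiguous
```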
@@ -8,9 +8,9 @@ ### For [Nix] users ``` bash # Up-to-date version -nix-env -iA batsched_dev -f 'https://github.com/oar-team/kapack/archive/master.tar.gz' +nix-env -iA batsched-master -f 'https://github.com/oar-team/nur-kapack/archive/master.tar.gz' # Latest release -nix-env -iA batsched -f 'https://github.com/oar-team/kapack/archive/master.tar.gz' +nix-env -iA batsched -f 'https://github.com/oar-team/nur-kapack/archive/master.tar.gz' ``` ### Manually @@ -23,14 +23,14 @@ make make install ``` -Up-to-date dependencies and versions are fully defined in [batsched's CI nix recipe](./default.nix). +Up-to-date dependencies and versions are fully defined in [batsched's CI nix recipe](./release.nix). Here is a quick (and probably outdated) list: - decent clang/gcc and cmake - zmq (C and C++) - redox (hiredis + libev) - [loguru] - [intervalset] -- decent boost, gmp, rapidjson, openssl... +- decent boost, gmp, rapidjson... [Batsim]: https://framagit.org/batsim/batsim/ [intervalset]: https://framagit.org/batsim/intervalset diff --git a/ci/README.md b/ci/README.md deleted file mode 100644 index 515fa240ef3c0699129a35cbcd87ff023bdb7310..0000000000000000000000000000000000000000 --- a/ci/README.md +++ /dev/null @@ -1,60 +0,0 @@ -This directory is essentially a nix repository with some scripts. - -# Packages overview -## batsched_local -This is the current version of batsched (built from current file hierarchy). - -## batsim_pinned -This is the current *reference* Batsim version for batsched. -batsched should always work with this version. - -## batsim_dev -This is the up-to-date Batsim version. -batsched should work with this version. - -## test_deps_(pinned|dev) -The list of packages needed to run tests. -This is meant to be used as a shell, not meant to be installed. - -## test_(pinned|dev) -A shell used to run tests. Essentially batsim_local + test_deps. -Not meant to be installed either. - -# Useful commands -In all the following examples, the current directory is expected to be -batsched's root. - -## Building packages -This can be done via `nix-build`. Result will be in `./result/`. -Some examples: -``` bash -nix-build ./ci -A batsched_local -nix-build ./ci -A batsim_pinned -``` - -## Install packages -This is done via `nix-env`: -``` bash -nix-env -f ./ci -iA batsched_local -nix-env -f ./ci -iA batsim_dev -``` - -To uninstall them, use `nix-env --uninstall`. - -## Get into a shell to build packages -`nix-shell` is your friend here. Example: -``` bash -nix-shell ./ci -A batsched_local -# your shell now has all batsched's build dependencies! -# you can freely build the project (cmake, make...) -``` - -## Run the tests -This is essentially "run the test script in the desired environment". -``` bash -# test current batsched with batsim_pinned: -nix-shell ./ci -A test_pinned --command './ci/run-tests.bash' - -# or test with batsim_dev: -nix-shell ./ci -A test_dev --command './ci/run-tests.bash' -``` diff --git a/ci/analyze-coverage.bash b/ci/analyze-coverage.bash deleted file mode 100755 index 6422ee4854abaeb476d54fb3a26a3489ab5c4523..0000000000000000000000000000000000000000 --- a/ci/analyze-coverage.bash +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env nix-shell -#! nix-shell . 
-i bash -A test_deps_pinned -set -eu - -echo "Prepare directories" -rm -rf ./cover -mkdir -p ./cover/tmp -cd ./cover/tmp - -echo "Call gcov" -gcov_files=$(find ../../build -name '*.gcda') -for gcov_file in ${gcov_files[@]}; do - gcov ${gcov_file} 1>/dev/null 2>&1 -done - -echo "Only keep interesting files" -interesting_sources=$(find ../../src -name '*.?pp' | sort | grep -v 'pempek_assert\|taywee_args') -set +e -for interesting_source in ${interesting_sources[@]}; do - interesting_file="./$(basename ${interesting_source}).gcov" - cp -f ${interesting_file} ../ 2>/dev/null -done -set -e - -cd ../.. -rm -rf ./cover/tmp - -echo "Run gcovr analysis (human-readable report)" -gcovr -gk -o ./cover/summary.txt -cat ./cover/summary.txt - -echo "Run gcovr analysis (html report)" -rm -rf ./cover/html -mkdir -p ./cover/html -gcovr -gk --html-details -o ./cover/html/index.html - -exit 0 diff --git a/ci/build.bash b/ci/build.bash deleted file mode 100755 index 5f9e7a5164dee0730c8430165be8ca2c6ff113d6..0000000000000000000000000000000000000000 --- a/ci/build.bash +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env nix-shell -#! nix-shell . -i bash -A batsched_local -set -eux - -# Start from a clean build directory -rm -rf ./build -mkdir -p ./build - -# Usual cmake build stuff -cd ./build -cmake .. \ - -DCMAKE_BUILD_TYPE=Debug \ - -Denable_warnings=ON \ - -Dtreat_warnings_as_errors=OFF \ - -Ddo_coverage=ON -make -j $(nproc) diff --git a/ci/default.nix b/ci/default.nix deleted file mode 100644 index d5c64a584befeac96bca3eb718be1dcf66c348a8..0000000000000000000000000000000000000000 --- a/ci/default.nix +++ /dev/null @@ -1,53 +0,0 @@ -let - pkgs = import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/18.03.tar.gz") {}; - kapack = import - ( fetchTarball "https://github.com/oar-team/kapack/archive/master.tar.gz") {}; -in - -let - callPackage = pkgs.lib.callPackageWith (pkgs // pkgs.xlibs // self // kapack); - self = rec { - inherit pkgs kapack; - - # Redefine some packages for clarity's sake - batexpe = kapack.batexpe; - batsim_pinned = (kapack.batsim.override {simgrid = kapack.simgrid_dev_working; }).overrideAttrs (attrs: rec { - name = "batsim-${version}"; - version = "3.0.0-pinned"; - src = pkgs.fetchgit { - url = "https://framagit.org/batsim/batsim.git"; - rev = "12db5085210ac24d82657b21fafe0ca198dcf48d"; - sha256 = "07b9npm5qvrzanp14rwp743dxsh7dwpvpywmlpxla5j4kxk665hc"; - }; - }); - batsim_dev = (kapack.batsim.override {simgrid = kapack.simgrid_dev_working; }).overrideAttrs (attrs: rec { - nativeBuildInputs = attrs.nativeBuildInputs ++ [kapack.intervalset]; - name = "batsim-${version}"; - version = "3.1.0-dev"; - src = fetchTarball "https://gitlab.inria.fr/batsim/batsim/repository/master/archive.tar.gz"; - }); - - pytest = pkgs.python36Packages.pytest; - gcovr = kapack.gcovr; - - # Packages defined in this tree - batsched_local = callPackage ./local.nix {}; - test_deps_pinned = callPackage ./test-deps.nix { - batsim = batsim_pinned; - }; - test_deps_dev = callPackage ./test-deps.nix { - batsim = batsim_dev; - }; - - # Packages meant to be used as shells - test_pinned = callPackage ./test-env.nix { - batsched = batsched_local; - test_deps = test_deps_pinned; - }; - test_dev = callPackage ./test-env.nix { - batsched = batsched_local; - test_deps = test_deps_dev; - }; - }; -in - self diff --git a/ci/list-store-paths-for-cachix.bash b/ci/list-store-paths-for-cachix.bash new file mode 100755 index 0000000000000000000000000000000000000000..399c132a7ce7aad1e89a139de2d70c37edbf0d84 --- /dev/null +++ 
b/ci/list-store-paths-for-cachix.bash @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +function list_store_paths_for_cachix { + var_path=$1 + echo $var_path | tr ':' '\n' | sed -E -n 'sW(/nix/store/.*)/.*W\1Wp' +} + +list_store_paths_for_cachix ${CMAKE_INCLUDE_PATH} +list_store_paths_for_cachix ${CMAKE_LIBRARY_PATH} +list_store_paths_for_cachix ${PATH} +list_store_paths_for_cachix ${PYTHONPATH} diff --git a/ci/local.nix b/ci/local.nix deleted file mode 100644 index 1c65e739b1ca0a5b5d43b54b975699a89c561067..0000000000000000000000000000000000000000 --- a/ci/local.nix +++ /dev/null @@ -1,23 +0,0 @@ -{ stdenv, batsched_dev }: - -(batsched_dev.override {}).overrideAttrs (attrs: rec { - name = "batsched-1.4.0-nix-ci"; - src = stdenv.lib.sourceByRegex ../. [ - "^src$" - "^src/algo$" - "^src/external$" - ".*\.cpp$" ".*\.hpp$" - "^cmake$" - "^cmake/Modules$" - ".*\.cmake" - ".*\.cmake.in" - "^CMakeLists\.txt$" - ]; - enableParallelBuilding = true; - doCheck = false; - - preConfigure = '' - # Always start from a clean build directory - rm -rf ./build - ''; -}) diff --git a/ci/pin-batsim.bash b/ci/pin-batsim.bash deleted file mode 100755 index e0d89bea92484b95da1bbf1c2cf300db24ca68c1..0000000000000000000000000000000000000000 --- a/ci/pin-batsim.bash +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env nix-shell -#! nix-shell . -i bash -A test_deps_dev - -if [ "$#" -ne 1 ]; then - echo 'usage: pin-batsim.bash BATSIM-REV' - exit 1 -fi - -rev=$1 -nix-prefetch-git \ - --url https://framagit.org/batsim/batsim.git \ - --rev ${rev} \ - > batsim-pinned.json diff --git a/ci/run-tests.bash b/ci/run-tests.bash deleted file mode 100755 index 68037744a8059e577e159b0769e85c9a02a6761a..0000000000000000000000000000000000000000 --- a/ci/run-tests.bash +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env nix-shell -#! nix-shell . -i bash -A test_deps_pinned -set -eu - -initial_dir=$(realpath .) - -# Run a redis server if needed -redis_launched_here=0 -r=$(ps faux | grep redis-server | grep -v grep | wc -l) -if [ $r -eq 0 ] -then - echo "Running a Redis server..." - redis-server>/dev/null & - redis_launched_here=1 - - while ! nc -z localhost 6379; do - sleep 1 - done -fi - -# Add built batsched in PATH -export PATH=$(realpath ./build):${PATH} - -# Set TEST_ROOT so simulation input files can be found -export TEST_ROOT=$(realpath ./test) - -# Print which versions are used -echo "batsched realpath: $(realpath $(which batsched))" -echo "batsim realpath: $(realpath $(which batsim))" -echo "robin realpath: $(realpath $(which robin))" - -# Execute the tests (TODO: clean tests) -cd test -pytest -failed=$? - -# Stop the redis server if it has been launched by this script -if [ $redis_launched_here -eq 1 ] -then - echo "Stopping the Redis server..." 
- killall redis-server -fi - -cd ${initial_dir} -exit ${failed} diff --git a/ci/test-deps.nix b/ci/test-deps.nix deleted file mode 100644 index d81183dd98e4fc6f0b9993118884adcf433f5ebe..0000000000000000000000000000000000000000 --- a/ci/test-deps.nix +++ /dev/null @@ -1,16 +0,0 @@ -{ stdenv, batsim, batexpe, - which, redis, procps, psmisc, pytest, gcovr, - nix-prefetch-git -}: - -stdenv.mkDerivation rec { - name = "batsched-test-deps"; - - # This package is not meant to be built - unpackPhase = "true"; - installPhase = "true"; - propagatedBuildInputs = [ batsim batexpe - which redis procps psmisc pytest gcovr - nix-prefetch-git - ]; -} diff --git a/ci/test-env.nix b/ci/test-env.nix deleted file mode 100644 index 9c4fa51eadb9a0ffd6be1d1dc790e9d7e78a9656..0000000000000000000000000000000000000000 --- a/ci/test-env.nix +++ /dev/null @@ -1,10 +0,0 @@ -{ stdenv, batsched, test_deps }: - -stdenv.mkDerivation rec { - name = "batsched-test-env"; - - # This package is not meant to be built - unpackPhase = "true"; - installPhase = "true"; - propagatedBuildInputs = [ batsched test_deps ]; -} diff --git a/ci/update-batsched_dev-cache.bash b/ci/update-batsched_dev-cache.bash deleted file mode 100755 index ff0f9b642f276c17144fc8512532d2c2f100ce1a..0000000000000000000000000000000000000000 --- a/ci/update-batsched_dev-cache.bash +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env nix-shell -#! nix-shell -i bash -p nix -set -eu - -# Build up-to-date batsched_dev package, push it on binary cache -nix-build https://github.com/oar-team/kapack/archive/master.tar.gz -A batsched_dev | cachix push batsim diff --git a/ci/update-dependencies-cache.bash b/ci/update-dependencies-cache.bash deleted file mode 100755 index 0684be70f8034e773032416ead06d0e57f33d52e..0000000000000000000000000000000000000000 --- a/ci/update-dependencies-cache.bash +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env nix-shell -#! 
nix-shell -i bash -p nix -set -eu - -# (re)build up-to-date CI batsched package + deps, push them on binary cache -nix-build ./ci -A test_pinned | cachix push batsim -nix-build ./ci -A test_dev | cachix push batsim diff --git a/default.nix b/default.nix deleted file mode 100644 index 05753d944d6b6285c319469805cd8ad806f7341a..0000000000000000000000000000000000000000 --- a/default.nix +++ /dev/null @@ -1,16 +0,0 @@ -let - pkgs = import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/18.03.tar.gz") {}; - kapack = import - ( fetchTarball "https://github.com/oar-team/kapack/archive/master.tar.gz") - { inherit pkgs; }; -in - -with kapack; -with pkgs; - -(batsched_dev.override {}).overrideAttrs (attrs: rec { - name = "batsched-1.4.0-nix-local"; - src = ../.; - enableParallelBuilding = true; - doCheck = false; -}) diff --git a/default.nix b/default.nix new file mode 120000 index 0000000000000000000000000000000000000000..ca14ab7c79af97de6de0b131a026970eadb5025c --- /dev/null +++ b/default.nix @@ -0,0 +1 @@ +release.nix \ No newline at end of file diff --git a/meson.build b/meson.build new file mode 100644 index 0000000000000000000000000000000000000000..175775ef51c136503dd0a065f8b7ee716fd5ef53 --- /dev/null +++ b/meson.build @@ -0,0 +1,110 @@ +project('batsched', 'cpp', + version: '1.4.0', + license: 'free', + default_options: ['cpp_std=c++11'], + meson_version: '>=0.40.0' +) + +# Dependencies +boost_dep = dependency('boost', + modules : ['locale', 'regex', 'system'] +) +rapidjson_dep = dependency('RapidJSON') +redox_dep = dependency('redox') +libzmq_dep = dependency('libzmq') +loguru_dep = dependency('loguru') +intervalset_dep = dependency('intervalset') +gmpxx_dep = dependency('gmpxx') + +batsched_deps = [ + boost_dep, + rapidjson_dep, + redox_dep, + libzmq_dep, + loguru_dep, + intervalset_dep, + gmpxx_dep +] + +# Source files +src = [ + 'src/algo/conservative_bf.cpp', + 'src/algo/conservative_bf.hpp', + 'src/algo/crasher.cpp', + 'src/algo/crasher.hpp', + 'src/algo/easy_bf.cpp', + 'src/algo/easy_bf_fast.cpp', + 'src/algo/easy_bf_fast.hpp', + 'src/algo/easy_bf.hpp', + 'src/algo/easy_bf_plot_liquid_load_horizon.cpp', + 'src/algo/easy_bf_plot_liquid_load_horizon.hpp', + 'src/algo/energy_bf.cpp', + 'src/algo/energy_bf_dicho.cpp', + 'src/algo/energy_bf_dicho.hpp', + 'src/algo/energy_bf.hpp', + 'src/algo/energy_bf_idle_sleeper.cpp', + 'src/algo/energy_bf_idle_sleeper.hpp', + 'src/algo/energy_bf_machine_subpart_sleeper.cpp', + 'src/algo/energy_bf_machine_subpart_sleeper.hpp', + 'src/algo/energy_bf_monitoring_inertial_shutdown.cpp', + 'src/algo/energy_bf_monitoring_inertial_shutdown.hpp', + 'src/algo/energy_bf_monitoring_period.cpp', + 'src/algo/energy_bf_monitoring_period.hpp', + 'src/algo/energy_watcher.cpp', + 'src/algo/energy_watcher.hpp', + 'src/algo/fcfs_fast.cpp', + 'src/algo/fcfs_fast.hpp', + 'src/algo/fcfs.cpp', + 'src/algo/fcfs.hpp', + 'src/algo/filler.cpp', + 'src/algo/filler.hpp', + 'src/algo/killer2.cpp', + 'src/algo/killer2.hpp', + 'src/algo/killer.cpp', + 'src/algo/killer.hpp', + 'src/algo/random.cpp', + 'src/algo/random.hpp', + 'src/algo/rejecter.cpp', + 'src/algo/rejecter.hpp', + 'src/algo/sequencer.cpp', + 'src/algo/sequencer.hpp', + 'src/algo/sleeper.cpp', + 'src/algo/sleeper.hpp', + 'src/algo/submitter.cpp', + 'src/algo/submitter.hpp', + 'src/algo/wt_estimator.cpp', + 'src/algo/wt_estimator.hpp', + 'src/data_storage.cpp', + 'src/data_storage.hpp', + 'src/decision.cpp', + 'src/decision.hpp', + 'src/exact_numbers.hpp', + 'src/external/taywee_args.hpp', + 'src/isalgorithm.cpp', + 
'src/isalgorithm.hpp', + 'src/json_workload.cpp', + 'src/json_workload.hpp', + 'src/locality.cpp', + 'src/locality.hpp', + 'src/main.cpp', + 'src/network.cpp', + 'src/network.hpp', + 'src/pempek_assert.cpp', + 'src/pempek_assert.hpp', + 'src/protocol.cpp', + 'src/protocol.hpp', + 'src/queue.cpp', + 'src/queue.hpp', + 'src/queueing_theory_waiting_time_estimator.cpp', + 'src/queueing_theory_waiting_time_estimator.hpp', + 'src/schedule.cpp', + 'src/schedule.hpp' +] +include_dir = include_directories('src') + +batsched = executable('batsched', src, + include_directories: include_dir, + dependencies: batsched_deps, + cpp_args: '-DBATSCHED_VERSION=@0@'.format(meson.project_version()), + install: true +) diff --git a/release.nix b/release.nix new file mode 100644 index 0000000000000000000000000000000000000000..4a4a88360b37fc0eb8e2018c32e72268e7adb7e6 --- /dev/null +++ b/release.nix @@ -0,0 +1,147 @@ +{ kapack ? import + (fetchTarball "https://github.com/oar-team/nur-kapack/archive/master.tar.gz") + {} +, doCheck ? false +, doCoverage ? true +, batsim ? kapack.batsim +, batsim-master ? kapack.batsim-master +, batexpe ? kapack.batexpe +}: + +let + pkgs = kapack.pkgs; + pythonPackages = pkgs.python37Packages; + buildPythonPackage = pythonPackages.buildPythonPackage; + + jobs = rec { + # Batsched executable file (built from local sources) + batsched = kapack.batsched.overrideAttrs (attr: rec { + src = pkgs.lib.sourceByRegex ./. [ + "^src" + "^src/.*\.?pp" + "^src/algo" + "^src/algo/.*\.?pp" + "^src/external" + "^src/external/.*\.?pp" + "^meson\.build" + ]; + mesonFlags = [] + ++ pkgs.lib.optional doCoverage [ "-Db_coverage=true" ]; + nativeBuildInputs = with kapack; [pkgs.meson pkgs.ninja pkgs.pkgconfig + pkgs.boost pkgs.gmp pkgs.rapidjson intervalset loguru redox pkgs.cppzmq pkgs.zeromq]; + # Debug build, without any Nix stripping magic. + mesonBuildType = "debug"; + hardeningDisable = [ "all" ]; + dontStrip = true; + # Keep files generated by GCOV, so depending jobs can use them. + postInstall = pkgs.lib.optionalString doCoverage '' + mkdir -p $out/gcno + cp batsched@exe/*.gcno $out/gcno/ + ''; + }); + + # Batsched integration tests. + integration_tests = pkgs.stdenv.mkDerivation rec { + pname = "batsched-integration-tests"; + version = toString builtins.currentTime; # Forces rebuild + src = pkgs.lib.sourceByRegex ./. [ + "^test" + "^test/.*\.py" + "^test/platforms" + "^test/platforms/.*\.xml" + "^test/workloads" + "^test/workloads/.*\.json" + ]; + buildInputs = with pkgs.python37Packages; [ + batsim batsched batexpe pkgs.redis + pytest pytest_html pandas]; + preBuild = pkgs.lib.optionalString doCoverage '' + mkdir -p gcda + export GCOV_PREFIX=$(realpath gcda) + export GCOV_PREFIX_STRIP=5 + ''; + buildPhase = '' + runHook preBuild + set +e + (cd test && pytest -ra --html=../report/pytest_report.html) + echo $? > ./pytest_returncode + set -e + ''; + checkPhase = '' + pytest_return_code=$(cat ./pytest_returncode) + echo "pytest return code: $pytest_return_code" + if [ $pytest_return_code -ne 0 ] ; then + exit 1 + fi + ''; + inherit doCheck; + installPhase = '' + mkdir -p $out + mv ./report/* ./pytest_returncode $out/ + '' + pkgs.lib.optionalString doCoverage '' + mv ./gcda $out/ + ''; + }; + # Essentially the same as integration_tests, but with an up-to-date Batsim. 
+ integration_tests_batlatest = integration_tests.overrideAttrs (attr: rec { + buildInputs = with pkgs.python37Packages; [ + batsim-master batsched batexpe pkgs.redis + pytest pytest_html pandas]; + }); + + # Batsched doxygen documentation. + doxydoc = pkgs.stdenv.mkDerivation rec { + name = "batsched-doxygen-documentation"; + src = pkgs.lib.sourceByRegex ./. [ + "^src" + "^src/.*\.?pp" + "^doc" + "^doc/Doxyfile" + "^doc/doxygen_mainpage.md" + ]; + buildInputs = [pkgs.doxygen]; + buildPhase = "(cd doc && doxygen)"; + installPhase = '' + mkdir -p $out + mv doc/doxygen_doc/html/* $out/ + ''; + checkPhase = '' + nb_warnings=$(cat doc/doxygen_warnings.log | wc -l) + if [[ $nb_warnings -gt 0 ]] ; then + echo "FAILURE: There are doxygen warnings!" + cat doc/doxygen_warnings.log + exit 1 + fi + ''; + doCheck = true; + }; + + # Dependencies not in nixpkgs as I write these lines. + pytest_metadata = buildPythonPackage { + name = "pytest-metadata-1.8.0"; + doCheck = false; + propagatedBuildInputs = [ + pythonPackages.pytest + pythonPackages.setuptools_scm + ]; + src = builtins.fetchurl { + url = "https://files.pythonhosted.org/packages/12/38/eed3a1e00c765e4da61e4e833de41c3458cef5d18e819d09f0f160682993/pytest-metadata-1.8.0.tar.gz"; + sha256 = "1fk6icip2x1nh4kzhbc8cnqrs77avpqvj7ny3xadfh6yhn9aaw90"; + }; + }; + + pytest_html = buildPythonPackage { + name = "pytest-html-1.20.0"; + doCheck = false; + propagatedBuildInputs = [ + pythonPackages.pytest + pytest_metadata + ]; + src = builtins.fetchurl { + url = "https://files.pythonhosted.org/packages/08/3e/63d998f26c7846d3dac6da152d1b93db3670538c5e2fe18b88690c1f52a7/pytest-html-1.20.0.tar.gz"; + sha256 = "17jyn4czkihrs225nkpj0h113hc03y0cl07myb70jkaykpfmrim7"; + }; + }; + }; +in + jobs diff --git a/src/algo/conservative_bf.cpp b/src/algo/conservative_bf.cpp index ec44cf14821b9edc47b94be94499c0c042fef835..fd165e7b8444c3f178d0ec9c0580502732c76738 100644 --- a/src/algo/conservative_bf.cpp +++ b/src/algo/conservative_bf.cpp @@ -1,11 +1,28 @@ #include "conservative_bf.hpp" +#include <loguru.hpp> + +#include "../pempek_assert.hpp" + using namespace std; ConservativeBackfilling::ConservativeBackfilling(Workload *workload, SchedulingDecision *decision, Queue *queue, ResourceSelector * selector, double rjms_delay, rapidjson::Document *variant_options) : ISchedulingAlgorithm(workload, decision, queue, selector, rjms_delay, variant_options) { + if (variant_options->HasMember("dump_previsional_schedules")) + { + PPK_ASSERT_ERROR((*variant_options)["dump_previsional_schedules"].IsBool(), + "Invalid options: 'dump_previsional_schedules' should be a boolean"); + _dump_provisional_schedules = (*variant_options)["dump_previsional_schedules"].GetBool(); + } + + if (variant_options->HasMember("dump_prefix")) + { + PPK_ASSERT_ERROR((*variant_options)["dump_prefix"].IsString(), + "Invalid options: 'dump_prefix' should be a string"); + _dump_prefix = (*variant_options)["dump_prefix"].GetString(); + } } ConservativeBackfilling::~ConservativeBackfilling() @@ -32,14 +49,26 @@ void ConservativeBackfilling::make_decisions(double date, _schedule.remove_job((*_workload)[ended_job_id]); // Let's handle recently released jobs + std::vector<std::string> recently_queued_jobs; for (const string & new_job_id : _jobs_released_recently) { const Job * new_job = (*_workload)[new_job_id]; if (new_job->nb_requested_resources > _nb_machines) + { _decision->add_reject_job(new_job_id, date); + } + else if (!new_job->has_walltime) + { + LOG_SCOPE_FUNCTION(INFO); + LOG_F(INFO, "Date=%g. 
Rejecting job '%s' as it has no walltime", date, new_job_id.c_str()); + _decision->add_reject_job(new_job_id, date); + } else + { _queue->append_job(new_job, update_info); + recently_queued_jobs.push_back(new_job_id); + } } // Let's update the schedule's present @@ -51,7 +80,7 @@ void ConservativeBackfilling::make_decisions(double date, // If no resources have been released, we can just insert the new jobs into the schedule if (_jobs_ended_recently.empty()) { - for (const string & new_job_id : _jobs_released_recently) + for (const string & new_job_id : recently_queued_jobs) { const Job * new_job = (*_workload)[new_job_id]; Schedule::JobAlloc alloc = _schedule.add_job_first_fit(new_job, _selector); @@ -78,7 +107,11 @@ void ConservativeBackfilling::make_decisions(double date, const Job * job = (*job_it)->job; _schedule.remove_job_if_exists(job); +// if (_dump_provisional_schedules) +// _schedule.incremental_dump_as_batsim_jobs_file(_dump_prefix); Schedule::JobAlloc alloc = _schedule.add_job_first_fit(job, _selector); +// if (_dump_provisional_schedules) +// _schedule.incremental_dump_as_batsim_jobs_file(_dump_prefix); if (alloc.started_in_first_slice) { @@ -94,8 +127,11 @@ void ConservativeBackfilling::make_decisions(double date, for (const std::string & job_id : _jobs_whose_waiting_time_estimation_has_been_requested_recently) { - const Job * new_job = (*_workload)[job_id]; - double answer = _schedule.query_wait(new_job->nb_requested_resources, new_job->walltime, _selector); - _decision->add_answer_estimate_waiting_time(job_id, answer, date); + const Job * new_job = (*_workload)[job_id]; + double answer = _schedule.query_wait(new_job->nb_requested_resources, new_job->walltime, _selector); + _decision->add_answer_estimate_waiting_time(job_id, answer, date); } + + if (_dump_provisional_schedules) + _schedule.incremental_dump_as_batsim_jobs_file(_dump_prefix); } diff --git a/src/algo/conservative_bf.hpp b/src/algo/conservative_bf.hpp index 3f2c5c423177d6dd6218eae6b5635196495da378..717a991479edb20b21ef1e992093d4e609820beb 100644 --- a/src/algo/conservative_bf.hpp +++ b/src/algo/conservative_bf.hpp @@ -24,4 +24,6 @@ public: private: Schedule _schedule; + bool _dump_provisional_schedules = false; + std::string _dump_prefix = "/tmp/dump"; }; diff --git a/src/algo/easy_bf.cpp b/src/algo/easy_bf.cpp index a8a2d9d0048b111a8ecb9dcb483a70092bc587cb..fe5ef9bec1e31e83cb8c0589a75cc56080f892ad 100644 --- a/src/algo/easy_bf.cpp +++ b/src/algo/easy_bf.cpp @@ -43,14 +43,26 @@ void EasyBackfilling::make_decisions(double date, _schedule.remove_job((*_workload)[ended_job_id]); // Let's handle recently released jobs + std::vector<std::string> recently_queued_jobs; for (const string & new_job_id : _jobs_released_recently) { const Job * new_job = (*_workload)[new_job_id]; if (new_job->nb_requested_resources > _nb_machines) + { + _decision->add_reject_job(new_job_id, date); + } + else if (!new_job->has_walltime) + { + LOG_SCOPE_FUNCTION(INFO); + LOG_F(INFO, "Date=%g. 
Rejecting job '%s' as it has no walltime", date, new_job_id.c_str()); _decision->add_reject_job(new_job_id, date); + } else + { _queue->append_job(new_job, update_info); + recently_queued_jobs.push_back(new_job_id); + } } // Let's update the schedule's present @@ -65,9 +77,9 @@ { int nb_available_machines = _schedule.begin()->available_machines.size(); - for (unsigned int i = 0; i < _jobs_released_recently.size() && nb_available_machines > 0; ++i) + for (unsigned int i = 0; i < recently_queued_jobs.size() && nb_available_machines > 0; ++i) { - const string & new_job_id = _jobs_released_recently[i]; + const string & new_job_id = recently_queued_jobs[i]; const Job * new_job = (*_workload)[new_job_id]; // The job could have already been executed by sort_queue_while_handling_priority_job, diff --git a/src/algo/easy_bf_fast.cpp b/src/algo/easy_bf_fast.cpp index 86d852cdb17b7448ad00e7692b74b82586daab97..fbb37c46a3aff419a32856abc537fce379e0ec64 100644 --- a/src/algo/easy_bf_fast.cpp +++ b/src/algo/easy_bf_fast.cpp @@ -1,5 +1,7 @@ #include "easy_bf_fast.hpp" +//#include <loguru.hpp> + #include "../pempek_assert.hpp" EasyBackfillingFast::EasyBackfillingFast(Workload *workload, @@ -72,15 +74,17 @@ void EasyBackfillingFast::make_decisions(double date, { if (_priority_job != nullptr) { + Allocation alloc; + FinishedHorizonPoint point; + if (_priority_job->nb_requested_resources <= _nb_available_machines) { - Allocation alloc; + //LOG_F(INFO, "Priority job fits!"); alloc.machines = _available_machines.left( _priority_job->nb_requested_resources); _decision->add_execute_job(_priority_job->id, alloc.machines, date); - FinishedHorizonPoint point; point.nb_released_machines = _priority_job->nb_requested_resources; point.date = date + (double)_priority_job->walltime; alloc.horizon_it = insert_horizon_point(point); @@ -124,14 +128,16 @@ break; } } + } - // Continue traversal, backfilling jobs that does not hinder - // priority job. + // Backfill jobs that do not hinder the priority job. + if (_nb_available_machines > 0) + { for (auto job_it = _pending_jobs.begin(); job_it != _pending_jobs.end(); ) { const Job * pending_job = *job_it; - // Does the job can be executed now ? + // Can the job be executed now ? if (pending_job->nb_requested_resources <= _nb_available_machines && date + pending_job->walltime <= _priority_job->completion_time) { @@ -150,6 +156,10 @@ _nb_available_machines -= pending_job->nb_requested_resources; _current_allocations[pending_job->id] = alloc; job_it = _pending_jobs.erase(job_it); + + // Directly get out of the backfilling loop if all machines are busy. + if (_nb_available_machines <= 0) + break; } else { @@ -168,10 +178,12 @@ // Can the job be executed right now? if (new_job->nb_requested_resources <= _nb_available_machines) { + //LOG_F(INFO, "There are enough available resources (%d) to execute job %s", _nb_available_machines, new_job->id.c_str()); // Can it be executed now (without hindering priority job?) if (_priority_job == nullptr || date + new_job->walltime <= _priority_job->completion_time) { + //LOG_F(INFO, "Job %s can be started right away!", new_job->id.c_str()); // Yes, the job can be executed right away! Allocation alloc; @@ -192,6 +204,8 @@ else { // No, the job cannot be executed (hinders priority job.) 
+ /*LOG_F(INFO, "Not enough time to execute job %s (walltime=%g, priority job expected starting time=%g)", + new_job->id.c_str(), (double)new_job->walltime, _priority_job->completion_time);*/ _pending_jobs.push_back(new_job); } } @@ -202,6 +216,8 @@ void EasyBackfillingFast::make_decisions(double date, // Is the job valid on this platform? if (new_job->nb_requested_resources > _nb_machines) { + /*LOG_F(INFO, "Rejecing job %s (required %d machines, while platform size is %d)", + new_job->id.c_str(), new_job->nb_requested_resources, _nb_machines);*/ _decision->add_reject_job(new_job_id, date); } else diff --git a/src/algo/energy_bf.cpp b/src/algo/energy_bf.cpp index ed51dd9ccdec772b7e7e85e90679a68bd8cee4d2..7f3cb7687583b66c78a1e7fec5fd54f7addeb143 100644 --- a/src/algo/energy_bf.cpp +++ b/src/algo/energy_bf.cpp @@ -1343,3 +1343,31 @@ bool EnergyBackfilling::is_fake_job(const std::string & job_id) { return boost::starts_with(job_id, "fakejob_"); } + +bool EnergyBackfilling::contains_any_fake_job(const Schedule &schedule) +{ + for (auto slice_it = schedule.begin(); slice_it != schedule.end(); ++slice_it) + { + for (auto mit : slice_it->allocated_jobs) + { + const Job * job = mit.first; + if (is_fake_job(job->id)) + return true; + } + } + return false; +} + +bool EnergyBackfilling::contains_any_nonfake_job(const Schedule &schedule) +{ + for (auto slice_it = schedule.begin(); slice_it != schedule.end(); ++slice_it) + { + for (auto mit : slice_it->allocated_jobs) + { + const Job * job = mit.first; + if (!is_fake_job(job->id)) + return true; + } + } + return false; +} diff --git a/src/algo/energy_bf.hpp b/src/algo/energy_bf.hpp index b66526b913b8b39343bf964020661dd3d4f07350..cba51a36e7d5a13eb00f023f5d56b498f546d133 100644 --- a/src/algo/energy_bf.hpp +++ b/src/algo/energy_bf.hpp @@ -156,6 +156,9 @@ protected: static bool is_potential_sleep_job(const std::string & job_id); static bool is_fake_job(const std::string & job_id); + static bool contains_any_fake_job(const Schedule & schedule); + static bool contains_any_nonfake_job(const Schedule & schedule); + protected: Schedule _schedule; bool _debug = false; diff --git a/src/algo/energy_bf_monitoring_inertial_shutdown.cpp b/src/algo/energy_bf_monitoring_inertial_shutdown.cpp index da2477ca5037b938303fa01436a1b7f6974fad45..3da4934be937b2fc4f0129044ada05332f18a765 100644 --- a/src/algo/energy_bf_monitoring_inertial_shutdown.cpp +++ b/src/algo/energy_bf_monitoring_inertial_shutdown.cpp @@ -120,26 +120,34 @@ void EnergyBackfillingMonitoringInertialShutdown::make_decisions(double date, "Invalid nb_machines_sedated_for_being_idle value: %d\n", _nb_machines_sedated_for_being_idle); - // Let's remove finished jobs from the schedule - for (const string & ended_job_id : _jobs_ended_recently) + if (!_jobs_ended_recently.empty()) { - const Job * ended_job = (*_workload)[ended_job_id]; - ++_nb_jobs_completed; - - PPK_ASSERT_ERROR(_schedule.contains_job(ended_job), - "Invalid schedule: job '%s' just finished, " - "but it not in the schedule...\n%s", - ended_job_id.c_str(), _schedule.to_string().c_str()); - PPK_ASSERT_ERROR(!_queue->contains_job(ended_job), - "Job '%s' just ended, but it is still in the " - "queue...\nQueue : %s", - ended_job_id.c_str(), - _queue->to_string().c_str()); - - // Let's remove the finished job from the schedule - _schedule.remove_job(ended_job); - } + // Let's remove finished jobs from the schedule + for (const string & ended_job_id : _jobs_ended_recently) + { + const Job * ended_job = (*_workload)[ended_job_id]; + ++_nb_jobs_completed; 
+ PPK_ASSERT_ERROR(_schedule.contains_job(ended_job), + "Invalid schedule: job '%s' just finished, " + "but it is not in the schedule...\n%s", + ended_job_id.c_str(), _schedule.to_string().c_str()); + PPK_ASSERT_ERROR(!_queue->contains_job(ended_job), + "Job '%s' just ended, but it is still in the " + "queue...\nQueue : %s", + ended_job_id.c_str(), + _queue->to_string().c_str()); + + // Let's remove the finished job from the schedule + _schedule.remove_job(ended_job); + } + + // Stop sending CALL_ME_LATER if all jobs have been executed. + if (_no_more_static_job_to_submit_received && + _queue->is_empty() && + !EnergyBackfilling::contains_any_nonfake_job(_schedule)) + _stop_sending_call_me_later = true; + } // Let's update the first slice of the schedule update_first_slice_taking_sleep_jobs_into_account(date); diff --git a/src/algo/energy_bf_monitoring_period.cpp b/src/algo/energy_bf_monitoring_period.cpp index 95b077746b3b0d526e24cbacd2c7774f43b38ea8..4da14adf48de917afedb271862ec50197b14cdd1 100644 --- a/src/algo/energy_bf_monitoring_period.cpp +++ b/src/algo/energy_bf_monitoring_period.cpp @@ -65,13 +65,16 @@ void EnergyBackfillingMonitoringPeriod::on_requested_call(double date) // Let's execute on_monitoring_stage on_monitoring_stage(date); - // Let's request a call for the next monitoring stage - _next_monitoring_period_expected_date = date + _period_between_monitoring_stages; - _decision->add_call_me_later((double)(_next_monitoring_period_expected_date), date); - _nb_call_me_later_running++; - - LOG_F(INFO, "EnergyBackfillingMonitoringPeriod: 'Chose to launch a call_me_later at %g", - (double)_next_monitoring_period_expected_date); + if (!_stop_sending_call_me_later) + { + // Let's request a call for the next monitoring stage + _next_monitoring_period_expected_date = date + _period_between_monitoring_stages; + _decision->add_call_me_later((double)(_next_monitoring_period_expected_date), date); + _nb_call_me_later_running++; + + LOG_F(INFO, "EnergyBackfillingMonitoringPeriod: Chose to launch a call_me_later at %g", + (double)_next_monitoring_period_expected_date); + } } } diff --git a/src/algo/energy_bf_monitoring_period.hpp b/src/algo/energy_bf_monitoring_period.hpp index 70553d504f25cdcecbe4d1f8c68f4ac58fd0d446..525e27fee3745e1b1afe4fd5b459e2fbef1a54e1 100644 --- a/src/algo/energy_bf_monitoring_period.hpp +++ b/src/algo/energy_bf_monitoring_period.hpp @@ -29,6 +29,7 @@ public: protected: std::string _output_dir; + bool _stop_sending_call_me_later = false; private: bool _monitoring_period_launched = false; diff --git a/src/algo/energy_watcher.cpp b/src/algo/energy_watcher.cpp index b22e4f2c28fec201ffc3b407b70c2c02a9808b42..0905f7899fec453db0cda714e7b36702d92385ae 100644 --- a/src/algo/energy_watcher.cpp +++ b/src/algo/energy_watcher.cpp @@ -51,6 +51,9 @@ void EnergyWatcher::make_decisions(double date, if (_consumed_joules_updated_recently) { + if (_previous_energy < 0) + _previous_energy = _consumed_joules; + PPK_ASSERT_ERROR(_consumed_joules - _previous_energy >= -1e-6, "Energy consumption inconsistency: it should be non-decreasing. 
" "Received %g but previous value is %g.", diff --git a/src/algo/fcfs.cpp b/src/algo/fcfs.cpp new file mode 100644 index 0000000000000000000000000000000000000000..97a948ed59c1309d7dd13055f76c8b4159697518 --- /dev/null +++ b/src/algo/fcfs.cpp @@ -0,0 +1,135 @@ +#include "fcfs.hpp" +#include <iostream> + +#include "../pempek_assert.hpp" + +FCFS::FCFS(Workload *workload, + SchedulingDecision *decision, Queue *queue, ResourceSelector *selector, + double rjms_delay, rapidjson::Document *variant_options) : + ISchedulingAlgorithm(workload, decision, queue, selector, rjms_delay, + variant_options) +{} + +FCFS::~FCFS() +{} + +void FCFS::on_simulation_start(double date, + const rapidjson::Value &batsim_config) +{ + (void) date; + (void) batsim_config; + + _available_machines.insert(IntervalSet::ClosedInterval(0, _nb_machines - 1)); + _nb_available_machines = _nb_machines; + PPK_ASSERT_ERROR(_available_machines.size() == (unsigned int) _nb_machines); +} + +void FCFS::on_simulation_end(double date) +{ + (void) date; +} + +void FCFS::make_decisions(double date, + SortableJobOrder::UpdateInformation *update_info, + SortableJobOrder::CompareInformation *compare_info) +{ + (void) update_info; + (void) compare_info; + + // This algorithm is a version of FCFS without backfilling. + // It is meant to be fast in the usual case, not to handle corner cases. + // It is not meant to be easily readable or hackable ;). + + // This fast FCFS variant in a few words: + // - only handles the FCFS queue order + // - only handles finite jobs (no switchoff) + // - only handles time as floating-point (-> precision errors). + + bool job_ended = false; + + // Handle newly finished jobs + for (const std::string & ended_job_id : _jobs_ended_recently) + { + job_ended = true; + + Job * finished_job = (*_workload)[ended_job_id]; + + // Update data structures + _available_machines.insert(_current_allocations[ended_job_id]); + _nb_available_machines += finished_job->nb_requested_resources; + _current_allocations.erase(ended_job_id); + } + + // If jobs have finished, execute jobs as long as they fit + if (job_ended) + { + for (auto job_it = _pending_jobs.begin(); + job_it != _pending_jobs.end(); ) + { + Job * pending_job = *job_it; + IntervalSet machines; + + if (_selector->fit(pending_job, _available_machines, machines)) + { + _decision->add_execute_job(pending_job->id, + machines, date); + + // Update data structures + _available_machines -= machines; + _nb_available_machines -= pending_job->nb_requested_resources; + _current_allocations[pending_job->id] = machines; + job_it = _pending_jobs.erase(job_it); + + } + else + { + // The job becomes priority! + // As there is no backfilling, we can simply leave this loop. + break; + } + } + } + + // Handle newly released jobs + for (const std::string & new_job_id : _jobs_released_recently) + { + Job * new_job = (*_workload)[new_job_id]; + + // Is this job valid? + if (new_job->nb_requested_resources > _nb_machines) + { + // Invalid! + _decision->add_reject_job(new_job_id, date); + continue; + } + + // Is there a waiting job? + if (!_pending_jobs.empty()) + { + // Yes. The new job is queued up. + _pending_jobs.push_back(new_job); + } + else + { + // No, the queue is empty. + // Can the new job be executed now? + if (new_job->nb_requested_resources <= _nb_available_machines) + { + // Yes, the job can be executed right away! 
+ IntervalSet machines = _available_machines.left( + new_job->nb_requested_resources); + _decision->add_execute_job(new_job_id, machines, date); + + // Update data structures + _available_machines -= machines; + _nb_available_machines -= new_job->nb_requested_resources; + _current_allocations[new_job_id] = machines; + } + else + { + // No. The job is queued up. + _pending_jobs.push_back(new_job); + } + } + } +} diff --git a/src/algo/fcfs.hpp b/src/algo/fcfs.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e2d1c925a755aec1e8207a786505e539dcd6beaf --- /dev/null +++ b/src/algo/fcfs.hpp @@ -0,0 +1,38 @@ +#pragma once + +#include <unordered_map> +#include <list> + +#include "../isalgorithm.hpp" +#include "../json_workload.hpp" +#include "../locality.hpp" + +class FCFS : public ISchedulingAlgorithm +{ +public: + FCFS(Workload * workload, SchedulingDecision * decision, + Queue * queue, ResourceSelector * selector, + double rjms_delay, + rapidjson::Document * variant_options); + virtual ~FCFS(); + + virtual void on_simulation_start(double date, + const rapidjson::Value & batsim_config); + + virtual void on_simulation_end(double date); + + virtual void make_decisions(double date, + SortableJobOrder::UpdateInformation * update_info, + SortableJobOrder::CompareInformation * compare_info); + +private: + // Machines currently available + IntervalSet _available_machines; + int _nb_available_machines = -1; + + // Pending jobs (queue) + std::list<Job *> _pending_jobs; + + // Allocations of running jobs + std::unordered_map<std::string, IntervalSet> _current_allocations; +}; diff --git a/src/algo/filler.cpp b/src/algo/filler.cpp index 99652d9e0bcf77949e7749b9507554fd6674531d..ecffb7887d7a907ba211760cdbba4ed497f1424c 100644 --- a/src/algo/filler.cpp +++ b/src/algo/filler.cpp @@ -68,13 +68,14 @@ void Filler::make_decisions(double date, // Let's update available machines for (const string & ended_job_id : _jobs_ended_recently) { - int nb_available_before = available_machines.size(); available_machines.insert(current_allocations[ended_job_id]); - int nb_job_resources = ceil((*_workload)[ended_job_id]->nb_requested_resources * fraction_of_machines_to_use); - PPK_ASSERT_ERROR(nb_available_before + nb_job_resources == (int)available_machines.size()); current_allocations.erase(ended_job_id); } + // Handle machine (un)availability from user events + unavailable_machines -= _machines_that_became_available_recently; + unavailable_machines += _machines_that_became_unavailable_recently; + // Let's handle recently released jobs for (const string & new_job_id : _jobs_released_recently) { @@ -94,18 +95,19 @@ void Filler::make_decisions(double date, void Filler::fill(double date) { + IntervalSet usable_machines = available_machines - unavailable_machines; if (_debug) - LOG_F(1, "fill, availableMachines=%s", available_machines.to_string_hyphen().c_str()); + LOG_F(1, "fill, usable_machines=%s", usable_machines.to_string_hyphen().c_str()); - int nb_available = available_machines.size(); - for (auto job_it = _queue->begin(); job_it != _queue->end() && nb_available > 0; ) + int nb_usable = usable_machines.size(); + for (auto job_it = _queue->begin(); job_it != _queue->end() && nb_usable > 0; ) { const Job * job = (*job_it)->job; // If it fits I sits (http://knowyourmeme.com/memes/if-it-fits-i-sits) IntervalSet used_machines; - if (_selector->fit(job, available_machines, used_machines)) + if (_selector->fit(job, usable_machines, used_machines)) { // Fewer machines might be used than those selected by the 
fitting algorithm int nb_machines_to_allocate = ceil(fraction_of_machines_to_use * job->nb_requested_resources); @@ -127,9 +129,9 @@ void Filler::fill(double date) current_allocations[job->id] = used_machines; + usable_machines.remove(used_machines); available_machines.remove(used_machines); - PPK_ASSERT_ERROR(nb_available - used_machines.size() == available_machines.size()); - nb_available -= used_machines.size(); + nb_usable -= used_machines.size(); if (set_job_metadata) _decision->add_set_job_metadata(job->id, diff --git a/src/algo/filler.hpp b/src/algo/filler.hpp index 2441093ca6950719ab01362318ebf80565c126ac..2aa15d94e50a442f02b220e44a984fda3fb613a4 100644 --- a/src/algo/filler.hpp +++ b/src/algo/filler.hpp @@ -36,7 +36,8 @@ private: bool set_job_metadata = false; //! If set to true, metadata will be associated to jobs when they are started. bool custom_mapping = true; - IntervalSet available_machines; + IntervalSet available_machines; // Corresponds to classical availability: no job is running on those machines. + IntervalSet unavailable_machines; // This is NOT the complement of available_machines! This corresponds to user-supplied events, which may overlap strangely with job executions as I write these lines. std::map<std::string, IntervalSet> current_allocations; bool _debug = true; }; diff --git a/src/external/taywee_args.hpp b/src/external/taywee_args.hpp index 655718fe279848ca9d407fc6bda292a4c4f46c6f..62bbfb3bccd9a0dbba2f993cba01b606d1b3d944 100644 --- a/src/external/taywee_args.hpp +++ b/src/external/taywee_args.hpp @@ -1761,7 +1761,7 @@ namespace args { parserCoroutine(coro.Parser()); } - catch (args::SubparserError) + catch (args::SubparserError&) { } #else diff --git a/src/isalgorithm.cpp b/src/isalgorithm.cpp index e1effd6c96d65b5f7b479f0d1df20f1fe963706d..f282844d63aa3bd59e4623ac97bf850f9596d84f 100644 --- a/src/isalgorithm.cpp +++ b/src/isalgorithm.cpp @@ -23,6 +23,8 @@ void ISchedulingAlgorithm::clear_recent_data_structures() _jobs_killed_recently.clear(); _jobs_whose_waiting_time_estimation_has_been_requested_recently.clear(); _machines_whose_pstate_changed_recently.clear(); + _machines_that_became_available_recently.clear(); + _machines_that_became_unavailable_recently.clear(); _nopped_recently = false; _consumed_joules_updated_recently = false; _consumed_joules = -1; @@ -91,6 +93,12 @@ void ISchedulingAlgorithm::on_no_more_static_job_to_submit_received(double date) _no_more_static_job_to_submit_received = true; } +void ISchedulingAlgorithm::on_no_more_external_event_to_occur(double date) +{ + (void) date; + _no_more_external_event_to_occur_received = true; +} + void ISchedulingAlgorithm::on_answer_energy_consumption(double date, double consumed_joules) { (void) date; @@ -98,6 +106,18 @@ void ISchedulingAlgorithm::on_answer_energy_consumption(double date, double cons _consumed_joules_updated_recently = true; } +void ISchedulingAlgorithm::on_machine_available_notify_event(double date, IntervalSet machines) +{ + (void) date; + _machines_that_became_available_recently += machines; +} + +void ISchedulingAlgorithm::on_machine_unavailable_notify_event(double date, IntervalSet machines) +{ + (void) date; + _machines_that_became_unavailable_recently += machines; +} + void ISchedulingAlgorithm::on_query_estimate_waiting_time(double date, const string &job_id) { (void) date; diff --git a/src/isalgorithm.hpp b/src/isalgorithm.hpp index 125707e68127b2e73dbcb06022cb5a66608774b8..ff6a8c308580b4fd1b1e28a88fa70b9baaa80fbb 100644 --- a/src/isalgorithm.hpp +++ b/src/isalgorithm.hpp @@ 
-84,6 +84,12 @@ public: */ virtual void on_no_more_static_job_to_submit_received(double date); + /** + * @brief This function is called when the on_no_more_external_event_to_occur + * notification is received + */ + virtual void on_no_more_external_event_to_occur(double date); + /** * @brief This function is called when an ANSWER message about energy consumption is received * @param[in] date The date at which the ANSWER message has been received @@ -91,6 +97,20 @@ public: */ virtual void on_answer_energy_consumption(double date, double consumed_joules); + /** + * @brief This function is called when a machine_available NOTIFY event is received. + * @param[in] date The date at which the NOTIFY event has been received. + * @param[in] machines The machines whose availability has changed. + */ + virtual void on_machine_available_notify_event(double date, IntervalSet machines); + + /** + * @brief This function is called when a machine_unavailable NOTIFY event is received. + * @param[in] date The date at which the NOTIFY event has been received. + * @param[in] machines The machines whose availability has changed. + */ + virtual void on_machine_unavailable_notify_event(double date, IntervalSet machines); + /** * @brief This function is called when a QUERY message about estimating waiting time of potential jobs is received. * @param[in] date The date at which the QUERY message has been received @@ -137,6 +157,7 @@ protected: int _nb_machines = -1; RedisStorage * _redis = nullptr; bool _no_more_static_job_to_submit_received = false; + bool _no_more_external_event_to_occur_received = false; protected: std::vector<std::string> _jobs_released_recently; @@ -144,6 +165,8 @@ protected: std::vector<std::string> _jobs_killed_recently; std::vector<std::string> _jobs_whose_waiting_time_estimation_has_been_requested_recently; std::map<int, IntervalSet> _machines_whose_pstate_changed_recently; + IntervalSet _machines_that_became_available_recently; + IntervalSet _machines_that_became_unavailable_recently; bool _nopped_recently; bool _consumed_joules_updated_recently; double _consumed_joules; diff --git a/src/main.cpp b/src/main.cpp index 9a2ac65f79c3d423711c77a2cf9b8267e49448ca..6b99cf72e5f6e19779922057580d4ea1fddadd5c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -33,6 +33,7 @@ #include "algo/energy_bf_machine_subpart_sleeper.hpp" #include "algo/energy_watcher.hpp" #include "algo/filler.hpp" +#include "algo/fcfs.hpp" #include "algo/fcfs_fast.hpp" #include "algo/killer.hpp" #include "algo/killer2.hpp" @@ -78,7 +79,7 @@ int main(int argc, char ** argv) "energy_bf", "energy_bf_dicho", "energy_bf_idle_sleeper", "energy_bf_monitoring", "energy_bf_monitoring_inertial", "energy_bf_subpart_sleeper", - "energy_watcher", "fcfs_fast", + "energy_watcher", "fcfs", "fcfs_fast", "filler", "killer", "killer2", "random", "rejecter", "sequencer", "sleeper", "submitter", "waiting_time_estimator"}; const set<string> policies_set = {"basic", "contiguous"}; @@ -138,7 +139,7 @@ int main(int argc, char ** argv) % flag_verbosity_level.Get() % verbosity_levels_string)); } - catch(args::Help) + catch(args::Help&) { parser.helpParams.addDefault = true; printf("%s", parser.Help().c_str()); @@ -149,12 +150,12 @@ int main(int argc, char ** argv) printf("%s", e.what()); return 0; } - catch(args::ParseError e) + catch(args::ParseError & e) { printf("%s\n", e.what()); return 1; } - catch(args::ValidationError e) + catch(args::ValidationError & e) { printf("%s\n", e.what()); return 1; @@ -284,6 +285,8 @@ int main(int argc, char ** argv) algo 
= new EnergyBackfillingMachineSubpartSleeper(&w, &decision, queue, selector, rjms_delay, &json_doc_variant_options); else if (scheduling_variant == "energy_watcher") algo = new EnergyWatcher(&w, &decision, queue, selector, rjms_delay, &json_doc_variant_options); + else if (scheduling_variant == "fcfs") + algo = new FCFS(&w, &decision, queue, selector, rjms_delay, &json_doc_variant_options); else if (scheduling_variant == "fcfs_fast") algo = new FCFSFast(&w, &decision, queue, selector, rjms_delay, &json_doc_variant_options); else if (scheduling_variant == "killer") @@ -502,6 +505,20 @@ void run(Network & n, ISchedulingAlgorithm * algo, SchedulingDecision & d, { algo->on_no_more_static_job_to_submit_received(current_date); } + else if (notify_type == "no_more_external_event_to_occur") + { + algo->on_no_more_external_event_to_occur(current_date); + } + else if (notify_type == "event_machine_available") + { + IntervalSet resources = IntervalSet::from_string_hyphen(event_data["resources"].GetString(), " "); + algo->on_machine_available_notify_event(current_date, resources); + } + else if (notify_type == "event_machine_unavailable") + { + IntervalSet resources = IntervalSet::from_string_hyphen(event_data["resources"].GetString(), " "); + algo->on_machine_unavailable_notify_event(current_date, resources); + } else { throw runtime_error("Unknown NOTIFY type received. Type = " + notify_type); diff --git a/src/schedule.cpp b/src/schedule.cpp index 8f7348e0aa7c0d9ac49470fe3ef70b1134f34f5d..c417394541e9e544c678bc0a75a17161afaba454 100644 --- a/src/schedule.cpp +++ b/src/schedule.cpp @@ -896,7 +896,7 @@ void Schedule::write_svg_to_file(const string &filename) const void Schedule::output_to_svg(const string &filename_prefix) { - const int bufsize = 128; + const int bufsize = 4096; char *buf = new char[bufsize]; snprintf(buf, bufsize, "%s%06d.svg", filename_prefix.c_str(), _output_number); @@ -907,6 +907,96 @@ void Schedule::output_to_svg(const string &filename_prefix) delete[] buf; } +void Schedule::dump_to_batsim_jobs_file(const string &filename) const +{ + ofstream f(filename); + if (f.is_open()) + { + f << "job_id,submission_time,requested_number_of_resources,requested_time,starting_time,finish_time,allocated_resources\n"; + + PPK_ASSERT_ERROR(_profile.size() > 0); + + const int buf_size = 4096; + char *buf = new char[buf_size]; + + map<const Job *, Rational> jobs_starting_times; + set<const Job *> current_jobs; + for (auto mit : _profile.begin()->allocated_jobs) + { + const Job *allocated_job = mit.first; + current_jobs.insert(allocated_job); + jobs_starting_times[allocated_job] = _profile.begin()->begin; + } + + // Let's traverse the profile to find the beginning of each job + for (auto slice_it = _profile.begin(); slice_it != _profile.end(); ++slice_it) + { + const TimeSlice &slice = *slice_it; + set<const Job *> allocated_jobs; + for (auto mit : slice.allocated_jobs) + { + const Job *job = mit.first; + allocated_jobs.insert(job); + } + + set<const Job *> finished_jobs; + set_difference(current_jobs.begin(), current_jobs.end(), allocated_jobs.begin(), allocated_jobs.end(), + std::inserter(finished_jobs, finished_jobs.end())); + + for (const Job *job : finished_jobs) + { + // Find where the job has been allocated + PPK_ASSERT_ERROR(slice_it != _profile.begin()); + auto previous_slice_it = slice_it; + --previous_slice_it; + IntervalSet job_machines = previous_slice_it->allocated_jobs.at(job); + + snprintf(buf, buf_size, "%s,%g,%d,%g,%g,%g,%s\n", + job->id.c_str(), + job->submission_time, + 
diff --git a/src/schedule.cpp b/src/schedule.cpp
index 8f7348e0aa7c0d9ac49470fe3ef70b1134f34f5d..c417394541e9e544c678bc0a75a17161afaba454 100644
--- a/src/schedule.cpp
+++ b/src/schedule.cpp
@@ -896,7 +896,7 @@ void Schedule::write_svg_to_file(const string &filename) const
 
 void Schedule::output_to_svg(const string &filename_prefix)
 {
-    const int bufsize = 128;
+    const int bufsize = 4096;
     char *buf = new char[bufsize];
 
     snprintf(buf, bufsize, "%s%06d.svg", filename_prefix.c_str(), _output_number);
@@ -907,6 +907,96 @@ void Schedule::output_to_svg(const string &filename_prefix)
     delete[] buf;
 }
 
+void Schedule::dump_to_batsim_jobs_file(const string &filename) const
+{
+    ofstream f(filename);
+    if (f.is_open())
+    {
+        f << "job_id,submission_time,requested_number_of_resources,requested_time,starting_time,finish_time,allocated_resources\n";
+
+        PPK_ASSERT_ERROR(_profile.size() > 0);
+
+        const int buf_size = 4096;
+        char *buf = new char[buf_size];
+
+        map<const Job *, Rational> jobs_starting_times;
+        set<const Job *> current_jobs;
+        for (auto mit : _profile.begin()->allocated_jobs)
+        {
+            const Job *allocated_job = mit.first;
+            current_jobs.insert(allocated_job);
+            jobs_starting_times[allocated_job] = _profile.begin()->begin;
+        }
+
+        // Let's traverse the profile to find the beginning of each job
+        for (auto slice_it = _profile.begin(); slice_it != _profile.end(); ++slice_it)
+        {
+            const TimeSlice &slice = *slice_it;
+            set<const Job *> allocated_jobs;
+            for (auto mit : slice.allocated_jobs)
+            {
+                const Job *job = mit.first;
+                allocated_jobs.insert(job);
+            }
+
+            set<const Job *> finished_jobs;
+            set_difference(current_jobs.begin(), current_jobs.end(), allocated_jobs.begin(), allocated_jobs.end(),
+                           std::inserter(finished_jobs, finished_jobs.end()));
+
+            for (const Job *job : finished_jobs)
+            {
+                // Find where the job has been allocated
+                PPK_ASSERT_ERROR(slice_it != _profile.begin());
+                auto previous_slice_it = slice_it;
+                --previous_slice_it;
+                IntervalSet job_machines = previous_slice_it->allocated_jobs.at(job);
+
+                snprintf(buf, buf_size, "%s,%g,%d,%g,%g,%g,%s\n",
+                         job->id.c_str(),
+                         job->submission_time,
+                         job->nb_requested_resources,
+                         (double)job->walltime,
+                         (double)jobs_starting_times[job],
+                         (double)slice_it->begin,
+                         job_machines.to_string_hyphen(" ", "-").c_str());
+                f << buf;
+            }
+
+            set<const Job *> new_jobs;
+            set_difference(allocated_jobs.begin(), allocated_jobs.end(), current_jobs.begin(), current_jobs.end(),
+                           std::inserter(new_jobs, new_jobs.end()));
+
+            for (const Job *job : new_jobs)
+            {
+                jobs_starting_times[job] = slice.begin;
+            }
+
+            // Update current_jobs
+            for (const Job *job : finished_jobs)
+                current_jobs.erase(job);
+            for (const Job *job : new_jobs)
+                current_jobs.insert(job);
+        }
+
+        delete[] buf;
+    }
+
+    f.close();
+}
+
+void Schedule::incremental_dump_as_batsim_jobs_file(const string &filename_prefix)
+{
+    const int bufsize = 4096;
+    char *buf = new char[bufsize];
+
+    snprintf(buf, bufsize, "%s%06d.csv", filename_prefix.c_str(), _output_number);
+    _output_number = (_output_number + 1) % 10000000;
+
+    dump_to_batsim_jobs_file(buf);
+
+    delete[] buf;
+}
+
 int Schedule::nb_machines() const
 {
     return _nb_machines;
diff --git a/src/schedule.hpp b/src/schedule.hpp
index d82e2ff54c5b8764300a4884e2690114c9b9c047..cf90c5f1d1b6672923241792d04c04b05d2fbac3 100644
--- a/src/schedule.hpp
+++ b/src/schedule.hpp
@@ -101,6 +101,9 @@ public:
     void write_svg_to_file(const std::string & filename) const;
     void output_to_svg(const std::string & filename_prefix = "/tmp/schedule");
 
+    void dump_to_batsim_jobs_file(const std::string & filename) const;
+    void incremental_dump_as_batsim_jobs_file(const std::string & filename_prefix = "/tmp/schedule");
+
     int nb_machines() const;
 
 private:
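A short editorial usage sketch for the two Schedule methods added above (the function `debug_dump` is hypothetical; the CSV header and the `/tmp/schedule%06d.csv` naming come from the patch itself):

```cpp
#include "schedule.hpp"

// Hypothetical helper an algorithm holding a Schedule could call while debugging.
void debug_dump(Schedule & schedule)
{
    // One-shot dump of the currently planned schedule, one CSV row per job:
    // job_id,submission_time,requested_number_of_resources,requested_time,starting_time,finish_time,allocated_resources
    schedule.dump_to_batsim_jobs_file("/tmp/my_schedule.csv");

    // Numbered dumps (/tmp/schedule000000.csv, /tmp/schedule000001.csv, ...),
    // handy once per scheduling round to see how the plan evolves over time.
    schedule.incremental_dump_as_batsim_jobs_file();
}
```

Note a design detail visible in the patch: `incremental_dump_as_batsim_jobs_file` shares the `_output_number` counter with `output_to_svg`, so interleaving CSV and SVG dumps advances the same sequence.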
diff --git a/test/Makefile b/test/Makefile
deleted file mode 100644
index 9df0ce1916172219329ebbb4faf41dcc80c16233..0000000000000000000000000000000000000000
--- a/test/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-test:
-	nix-shell ../ci -A test_deps_pinned --command 'cd .. && bash ./ci/run-tests.bash'
diff --git a/test/README.md b/test/README.md
index 877a581bffda6a8bac4487ecfcd344ed56e1296c..68c1c437c53a0dd6092f05bc00ba27947629cad4 100644
--- a/test/README.md
+++ b/test/README.md
@@ -1,10 +1,18 @@
 ### Running tests
 
+``` bash
+nix-shell ../release.nix -A integration_tests --command 'pytest'
+# or just pytest, but you must prepare your env...
 ```
-make test
-# or pytest, but you must prepare your env, run redis...
+
+Optionally, use Batsim's binary cache to avoid recompiling many things (e.g., SimGrid).
+
+``` bash
+nix-env -iA cachix -f https://cachix.org/api/v1/install # installs cachix
+cachix use batsim # add Batsim's cachix storage as a Nix remote cache
 ```
-### How it works?
+### How does it work?
+0. nix-shell puts you into an environment where batsched, batsim, robin, redis, etc. are available (code in [release.nix])
 1. pytest generates combinations of test input (code in [conftest.py])
 2. for each combination of inputs: (code in [test_runner.py])
    1. pytest generates a [robin] input file
@@ -19,11 +27,12 @@
 robin test-instances/FAILING-TEST.yaml
 ```
 You can also run batsim and batsched in different terminals:
 ``` bash
-# feel free to hack — e.g., prepend commands with gdb, valgrind...
+# feel free to hack these files — e.g., prepend commands with gdb, valgrind...
 ./test-out/FAILING-TEST/cmd/batsim.bash
 ./test-out/FAILING-TEST/cmd/sched.bash
 ```
+[release.nix]: ../release.nix
 [conftest.py]: ./conftest.py
 [test_runner.py]: ./test_runner.py
 [robin]: https://framagit.org/batsim/batexpe
diff --git a/test/conftest.py b/test/conftest.py
index a7e3527a38460cdda8d28a13a0ec45470cc58bb4..7ffdfe8b8bb504047e010489a0f6821c119fd18b 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
-from collections import namedtuple
 import glob
+import pytest
+import subprocess
+from collections import namedtuple
 from os.path import abspath, basename
 
 Workload = namedtuple('Workload', ['name', 'filename'])
@@ -60,3 +62,23 @@ def pytest_generate_tests(metafunc):
 
     if 'upper_llh_threshold' in metafunc.fixturenames:
         metafunc.parametrize('upper_llh_threshold', [60])
+
+@pytest.fixture(scope="session", autouse=True)
+def manage_redis_server(request):
+    print('Trying to run a redis-server...')
+    proc = subprocess.Popen('redis-server', stdout=subprocess.PIPE)
+    try:
+        out, _ = proc.communicate(timeout=1)
+        if 'Address already in use' in str(out):
+            print("Could not run redis-server (address already in use).")
+            print("Assuming that the process using the TCP port is another redis-server instance and going on.")
+        else:
+            raise Exception("Could not run redis-server (unhandled reason), aborting.")
+    except subprocess.TimeoutExpired:
+        print('redis-server has been running for 1 second.')
+        print('Assuming redis-server has started successfully and going on.')
+
+    def on_finalize():
+        print('Killing the spawned redis-server (if any)...')
+        proc.kill()
+    request.addfinalizer(on_finalize)