diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a4b8080..14caee2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -19,8 +19,7 @@ jobs:
   finalize:
     timeout-minutes: 10
     needs:
-      - unit-tests
-      - example-pull-gcr
+      - test-htc
     # Important: the next line MUST be `if: always()`.
     # Do not change that line.
     # That line is necessary to make sure that this job runs even if tests fail.
@@ -28,46 +27,13 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - run: |
-          echo unit-tests: ${{ needs.unit-tests.result }}
-          echo example-pull-gcr: ${{ needs.example-pull-gcr.result }}
+          echo test-htc: ${{ needs.test-htc.result }}
       - run: exit 1
         # The last line must NOT end with ||
         # All other lines MUST end with ||
         if: |
-          (needs.unit-tests.result != 'success') ||
-          (needs.example-pull-gcr.result != 'success')
-  unit-tests:
-    runs-on: ubuntu-latest
-    timeout-minutes: 20
-    strategy:
-      fail-fast: false
-      matrix:
-        version:
-          - '1.2' # minimum Julia version supported in Project.toml
-          - '1.6' # previous LTS
-          - '1.10' # current LTS
-          - '1' # automatically expands to the latest stable 1.x release of Julia
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          persist-credentials: false
-      - uses: julia-actions/setup-julia@v2
-        with:
-          version: ${{ matrix.version }}
-      - uses: julia-actions/julia-runtest@v1
-      - run: find . -type f -name '*.cov'
-      # - run: find . -type f -name '*.cov' -exec cat {} \;
-      - uses: julia-actions/julia-processcoverage@v1
-      - uses: codecov/codecov-action@v5
-        with:
-          files: lcov.info
-          token: ${{ secrets.CODECOV_TOKEN }}
-          # If this PR is from a fork, then do NOT fail CI if the Codecov upload errors.
-          # If this PR is NOT from a fork, then DO fail CI if the Codecov upload errors.
-          # If this is not a PR, then DO fail CI if the Codecov upload errors.
-          fail_ci_if_error: ${{ github.event_name != 'pull_request' || github.repository == github.event.pull_request.head.repo.full_name }}
-  test-slurm:
-    if: false
+          (needs.test-htc.result != 'success')
+  test-htc:
     runs-on: ubuntu-latest
     timeout-minutes: 20
     strategy:
@@ -76,82 +42,49 @@ jobs:
         version:
           # Please note: You must specify the full Julia version number (major.minor.patch).
           # This is because the value here will be directly interpolated into a download URL.
-          # - '1.2.0' # minimum Julia version supported in Project.toml
+          - '1.0.5' # minimum Julia version supported in Project.toml
           - '1.6.7' # previous LTS
           - '1.10.7' # current LTS
           - '1.11.2' # currently the latest stable release
     steps:
-      - uses: actions/checkout@v4
-        with:
-          persist-credentials: false
-      - name: Print Docker version
-        run: |
-          docker --version
-          docker version
-      # This next bit of code is taken from:
-      # https://github.com/kleinhenz/SlurmClusterManager.jl
-      # Original author: Joseph Kleinhenz
-      # License: MIT
-      - name: Setup Slurm inside Docker
-        run: |
-          docker version
-          docker compose version
-          docker build --build-arg "JULIA_VERSION=${MATRIX_JULIA_VERSION:?}" -t slurm-cluster-julia -f ci/Dockerfile .
-          docker compose -f ci/docker-compose.yml up -d
-          docker ps
-        env:
-          MATRIX_JULIA_VERSION: ${{matrix.version}}
-      - name: Print some information for debugging purposes
-        run: |
-          docker exec -t slurmctld pwd
-          docker exec -t slurmctld ls -la
-          docker exec -t slurmctld ls -la HTCondorClusterManager
-      - name: Instantiate package
-        run: docker exec -t slurmctld julia --project=HTCondorClusterManager -e 'import Pkg; @show Base.active_project(); Pkg.instantiate(); Pkg.status()'
-      - name: Run tests without a Slurm allocation
-        run: docker exec -t slurmctld julia --project=HTCondorClusterManager -e 'import Pkg; Pkg.test(; test_args=["slurm"])'
-      - name: Run tests inside salloc
-        run: docker exec -t slurmctld salloc -t 00:10:00 -n 2 julia --project=HTCondorClusterManager -e 'import Pkg; Pkg.test(; test_args=["slurm"], coverage=true)'
-      - name: Run tests inside sbatch
-        run: docker exec -t slurmctld HTCondorClusterManager/ci/run_my_sbatch.sh
-      - run: find . -type f -name '*.cov'
-      - name: Copy .cov files out of the Docker container
-        run: docker exec slurmctld /bin/bash -c 'cd /home/docker/HTCondorClusterManager && tar -cf - src/*.cov' | tar -xvf -
-      - run: find . -type f -name '*.cov'
-      # - run: find . -type f -name '*.cov' -exec cat {} \;
-      - uses: julia-actions/julia-processcoverage@v1
-      - uses: codecov/codecov-action@v5
-        with:
-          files: lcov.info
-          token: ${{ secrets.CODECOV_TOKEN }}
-          # If this PR is from a fork, then do NOT fail CI if the Codecov upload errors.
-          # If this PR is NOT from a fork, then DO fail CI if the Codecov upload errors.
-          # If this is not a PR, then DO fail CI if the Codecov upload errors.
-          fail_ci_if_error: ${{ github.event_name != 'pull_request' || github.repository == github.event.pull_request.head.repo.full_name }}
-  example-pull-gcr:
-    runs-on: ubuntu-latest
-    timeout-minutes: 20
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          persist-credentials: false
-      - name: Print Docker version
-        run: |
-          docker --version
-          docker version
-      # - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
-      #   if: false
-      #   with:
-      #     registry: ghcr.io
-      #     username: ${{ github.actor }}
-      #     password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Docker pull
-        run: |
-          docker pull "ghcr.io/${MY_GCR_NAMESPACE:?}/${MY_GCR_IMAGENAME:?}@${MY_DIGEST_HTCONDOR_EXECUTE:?}"
-        env:
-          # We intentionally pull by full-length digest (NOT tag) for reproducibility:
-          MY_DIGEST_HTCONDOR_EXECUTE: 'sha256:d4384c19cdb2f26bae15b1feef3a12bde66554658562df2626c03ae870003555' # htcondor-execute
-      - name: List images
-        run: |
-          docker ps -a
-          docker images
+      - uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: ${{ matrix.version }}
+      - run: docker version
+      - run: docker compose version
+      - run: docker compose pull
+        working-directory: ci/htcondor
+      - run: docker compose build --build-arg JULIA_VERSION="${MATRIX_VERSION:?}"
+        working-directory: ci/htcondor
+        env:
+          MATRIX_VERSION: ${{matrix.version}}
+      - run: ./start-htcondor.sh
+        working-directory: ci/htcondor
+      - run: docker compose exec -T --user submituser submit condor_submit --help
+        working-directory: ci/htcondor
+      - run: docker compose exec -T --user submituser submit julia --version
+        working-directory: ci/htcondor
+      - run: docker compose exec -T --user submituser submit julia --project=/SlurmClusterManager -e 'import Pkg; Pkg.test()'
+        working-directory: ci/htcondor
+      - run: find . -type f -name '*.cov'
+      - name: Copy .cov files out of the Docker container
+        # This step runs from the repository root so that `src/*.cov` is extracted into `./src`.
+        # The Compose file is therefore passed explicitly with `-f` (it is not in the working directory).
+        run: |
+          # docker compose -f ci/htcondor/docker-compose.yml exec -T --user submituser submit /bin/bash -c 'cd ~/HTCondorClusterManager && tar -cf - src/*.cov' | tar -xvf -
+          docker compose -f ci/htcondor/docker-compose.yml exec -T --user submituser execute1 /bin/bash -c 'cd ~/HTCondorClusterManager && tar -cf - src/*.cov' | tar -xvf -
+          # docker compose -f ci/htcondor/docker-compose.yml exec -T --user submituser execute2 /bin/bash -c 'cd ~/HTCondorClusterManager && tar -cf - src/*.cov' | tar -xvf -
+      - run: find . -type f -name '*.cov'
+      # - run: find . -type f -name '*.cov' -exec cat {} \;
+      - uses: julia-actions/julia-processcoverage@v1
+      - uses: codecov/codecov-action@v5
+        with:
+          files: lcov.info
+          token: ${{ secrets.CODECOV_TOKEN }}
+          # If this PR is from a fork, then do NOT fail CI if the Codecov upload errors.
+          # If this PR is NOT from a fork, then DO fail CI if the Codecov upload errors.
+          # If this is not a PR, then DO fail CI if the Codecov upload errors.
+          fail_ci_if_error: ${{ github.event_name != 'pull_request' || github.repository == github.event.pull_request.head.repo.full_name }}
diff --git a/ci/Dockerfile b/ci/Dockerfile
deleted file mode 100644
index 4f7cc33..0000000
--- a/ci/Dockerfile
+++ /dev/null
@@ -1,21 +0,0 @@
-# This file is taken from:
-# https://github.com/kleinhenz/SlurmClusterManager.jl
-# Original author: Joseph Kleinhenz
-# License: MIT
-
-FROM jkleinh/slurm-cluster@sha256:afd20dafc831b0fa781460dc871232579ccf1b54955e434531394c331ce388e4 as base
-MAINTAINER Joseph Kleinhenz
-
-ARG JULIA_VERSION=1.6.0
-
-RUN mkdir -p /home/docker/.local/opt/julia \
-    && cd /home/docker/.local/opt/julia \
-    && folder="$(echo ${JULIA_VERSION} | cut -d. -f1-2)" \
-    && curl -L https://julialang-s3.julialang.org/bin/linux/x64/${folder}/julia-${JULIA_VERSION}-linux-x86_64.tar.gz | tar xz --strip 1 \
-    && /home/docker/.local/opt/julia/bin/julia --version
-
-ENV PATH="/home/docker/.local/opt/julia/bin:${PATH}"
-
-COPY --chown=docker . ClusterManagers
-
-CMD /bin/bash -l
diff --git a/ci/docker-compose.yml b/ci/docker-compose.yml
deleted file mode 100644
index 86b1df3..0000000
--- a/ci/docker-compose.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-# This file is taken from:
-# https://github.com/kleinhenz/SlurmClusterManager.jl
-# Original author: Joseph Kleinhenz
-# License: MIT
-
-version: "3.3"
-
-services:
-  slurmctld:
-    image: slurm-cluster-julia
-    command: ["slurmctld"]
-    container_name: slurmctld
-    hostname: slurmctld
-    volumes:
-      - slurm_jobdir:/home/docker
-      - var_log_slurm:/var/log/slurm
-    expose:
-      - "6817"
-
-  c1:
-    image: slurm-cluster-julia
-    command: ["slurmd"]
-    hostname: c1
-    container_name: c1
-    volumes:
-      - slurm_jobdir:/home/docker
-      - var_log_slurm:/var/log/slurm
-    expose:
-      - "6818"
-    depends_on:
-      - "slurmctld"
-
-  c2:
-    image: slurm-cluster-julia
-    command: ["slurmd"]
-    hostname: c2
-    container_name: c2
-    volumes:
-      - slurm_jobdir:/home/docker
-      - var_log_slurm:/var/log/slurm
-    expose:
-      - "6818"
-    depends_on:
-      - "slurmctld"
-
-volumes:
-  slurm_jobdir:
-  var_log_slurm:
diff --git a/ci/htcondor/Dockerfile b/ci/htcondor/Dockerfile
new file mode 100644
index 0000000..25f84c3
--- /dev/null
+++ b/ci/htcondor/Dockerfile
@@ -0,0 +1,43 @@
+# We intentionally use full-length digests (NOT tags) for reproducibility.
+# FROM ghcr.io/juliaparallel/dask-jobqueue-ci-images:htcondor-submit as submit
+FROM ghcr.io/juliaparallel/dask-jobqueue-ci-images@sha256:5ada6445b5d8b53b6693ab86be364dd1ce385ada8e53763731ba50d145f0350d as submit
+
+# We intentionally have no default value:
+ARG JULIA_VERSION
+
+RUN mkdir -p /home/docker/.local/opt/julia \
+    && cd /home/docker/.local/opt/julia \
+    && folder="$(echo "${JULIA_VERSION:?}" | cut -d. -f1-2)" \
+    && curl -L "https://julialang-s3.julialang.org/bin/linux/x64/${folder}/julia-${JULIA_VERSION}-linux-x86_64.tar.gz" | tar xz --strip 1 \
+    && /home/docker/.local/opt/julia/bin/julia --version
+
+ENV PATH="/home/docker/.local/opt/julia/bin:${PATH}"
+
+RUN bash -c "pwd"
+
+COPY --chown=docker . SlurmClusterManager
+RUN chmod -R u+rw,g+rw,o+rw SlurmClusterManager
+
+SHELL ["conda", "run", "-n", "dask-jobqueue", "/bin/bash", "-c"]
+
+# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+# We intentionally use full-length digests (NOT tags) for reproducibility.
+# FROM ghcr.io/juliaparallel/dask-jobqueue-ci-images:htcondor-execute as execute
+FROM ghcr.io/juliaparallel/dask-jobqueue-ci-images@sha256:5723d0380f627779bc54a31ebac9a77f0937189453f597845411257dea6ac0db as execute
+
+# We intentionally have no default value:
+ARG JULIA_VERSION
+
+RUN mkdir -p /home/docker/.local/opt/julia \
+    && cd /home/docker/.local/opt/julia \
+    && folder="$(echo "${JULIA_VERSION:?}" | cut -d. -f1-2)" \
+    && curl -L "https://julialang-s3.julialang.org/bin/linux/x64/${folder}/julia-${JULIA_VERSION}-linux-x86_64.tar.gz" | tar xz --strip 1 \
+    && /home/docker/.local/opt/julia/bin/julia --version
+
+ENV PATH="/home/docker/.local/opt/julia/bin:${PATH}"
+
+RUN bash -c "pwd"
+
+COPY --chown=docker . SlurmClusterManager
+RUN chmod -R u+rw,g+rw,o+rw SlurmClusterManager
diff --git a/ci/htcondor/docker-compose.yml b/ci/htcondor/docker-compose.yml
new file mode 100644
index 0000000..5388f4d
--- /dev/null
+++ b/ci/htcondor/docker-compose.yml
@@ -0,0 +1,77 @@
+# This file is based on:
+# https://github.com/dask/dask-jobqueue/blob/main/ci/htcondor/docker-compose.yml
+# License: BSD 3-Clause
+
+# version: "3.4"
+
+services:
+  cm:
+    # We intentionally use full-length digests (NOT tags) for reproducibility.
+    #
+    # TODO: mirror this image in our own GCR, instead of needing it to exist in Docker Hub.
+    # image: htcondor/cm:el7
+    image: htcondor/cm@sha256:71cfed5ffc1dc78cb725f571e6be6acdb50ca0322c9cc9cd500a965be7e402c6
+    hostname: cm.htcondor
+    environment:
+      - USE_POOL_PASSWORD=yes
+    volumes:
+      - secrets:/root/secrets
+      - ./condor_config.local:/etc/condor/condor_config.local
+    command: bash -c 'condor_store_cred -p password -f /root/secrets/pool_password ; exec bash -x /start.sh'
+
+  submit:
+    # image: ghcr.io/juliaparallel/dask-jobqueue-ci-images@sha256:5ada6445b5d8b53b6693ab86be364dd1ce385ada8e53763731ba50d145f0350d
+    build:
+      context: ../..
+      dockerfile: ci/htcondor/Dockerfile
+      target: submit
+    hostname: submit.htcondor
+    environment:
+      - CONDOR_HOST=cm
+      - USE_POOL_PASSWORD=yes
+      - CI_SHARED_SPACE=/shared_space
+    depends_on:
+      - cm
+    volumes:
+      - secrets:/root/secrets
+      - ../..:/dask-jobqueue
+      - ./condor_config.local:/etc/condor/condor_config.local
+      - shared_space:/shared_space
+
+  execute1:
+    # image: ghcr.io/juliaparallel/dask-jobqueue-ci-images@sha256:5723d0380f627779bc54a31ebac9a77f0937189453f597845411257dea6ac0db
+    build:
+      context: ../..
+      dockerfile: ci/htcondor/Dockerfile
+      target: execute
+    hostname: execute1.htcondor
+    environment:
+      - CONDOR_HOST=cm
+      - USE_POOL_PASSWORD=yes
+    depends_on:
+      - cm
+    volumes:
+      - secrets:/root/secrets
+      - ./condor_config.local:/etc/condor/condor_config.local
+      - shared_space:/shared_space
+
+  execute2:
+    # image: ghcr.io/juliaparallel/dask-jobqueue-ci-images@sha256:5723d0380f627779bc54a31ebac9a77f0937189453f597845411257dea6ac0db
+    build:
+      context: ../..
+      dockerfile: ci/htcondor/Dockerfile
+      target: execute
+    hostname: execute2.htcondor
+    environment:
+      - CONDOR_HOST=cm
+      - USE_POOL_PASSWORD=yes
+    depends_on:
+      - cm
+    volumes:
+      - secrets:/root/secrets
+      - ./condor_config.local:/etc/condor/condor_config.local
+      - shared_space:/shared_space
+
+volumes:
+  secrets:
+  shared_space:
diff --git a/ci/htcondor/start-htcondor.sh b/ci/htcondor/start-htcondor.sh
new file mode 100755
index 0000000..50e4b3a
--- /dev/null
+++ b/ci/htcondor/start-htcondor.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# This file is based on:
+# https://github.com/dask/dask-jobqueue/blob/main/ci/htcondor/docker-compose.yml
+# License: BSD 3-Clause
+
+docker compose up -d --no-build
+
+while [ `docker compose exec -T submit condor_status -af activity|grep Idle|wc -l` -ne 2 ]
+  do
+    echo "Waiting for cluster to become ready";
+    sleep 2
+  done
+echo "HTCondor properly configured"
diff --git a/ci/my_sbatch.sh b/ci/my_sbatch.sh
deleted file mode 100644
index 33d98a8..0000000
--- a/ci/my_sbatch.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-# Slurm options:
-#SBATCH --ntasks=2
-#SBATCH --time=00:10:00
-
-# Important note:
-# There should be no non-comment non-whitespace lines above this line.
-
-set -euf -o pipefail
-
-set -x
-
-julia --project=ClusterManagers -e 'import Pkg; Pkg.test(; test_args=["slurm"])'
diff --git a/ci/run_my_sbatch.sh b/ci/run_my_sbatch.sh
deleted file mode 100755
index 509a18d..0000000
--- a/ci/run_my_sbatch.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-set -euf -o pipefail
-
-set -x
-
-rm -fv "${HOME:?}/my_stdout.txt"
-rm -fv "${HOME:?}/my_stderr.txt"
-
-sbatch --wait --output="${HOME:?}/my_stdout.txt" --error="${HOME:?}/my_stderr.txt" ./ClusterManagers/ci/my_sbatch.sh
-
-sleep 5
-cat "${HOME:?}/my_stdout.txt"
-cat "${HOME:?}/my_stderr.txt"
diff --git a/src/condor.jl b/src/condor.jl
index fa4a4d4..8cf9825 100644
--- a/src/condor.jl
+++ b/src/condor.jl
@@ -51,6 +51,21 @@ function condor_script(portnum::Integer, np::Integer, params::Dict)
     "$tdir/$jobname.sub"
 end
 
+# Call `f` after every 5-second sleep until it returns `true`, then return `nothing`.
+# Throws an error once roughly `timeout_seconds` have elapsed without `f` returning `true`.
+function _my_wait_with_timeout(f::Function; timeout_seconds)
+    each_sleep_duration = 5
+    for i = 1:each_sleep_duration:timeout_seconds
+        sleep(each_sleep_duration)
+        result = f()
+        if result
+            return nothing
+        end
+    end
+    msg = "Timeout ($(timeout_seconds) seconds) exceeded"
+    error(msg)
+end
+
 function launch(manager::HTCManager, params::Dict, instances_arr::Array, c::Condition)
     let
         mgr_desc = "HTCondor"
@@ -68,9 +83,22 @@ function launch(manager::HTCManager, params::Dict, instances_arr::Array, c::Cond
 
         script = condor_script(portnum, np, params)
         cmd = `condor_submit $script`
-        if !success(cmd)
-            println("batch queue not available (could not run condor_submit)")
-            return
+        pipeline = Base.pipeline(ignorestatus(cmd); stdout=Base.stdout, stderr=Base.stderr)
+        proc = run(pipeline; wait = false)
+        _my_wait_with_timeout(; timeout_seconds = 5 * 60) do
+            @info "condor_q:"
+            run(`condor_q`)
+            @info "condor_status:"
+            run(`condor_status`)
+            Base.process_exited(proc)
+        end
+        if !Base.process_exited(proc)
+            @error "batch queue not available (could not run condor_submit)" Base.process_exited(proc)
+            return nothing
+        end
+        if !success(proc)
+            @error "batch queue not available (could not run condor_submit)" Base.process_exited(proc) success(proc)
+            return nothing
         end
         print("Waiting for $np workers: ")
 
@@ -86,9 +114,12 @@ function launch(manager::HTCManager, params::Dict, instances_arr::Array, c::Cond
         end
         println(".")
 
-    catch e
+    catch ex
+        bt = catch_backtrace()
+        @error "Error launching HTCondor" exception=(ex,bt)
+        # @error "Error launching HTCondor" exception=ex
         println("Error launching condor")
-        println(e)
+        println(ex)
     end
 end
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 6f55139..536a0f0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -13,8 +13,5 @@ using Test: @testset, @test, @test_skip
 using HTCondorClusterManager: addprocs_htc, HTCManager
 
 @testset "HTCondorClusterManager.jl" begin
-    include("elastic.jl")
-
     include("condor.jl")
-
 end # @testset