Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 109 additions & 52 deletions .github/workflows/build-hpc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,106 +2,143 @@ name: build-hpc

# Controls when the action will run
on:

# Trigger the workflow on all pushes to main, except on tag creation
push:
branches:
- main
- main
tags-ignore:
- '**'

- '**'
# Trigger the workflow on all pull requests
pull_request: ~

# Allow workflow to be dispatched on demand
workflow_dispatch: ~

# Trigger after public PR approved for CI
pull_request_target:
types: [labeled]

jobs:
ci-hpc:
name: ci-hpc
if: ${{ !github.event.pull_request.head.repo.fork && github.event.action != 'labeled' || github.event.label.name == 'approved-for-ci' }}
name: ci-hpc - ${{ matrix.name }}
if: ${{ (!github.event.pull_request.head.repo.fork) && (github.event.action != 'labeled' || github.event.label.name == 'approved-for-ci') }}

strategy:
fail-fast: false # false: try to complete all jobs

matrix:
cuda: [False, True]
openmp: [False, True]
name:
- ac-gpu nvhpc

build_type: [RelWithDebInfo]
site: [ac-batch]
name:
- "ac-cpu intel"
- "ac-cpu gnu"
- "ac-gpu nvhpc OpenACC + CUDA"
- "ac-gpu nvhpc OMP offload"
- "ac-gpu nvhpc OpenACC"
- "ac-gpu nvhpc OMP offload + CUDA"

include:
- name: ac-gpu nvhpc
site: ac-batch
sbatch_options: |

# ------------------
# CPU builds
# ------------------
- name: "ac-cpu intel"
compiler: intel-classic
cuda: false
openmp: false
# Define CPU sbatch once, reuse via *sbatch_cpu
sbatch_options: &sbatch_cpu |
#SBATCH --time=00:20:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=32
#SBATCH --hint=nomultithread
#SBATCH --qos=np
envfile: arch/ecmwf/hpc2020/intel/2021.4.0/env.sh

- name: "ac-cpu gnu"
compiler: gnu-14
cuda: false
openmp: false
sbatch_options: *sbatch_cpu
envfile: arch/ecmwf/hpc2020/gnu/14.2.0/env.sh


# ------------------
# GPU builds (nvhpc)
# ------------------
- name: "ac-gpu nvhpc OpenACC + CUDA"
compiler: nvhpc
cuda: true
openmp: false
# Define GPU sbatch once, reuse in the next GPU entries
sbatch_options: &sbatch_gpu |
#SBATCH --time=00:20:00
#SBATCH --nodes=1
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=32
#SBATCH --gpus-per-task=1
#SBATCH --mem=100G
#SBATCH --qos=dg
cmake_toolchain: &nvhpc_toolchain -DCMAKE_TOOLCHAIN_FILE=arch/ecmwf/hpc2020/nvhpc/24.5/toolchain.cmake
envfile: &nvhpc_envfile arch/ecmwf/hpc2020/nvhpc/24.5/env.sh

- name: "ac-gpu nvhpc OMP offload"
compiler: nvhpc
cuda: false
openmp: true
sbatch_options: *sbatch_gpu
cmake_toolchain: *nvhpc_toolchain
envfile: *nvhpc_envfile

- name: "ac-gpu nvhpc OMP offload + CUDA"
compiler: nvhpc
cuda: true
openmp: true
sbatch_options: *sbatch_gpu
cmake_toolchain: *nvhpc_toolchain
envfile: *nvhpc_envfile

- name: "ac-gpu nvhpc OpenACC"
compiler: nvhpc
cuda: false
openmp: false
sbatch_options: *sbatch_gpu
cmake_toolchain: *nvhpc_toolchain
envfile: *nvhpc_envfile

runs-on: [self-hosted, linux, hpc]

env:
GH_TOKEN: ${{ github.token }}

steps:
- uses: ecmwf-actions/reusable-workflows/ci-hpc-generic@v2
with:
site: ${{ matrix.site }}
troika_user: ${{ secrets.HPC_CI_SSH_USER }}
sbatch_options: ${{ matrix.sbatch_options }}
template_data: |
modules:
- cmake
- ninja
- ecbuild
- prgenv/nvidia
- nvidia/24.5
- python3
cmake_options:
- -DENABLE_OMP_OFFLOAD=${{ matrix.openmp }}
- -DENABLE_CUDA=${{ matrix.cuda }}
- -DENABLE_IO_PARALLEL=ON
- -DENABLE_MPI=ON
- -DENABLE_SINGLE_PRECISION=ON
- -DENABLE_DOUBLE_PRECISION=ON
- -DCMAKE_TOOLCHAIN_FILE=arch/ecmwf/hpc2020/nvhpc/24.5/toolchain.cmake
- -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.cmake_toolchain || '' }}
dependencies:
ecmwf/fckit:
version: 0.13.0
version: 0.14.0
cmake_options:
- -DENABLE_TESTS=OFF
- -DENABLE_FCKIT_VENV=ON
- -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
ecmwf-ifs/fiat:
version: 1.4.1
version: 1.6.1
cmake_options:
- -DENABLE_TESTS=OFF
- -DENABLE_SINGLE_PRECISION=ON
- -DENABLE_DOUBLE_PRECISION=ON
- -DENABLE_MPI=ON
- -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
template: |
{% for module in modules %}
module load {{module}}
{% endfor %}
BASEDIR=$PWD
{% for name, options in dependencies.items() %}
mkdir -p {{name}}
pushd {{name}}
git init
git remote add origin ${{ github.server_url }}/{{name}}
git fetch origin {{options['version']}}
git reset --hard FETCH_HEAD
cmake -G Ninja -S . -B build \
{% for name in dependencies %}
{% set org, proj = name.split('/') %}
-D{{proj}}_ROOT=$BASEDIR/{{name}}/installation \
{% endfor %}
{{ options['cmake_options']|join(' ') }}
cmake --build build
cmake --install build --prefix installation
popd
{% endfor %}
REPO=${{ github.event.pull_request.head.repo.full_name || github.repository }}
SHA=${{ github.event.pull_request.head.sha || github.sha }}
mkdir -p $REPO
Expand All @@ -110,12 +147,32 @@ jobs:
git remote add origin ${{ github.server_url }}/$REPO
git fetch origin $SHA
git reset --hard FETCH_HEAD
source ${{ matrix.envfile }}
popd
cmake -G Ninja -S $REPO -B build \
BASEDIR=$PWD
{% for name, options in dependencies.items() %}
mkdir -p {{ name }}
pushd {{ name }}
git init
git remote add origin ${{ github.server_url }}/{{ name }}
git fetch origin {{ options['version'] }}
git reset --hard FETCH_HEAD
cmake -G Ninja -S . -B build \
{% for name in dependencies %}
{% set org, proj = name.split('/') %}
-D{{proj}}_ROOT=$BASEDIR/{{name}}/installation \
{% endfor %}
{{ options['cmake_options']|join(' ') }}

cmake --build build
cmake --install build --prefix installation
popd
{% endfor %}
cmake -G "Unix Makefiles" -S $REPO -B build \
{% for name in dependencies %}
{% set org, proj = name.split('/') %}
-D{{proj}}_ROOT=$BASEDIR/{{name}}/installation \
{% endfor %}
{{ cmake_options|join(' ') }}
cmake --build build
ctest --test-dir build
ctest -VV --test-dir build --output-on-failure
39 changes: 39 additions & 0 deletions arch/ecmwf/hpc2020/gnu/14.2.0/env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# (C) Copyright 1988- ECMWF.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks a lot for adding the env files, these are super handy 👌 I think the toolchain files are overkill though and unnecessary. Up until now, the default ecbuild flags, plus whatever is set inside FIELD_API's compiler flags macro, have been enough to get correct behaviour on Intel and GNU. So which of these flags or ecbuild options are necessary?

If the tests pass without the toolchain files, I would be happier if both the Intel and GNU toolchains were to be removed and only the env files were retained.

#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

# Store tracing and disable (module is *way* too verbose)
# `$-` lists the shell's active option flags; deleting every character other
# than `x` leaves tracing_ equal to "x" when xtrace was on and empty otherwise.
# stderr of the group is discarded so any trace output from this bookkeeping
# stays hidden.
{ tracing_=${-//[^x]/}; set +x; } 2>/dev/null

# Load one environment module, echoing the command first so the log shows
# what is being loaded (shell tracing is switched off while this file runs).
#   $1 - module name, optionally with a version (e.g. gcc/14.2.0)
module_load() {
  echo "+ module load $1"
  # Quoted so the name reaches `module` as a single word, with no word
  # splitting or glob expansion.
  module load "$1"
}
# Unload one environment module, echoing the command first so the log shows
# what is being unloaded (shell tracing is switched off while this file runs).
#   $1 - module name, with or without a version
module_unload() {
  echo "+ module unload $1"
  # Quoted so the name reaches `module` as a single word, with no word
  # splitting or glob expansion.
  module unload "$1"
}

# Unload all modules to be certain
# (`module purge` is called directly, so unlike module_load it is not echoed)
module purge

# Load modules
# NOTE(review): the boost and java entries are commented out — presumably not
# needed for this build; confirm before re-enabling.
module_load prgenv/gnu
module_load gcc/14.2.0
module_load hpcx-openmpi/2.18.1
#module_load boost/1.87.0
module_load hdf5/1.14.6
module_load cmake/4.0.2
module_load python3/3.11.10-01
#module_load java/11.0.6
module_load ninja/1.12.1
module_load ecbuild/3.12.0


# Restore tracing to stored setting
# (tracing_ is "x" if xtrace was on before it was disabled above, empty
# otherwise; stderr is discarded so the restore itself produces no trace)
{ if [[ -n "$tracing_" ]]; then set -x; else set +x; fi } 2>/dev/null

46 changes: 46 additions & 0 deletions arch/ecmwf/hpc2020/intel/2021.4.0/env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# (C) Copyright 1988- ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

# Source me to get the correct configure/build/run environment

# Store tracing and disable (module is *way* too verbose)
# `$-` lists the shell's active option flags; deleting every character other
# than `x` leaves tracing_ equal to "x" when xtrace was on and empty otherwise.
# stderr of the group is discarded so any trace output from this bookkeeping
# stays hidden.
{ tracing_=${-//[^x]/}; set +x; } 2>/dev/null

# Load one environment module, echoing the command first so the log shows
# what is being loaded (shell tracing is switched off while this file runs).
#   $1 - module name, optionally with a version (e.g. intel/2021.4.0)
module_load() {
  echo "+ module load $1"
  # Quoted so the name reaches `module` as a single word, with no word
  # splitting or glob expansion.
  module load "$1"
}
# Unload one environment module, echoing the command first so the log shows
# what is being unloaded (shell tracing is switched off while this file runs).
#   $1 - module name, with or without a version (e.g. openmpi)
module_unload() {
  echo "+ module unload $1"
  # Quoted so the name reaches `module` as a single word, with no word
  # splitting or glob expansion.
  module unload "$1"
}

# Unload the listed modules to be certain
# (this is a fixed list rather than a full `module purge`: anything not named
# here — e.g. ecbuild or ninja — is left as it was before the loads below)
module_unload intel
module_unload openmpi
module_unload hpcx-openmpi
module_unload hdf5
module_unload cmake
module_unload python3

# Load modules
# Each call goes through module_load, which echoes the command before running it.
module_load prgenv/intel
module_load intel/2021.4.0
module_load hpcx-openmpi/2.18.1
module_load hdf5/1.12.2
module_load cmake/4.0.2
module_load python3/3.11.10-01
module_load ecbuild/3.12.0
module_load ninja/1.12.1


# Restore tracing to stored setting
# (tracing_ is "x" only if xtrace was active when this file was sourced; no
# unconditional `set -x` is needed first, because this statement alone decides
# the final xtrace state)
{ if [[ -n "$tracing_" ]]; then set -x; else set +x; fi } 2>/dev/null

12 changes: 9 additions & 3 deletions arch/ecmwf/hpc2020/nvhpc/24.5/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,14 @@ module_purge
# Load modules
module_load prgenv/nvidia
module_load nvidia/24.5
module_load python3/3.10.10-01
module_load cmake/3.25.2
module_load ecbuild/3.8.5
module_load python3/3.11.10-01
module_load cmake/4.0.2
module_load ecbuild/3.12.0
# Go through the module_load wrapper like the neighbouring entries instead of
# calling `module load` directly.
module_load ninja/1.12.1
module_load hdf5/1.14.3
module_load hpcx-openmpi/2.19.0-cuda



# Increase stack size to maximum
ulimit -S -s unlimited
Expand All @@ -45,3 +50,4 @@ ulimit -S -l unlimited
# Export the toolchain.cmake located in this script's own directory as
# ECBUILD_TOOLCHAIN. BASH_SOURCE[0] is the path of this file while it is being
# sourced; the expansions are quoted so a checkout path containing spaces is
# not split into several words.
path=${BASH_SOURCE[0]}
DIR_PATH=$(dirname "$path")
export ECBUILD_TOOLCHAIN="$DIR_PATH/toolchain.cmake"

Loading