2 changes: 1 addition & 1 deletion bin/submit_build.py
@@ -77,7 +77,7 @@ def main():
# Easybuild default paths
# start using environment from local machine, job scripts get custom paths
ebconf = {
'accept-eula-for': 'Intel-oneAPI,CUDA,cuDNN',
'accept-eula-for': 'Intel-oneAPI,CUDA,cuDNN,NVHPC',
'buildpath': os.path.join(job['tmp'], 'eb-submit-build-fetch'),
'hooks': hooks_hydra.__file__,
'include-easyblocks': os.path.join(VSCSOFTSTACK_ROOT, EASYBLOCK_REPO),
130 changes: 90 additions & 40 deletions src/build_tools/hooks_hydra.py
@@ -67,14 +67,26 @@
LOCAL_ARCH_SUFFIX = os.getenv('VSC_ARCH_SUFFIX')
LOCAL_ARCH_FULL = f'{LOCAL_ARCH}{LOCAL_ARCH_SUFFIX}'

VALID_TCGENS = ['2024a']
VALID_MODULES_SUBDIRS = VALID_TCGENS + ['system']
VALID_TCS = ['foss', 'intel', 'gomkl', 'gimkl', 'gimpi']
VALID_TOOLCHAINS = {
'2024a': {
'toolchains': ['foss', 'intel', 'gomkl', 'gimkl', 'gimpi'],
'subdir': '2024a',
},
'25.1': {
'toolchains': ['nvidia-compilers', 'NVHPC'],
'subdir': '2024a',
},
}
VALID_MODULES_SUBDIRS = ['system', '2024a']

SUBDIR_MODULES_BWRAP = '.modules_bwrap'
SUFFIX_MODULES_PATH = 'collection'
SUFFIX_MODULES_SYMLINK = 'all'

##################
# MODULE FOOTERS #
##################

INTEL_MPI_MOD_FOOTER = """
if ( os.getenv("SLURM_JOB_ID") ) then
setenv("I_MPI_HYDRA_BOOTSTRAP", "slurm")
@@ -89,6 +101,17 @@
setenv("JAVA_TOOL_OPTIONS", "-Xmx" .. math.floor(mem*0.8))
end
"""
GPU_DUMMY_MOD_FOOTER = """
if mode() == "load" and not os.getenv("BUILD_TOOLS_LOAD_DUMMY_MODULES") then
LmodError([[
This module is only available on nodes with a GPU.
Jobs can request GPUs with the command 'srun --gpus-per-node=1' or 'sbatch --gpus-per-node=1'.

More information in the VUB-HPC docs:
https://hpc.vub.be/docs/job-submission/gpu-job-types/#gpu-jobs
]])
end
"""


def get_group(name, version):
@@ -118,22 +141,26 @@ def get_tc_versions():
update_build_option('hooks', None)

tc_versions = {}
for toolcgen in VALID_TCGENS:
tc_versions[toolcgen] = []
for toolc in VALID_TCS:
for tcgen, tcgen_spec in VALID_TOOLCHAINS.items():
tcgen_versions = []
for tc_name in tcgen_spec['toolchains']:
try:
tc_versions[toolcgen].extend(get_toolchain_hierarchy({'name': toolc, 'version': toolcgen}))
tcgen_versions.extend(get_toolchain_hierarchy({'name': tc_name, 'version': tcgen}))
except EasyBuildError:
# skip if no easyconfig found for toolchain-version
pass
tc_versions[tcgen] = {
'toolchains': tcgen_versions,
'subdir': tcgen_spec['subdir'],
}

update_build_option('hooks', hooks)
return tc_versions
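
With VALID_TOOLCHAINS in place, get_tc_versions() now returns one entry per toolchain generation, pairing the resolved (sub)toolchain hierarchy with the modules subdir that generation installs into. A purely illustrative sketch of the returned shape, assuming easyconfigs exist for the listed toolchains (actual entries come from get_toolchain_hierarchy()):

# Illustrative only: the hierarchy entries below are examples, not an exhaustive list
tc_versions = {
    '2024a': {
        'toolchains': [
            {'name': 'foss', 'version': '2024a'},
            {'name': 'gompi', 'version': '2024a'},
            {'name': 'GCC', 'version': '13.3.0'},
            # ... remaining (sub)toolchains of the 2024a generation
        ],
        'subdir': '2024a',
    },
    '25.1': {
        'toolchains': [
            {'name': 'NVHPC', 'version': '25.1'},
            {'name': 'nvidia-compilers', 'version': '25.1'},
            # ... any further subtoolchains resolved for this stack
        ],
        'subdir': '2024a',  # NVHPC-based builds share the 2024a module tree
    },
}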


def calc_tc_gen(name, version, tcname, tcversion, easyblock):
def calc_tc_gen_subdir(name, version, tcname, tcversion, easyblock):
"""
calculate the toolchain generation
calculate the toolchain generation subdir
return False if not valid
"""
name_version = {'name': name, 'version': version}
@@ -143,10 +170,11 @@ def calc_tc_gen(name, version, tcname, tcversion, easyblock):
tc_versions = get_tc_versions()

# (software with) valid (sub)toolchain-version combination
for toolcgen in VALID_TCGENS:
if toolchain in tc_versions[toolcgen] or name_version in tc_versions[toolcgen]:
log_msg = f"Determined toolchain generation {toolcgen} for {software}"
return toolcgen, log_msg
for tcgen, tcgen_spec in tc_versions.items():
if toolchain in tcgen_spec['toolchains'] or name_version in tcgen_spec['toolchains']:
tcgen_subdir = tcgen_spec['subdir']
log_msg = f"Determined toolchain generation subdir '{tcgen_subdir}' for {software}"
return tcgen_subdir, log_msg

# invalid toolchains
# all toolchains have 'system' toolchain, so we need to handle the invalid toolchains separately
@@ -157,17 +185,31 @@

# software with 'system' toolchain: return 'system'
if tcname == 'system':
log_msg = f"Determined toolchain {tcname} for {software}"
return tcname, log_msg
tcgen_subdir = 'system'
log_msg = f"Determined toolchain '{tcgen_subdir}' for {software}"
return tcgen_subdir, log_msg

log_msg = f"Invalid toolchain {tcname} and/or toolchain version {tcversion} for {software}"
return False, log_msg


def is_gpu_software(ec):
"determine if it is a GPU-only installation"
gpu_components = ['CUDA']
gpu_toolchains = ['nvidia-compilers', 'NVHPC']

is_gpu_package = ec.name in gpu_components or ec.name in gpu_toolchains
needs_gpu_toolchain = ec.toolchain.name in gpu_toolchains
needs_gpu_component = any([x in ec['versionsuffix'] for x in gpu_components])

return is_gpu_package or needs_gpu_toolchain or needs_gpu_component
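
Compared with the previous inline check ('CUDA' in the software name or versionsuffix), is_gpu_software() also flags packages built with the nvidia-compilers or NVHPC toolchains. A rough sketch of the same three conditions using plain strings instead of an EasyConfig object (the function name and sample inputs are hypothetical):

def sketch_is_gpu_software(name, toolchain_name, versionsuffix):
    """Stand-in for is_gpu_software() that takes plain strings instead of an EasyConfig"""
    gpu_components = ['CUDA']
    gpu_toolchains = ['nvidia-compilers', 'NVHPC']
    return (
        name in gpu_components + gpu_toolchains
        or toolchain_name in gpu_toolchains
        or any(x in versionsuffix for x in gpu_components)
    )

print(sketch_is_gpu_software('CUDA', 'system', ''))               # True: GPU component itself
print(sketch_is_gpu_software('OpenMPI', 'NVHPC', ''))             # True: built with a GPU toolchain
print(sketch_is_gpu_software('GROMACS', 'foss', '-CUDA-12.6.0'))  # True: CUDA versionsuffix
print(sketch_is_gpu_software('GROMACS', 'foss', ''))              # False: CPU-only build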


def update_moduleclass(ec):
"update the moduleclass of an easyconfig to <tc_gen>/all"
tc_gen, log_msg = calc_tc_gen(
ec.name, ec.version, ec.toolchain.name, ec.toolchain.version, ec.easyblock)
tc_gen, log_msg = calc_tc_gen_subdir(
ec.name, ec.version, ec.toolchain.name, ec.toolchain.version, ec.easyblock
)

if not tc_gen:
raise EasyBuildError("[parse hook] " + log_msg)
@@ -276,6 +318,14 @@ def parse_hook(ec, *args, **kwargs): # pylint: disable=unused-argument
ec['dependencies'] = [d for d in ec['dependencies'] if 'libfabric' not in d]
ec.log.info("[parse hook] Removed libfabric from dependency list")

if ec.name == 'NVHPC':
# NVHPC ships with OpenMPI v4, which has an issue between its hwloc
# and Slurm cgroups v2 that results in mpirun trying to use cores that
# are not allocated to the job (see https://github.com/open-mpi/ompi/issues/12470).
# Only mpirun is affected; the workaround is to set '--bind-to=none':
ec.log.info("[parse hook] Disable mpirun process binding in NVHPC")
ec['modextravars'].update({'OMPI_MCA_hwloc_base_binding_policy': 'none'})

if ec.name == 'Gurobi':
# use centrally installed Gurobi license file, and don't copy to installdir
ec['license_file'] = '/apps/brussel/licenses/gurobi/gurobi.lic'
@@ -310,17 +360,27 @@ def parse_hook(ec, *args, **kwargs): # pylint: disable=unused-argument
ec.toolchain.options['optarch'] = optarchs_intel[LOCAL_ARCH]
ec.log.info(f"[parse hook] Set optarch in parameter toolchainopts: {ec.toolchain.options['optarch']}")

# skip installation of CUDA software in non-GPU architectures, only create module file
is_cuda_software = 'CUDA' in ec.name or 'CUDA' in ec['versionsuffix']
if is_cuda_software and LOCAL_ARCH_FULL not in GPU_ARCHS:
###############################
# ------ GPU MODULES -------- #
###############################

# skip installation of GPU software on non-GPU architectures, only create a dummy module file
if is_gpu_software(ec) and LOCAL_ARCH_FULL not in GPU_ARCHS:
ec.log.info("[parse hook] Generating dummy GPU module on non-GPU node")
# inject error message in module file
ec['modluafooter'] = GPU_DUMMY_MOD_FOOTER
# workaround for NVHPC
if ec.name == 'NVHPC':
ec['default_cuda_version'] = '0'
# module_only steps: [MODULE_STEP, PREPARE_STEP, READY_STEP, POSTITER_STEP, SANITYCHECK_STEP]
ec['module_only'] = True
ec.log.info(f"[parse hook] Set parameter module_only: {ec['module_only']}")
ec['skipsteps'] = [SANITYCHECK_STEP]
ec.log.info(f"[parse hook] Set parameter skipsteps: {ec['skipsteps']}")

# set cuda compute capabilities
elif is_cuda_software:
elif is_gpu_software(ec):
# on GPU nodes set cuda compute capabilities
ec['cuda_compute_capabilities'] = ARCHS[LOCAL_ARCH_FULL]['cuda_cc']
ec.log.info(f"[parse hook] Set parameter cuda_compute_capabilities: {ec['cuda_compute_capabilities']}")

@@ -390,7 +450,10 @@ def pre_configure_hook(self, *args, **kwargs): # pylint: disable=unused-argument


def pre_module_hook(self, *args, **kwargs): # pylint: disable=unused-argument
"""Hook at pre-module level to alter module files"""
"""
Hook at pre-module level to alter module files
WARNING: this hook triggers *after* sanity checks
"""

# Must be done this way, updating self.cfg['modextravars']
# directly doesn't work due to templating.
@@ -449,6 +512,11 @@ def pre_module_hook(self, *args, **kwargs): # pylint: disable=unused-argument
self.cfg['modextravars'].update({'SLURM_ENABLED': "1"})
self.cfg['modextravars'].update({'SCHEDULER_TIGHT_COUPLING': "1"})

if self.name == 'NVHPC':
slurm_mpi_type = 'pmix'
self.log.info("[pre-module hook] Set Slurm MPI type to: %s", slurm_mpi_type)
self.cfg['modextravars'].update({'SLURM_MPI_TYPE': slurm_mpi_type})

##########################
# ------ TUNING -------- #
##########################
@@ -545,24 +613,6 @@ def pre_module_hook(self, *args, **kwargs): # pylint: disable=unused-argument
else:
self.cfg['docurls'] = [usage_info['link']]

#################################
# ------ DUMMY MODULES -------- #
#################################

is_cuda_software = 'CUDA' in self.name or 'CUDA' in self.cfg['versionsuffix']
if is_cuda_software and LOCAL_ARCH_FULL not in GPU_ARCHS:
self.log.info("[pre-module hook] Creating dummy module for CUDA modules on non-GPU nodes")
self.cfg['modluafooter'] = """
if mode() == "load" and not os.getenv("BUILD_TOOLS_LOAD_DUMMY_MODULES") then
LmodError([[
This module is only available on nodes with a GPU.
Jobs can request GPUs with the command 'srun --gpus-per-node=1' or 'sbatch --gpus-per-node=1'.

More information in the VUB-HPC docs:
https://hpc.vub.be/docs/job-submission/gpu-job-types/#gpu-jobs
]])
end"""


def post_build_and_install_loop_hook(ecs_with_res):
"""
2 changes: 1 addition & 1 deletion src/build_tools/package.py
@@ -16,7 +16,7 @@
@author: Alex Domingo (Vrije Universiteit Brussel)
"""

VERSION = '4.3.3'
VERSION = '4.4.0'

AUTHOR = {
'wp': 'Ward Poelmans',
6 changes: 3 additions & 3 deletions tests/test_hooks_hydra.py
@@ -44,8 +44,8 @@
('fosscuda', '2023a', 'system', 'system', 'Toolchain', False),
],
)
def test_calc_tc_gen(toolchain, set_up_config):
def test_calc_tc_gen_subdir(toolchain, set_up_config):
name, version, tcname, tcversion, easyblock, expected_generation = toolchain
generation, _ = hooks_hydra.calc_tc_gen(name, version, tcname, tcversion, easyblock)
generation, _ = hooks_hydra.calc_tc_gen_subdir(name, version, tcname, tcversion, easyblock)

assert generation == expected_generation
assert generation['toolchains'] == expected_generation
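
The parametrized cases could also exercise the new 25.1 generation; a possible extra entry (hypothetical, and assuming NVHPC 25.1 easyconfigs are available to the test configuration) would be expected to resolve to the '2024a' subdir:

# possible additional parametrize case for the 25.1 generation (hypothetical)
('NVHPC', '25.1', 'system', 'system', 'Toolchain', '2024a'),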