2 changes: 1 addition & 1 deletion bin/submit_build.py
@@ -77,7 +77,7 @@ def main():
# Easybuild default paths
# start using environment from local machine, job scripts get custom paths
ebconf = {
'accept-eula-for': 'Intel-oneAPI,CUDA,cuDNN',
'accept-eula-for': 'Intel-oneAPI,CUDA,cuDNN,NVHPC',
'buildpath': os.path.join(job['tmp'], 'eb-submit-build-fetch'),
'hooks': hooks_hydra.__file__,
'include-easyblocks': os.path.join(VSCSOFTSTACK_ROOT, EASYBLOCK_REPO),
130 changes: 90 additions & 40 deletions src/build_tools/hooks_hydra.py
@@ -67,14 +67,26 @@
LOCAL_ARCH_SUFFIX = os.getenv('VSC_ARCH_SUFFIX')
LOCAL_ARCH_FULL = f'{LOCAL_ARCH}{LOCAL_ARCH_SUFFIX}'

VALID_TCGENS = ['2024a']
VALID_MODULES_SUBDIRS = VALID_TCGENS + ['system']
VALID_TCS = ['foss', 'intel', 'gomkl', 'gimkl', 'gimpi']
VALID_TOOLCHAINS = {
'2024a': {
'toolchains': ['foss', 'intel', 'gomkl', 'gimkl', 'gimpi'],
'subdir': '2024a',
},
'25.1': {
'toolchains': ['nvidia-compilers', 'NVHPC'],
'subdir': '2024a',
},
}
VALID_MODULES_SUBDIRS = ['system', '2024a']

SUBDIR_MODULES_BWRAP = '.modules_bwrap'
SUFFIX_MODULES_PATH = 'collection'
SUFFIX_MODULES_SYMLINK = 'all'

##################
# MODULE FOOTERS #
##################

INTEL_MPI_MOD_FOOTER = """
if ( os.getenv("SLURM_JOB_ID") ) then
setenv("I_MPI_HYDRA_BOOTSTRAP", "slurm")
@@ -89,6 +101,17 @@
setenv("JAVA_TOOL_OPTIONS", "-Xmx" .. math.floor(mem*0.8))
end
"""
GPU_DUMMY_MOD_FOOTER = """
if mode() == "load" and not os.getenv("BUILD_TOOLS_LOAD_DUMMY_MODULES") then
LmodError([[
This module is only available on nodes with a GPU.
Jobs can request GPUs with the command 'srun --gpus-per-node=1' or 'sbatch --gpus-per-node=1'.

More information in the VUB-HPC docs:
https://hpc.vub.be/docs/job-submission/gpu-job-types/#gpu-jobs
]])
end
"""


def get_group(name, version):
@@ -118,22 +141,26 @@ def get_tc_versions():
update_build_option('hooks', None)

tc_versions = {}
for toolcgen in VALID_TCGENS:
tc_versions[toolcgen] = []
for toolc in VALID_TCS:
for tcgen, tcgen_spec in VALID_TOOLCHAINS.items():
tcgen_versions = []
for tc_name in tcgen_spec['toolchains']:
try:
tc_versions[toolcgen].extend(get_toolchain_hierarchy({'name': toolc, 'version': toolcgen}))
tcgen_versions.extend(get_toolchain_hierarchy({'name': tc_name, 'version': tcgen}))
except EasyBuildError:
# skip if no easyconfig found for toolchain-version
pass
tc_versions[tcgen] = {
'toolchains': tcgen_versions,
'subdir': tcgen_spec['subdir'],
}

update_build_option('hooks', hooks)
return tc_versions
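
With VALID_TOOLCHAINS in place, get_tc_versions() now returns one entry per toolchain generation, pairing the resolved (sub)toolchain hierarchy with the modules subdir that generation installs into. A purely illustrative sketch of the returned shape, assuming easyconfigs exist for the listed toolchains (actual entries come from get_toolchain_hierarchy()):

# Illustrative only: the hierarchy entries below are examples, not an exhaustive list
tc_versions = {
    '2024a': {
        'toolchains': [
            {'name': 'foss', 'version': '2024a'},
            {'name': 'gompi', 'version': '2024a'},
            {'name': 'GCC', 'version': '13.3.0'},
            # ... remaining (sub)toolchains of the 2024a generation
        ],
        'subdir': '2024a',
    },
    '25.1': {
        'toolchains': [
            {'name': 'NVHPC', 'version': '25.1'},
            {'name': 'nvidia-compilers', 'version': '25.1'},
            # ... any further subtoolchains resolved for this stack
        ],
        'subdir': '2024a',  # NVHPC-based builds share the 2024a module tree
    },
}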


def calc_tc_gen(name, version, tcname, tcversion, easyblock):
def calc_tc_gen_subdir(name, version, tcname, tcversion, easyblock):
"""
calculate the toolchain generation
calculate the toolchain generation subdir
return False if not valid
"""
name_version = {'name': name, 'version': version}
@@ -143,10 +170,11 @@ def calc_tc_gen(name, version, tcname, tcversion, easyblock):
tc_versions = get_tc_versions()

# (software with) valid (sub)toolchain-version combination
for toolcgen in VALID_TCGENS:
if toolchain in tc_versions[toolcgen] or name_version in tc_versions[toolcgen]:
log_msg = f"Determined toolchain generation {toolcgen} for {software}"
return toolcgen, log_msg
for tcgen, tcgen_spec in tc_versions.items():
if toolchain in tcgen_spec['toolchains'] or name_version in tcgen_spec['toolchains']:
tcgen_subdir = tcgen_spec['subdir']
log_msg = f"Determined toolchain generation subdir '{tcgen_subdir}' for {software}"
return tcgen_subdir, log_msg

# invalid toolchains
# all toolchains have 'system' toolchain, so we need to handle the invalid toolchains separately
@@ -157,17 +185,31 @@

# software with 'system' toolchain: return 'system'
if tcname == 'system':
log_msg = f"Determined toolchain {tcname} for {software}"
return tcname, log_msg
tcgen_subdir = 'system'
log_msg = f"Determined toolchain '{tcgen_subdir}' for {software}"
return tcgen_subdir, log_msg

log_msg = f"Invalid toolchain {tcname} and/or toolchain version {tcversion} for {software}"
return False, log_msg


def is_gpu_software(ec):
"determine if it is a GPU-only installation"
gpu_components = ['CUDA']
gpu_toolchains = ['nvidia-compilers', 'NVHPC']

is_gpu_package = ec.name in gpu_components or ec.name in gpu_toolchains
needs_gpu_toolchain = ec.toolchain.name in gpu_toolchains
needs_gpu_component = any([x in ec['versionsuffix'] for x in gpu_components])

return is_gpu_package or needs_gpu_toolchain or needs_gpu_component
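
Compared with the previous inline check ('CUDA' in the software name or versionsuffix), is_gpu_software() also flags packages built with the nvidia-compilers or NVHPC toolchains. A rough sketch of the same three conditions using plain strings instead of an EasyConfig object (the function name and sample inputs are hypothetical):

def sketch_is_gpu_software(name, toolchain_name, versionsuffix):
    """Stand-in for is_gpu_software() that takes plain strings instead of an EasyConfig"""
    gpu_components = ['CUDA']
    gpu_toolchains = ['nvidia-compilers', 'NVHPC']
    return (
        name in gpu_components + gpu_toolchains
        or toolchain_name in gpu_toolchains
        or any(x in versionsuffix for x in gpu_components)
    )

print(sketch_is_gpu_software('CUDA', 'system', ''))               # True: GPU component itself
print(sketch_is_gpu_software('OpenMPI', 'NVHPC', ''))             # True: built with a GPU toolchain
print(sketch_is_gpu_software('GROMACS', 'foss', '-CUDA-12.6.0'))  # True: CUDA versionsuffix
print(sketch_is_gpu_software('GROMACS', 'foss', ''))              # False: CPU-only build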


def update_moduleclass(ec):
"update the moduleclass of an easyconfig to <tc_gen>/all"
tc_gen, log_msg = calc_tc_gen(
ec.name, ec.version, ec.toolchain.name, ec.toolchain.version, ec.easyblock)
tc_gen, log_msg = calc_tc_gen_subdir(
ec.name, ec.version, ec.toolchain.name, ec.toolchain.version, ec.easyblock
)

if not tc_gen:
raise EasyBuildError("[parse hook] " + log_msg)
@@ -276,6 +318,14 @@ def parse_hook(ec, *args, **kwargs): # pylint: disable=unused-argument
ec['dependencies'] = [d for d in ec['dependencies'] if 'libfabric' not in d]
ec.log.info("[parse hook] Removed libfabric from dependency list")

if ec.name == 'NVHPC':
# NVHPC ships with OpenMPI v4, which has an issue between its hwloc
# and Slurm cgroups v2 that results in mpirun trying to use cores that
# are not allocated to the job (see https://github.com/open-mpi/ompi/issues/12470).
# Only mpirun is affected; the workaround is to set '--bind-to=none':
ec.log.info("[parse hook] Disable mpirun process binding in NVHPC")
ec['modextravars'].update({'OMPI_MCA_hwloc_base_binding_policy': 'none'})

if ec.name == 'Gurobi':
# use centrally installed Gurobi license file, and don't copy to installdir
ec['license_file'] = '/apps/brussel/licenses/gurobi/gurobi.lic'
@@ -310,17 +360,27 @@ def parse_hook(ec, *args, **kwargs): # pylint: disable=unused-argument
ec.toolchain.options['optarch'] = optarchs_intel[LOCAL_ARCH]
ec.log.info(f"[parse hook] Set optarch in parameter toolchainopts: {ec.toolchain.options['optarch']}")

# skip installation of CUDA software in non-GPU architectures, only create module file
is_cuda_software = 'CUDA' in ec.name or 'CUDA' in ec['versionsuffix']
if is_cuda_software and LOCAL_ARCH_FULL not in GPU_ARCHS:
###############################
# ------ GPU MODULES -------- #
###############################

# skip installation of GPU software on non-GPU architectures, only create a dummy module file
if is_gpu_software(ec) and LOCAL_ARCH_FULL not in GPU_ARCHS:
ec.log.info("[parse hook] Generating dummy GPU module on non-GPU node")
# inject error message in module file
ec['modluafooter'] = GPU_DUMMY_MOD_FOOTER
# workaround for NVHPC
if ec.name == 'NVHPC':
ec['default_cuda_version'] = '0'
# module_only steps: [MODULE_STEP, PREPARE_STEP, READY_STEP, POSTITER_STEP, SANITYCHECK_STEP]
ec['module_only'] = True
ec.log.info(f"[parse hook] Set parameter module_only: {ec['module_only']}")
ec['skipsteps'] = [SANITYCHECK_STEP]
ec.log.info(f"[parse hook] Set parameter skipsteps: {ec['skipsteps']}")

# set cuda compute capabilities
elif is_cuda_software:
elif is_gpu_software(ec):
# on GPU nodes set cuda compute capabilities
ec['cuda_compute_capabilities'] = ARCHS[LOCAL_ARCH_FULL]['cuda_cc']
ec.log.info(f"[parse hook] Set parameter cuda_compute_capabilities: {ec['cuda_compute_capabilities']}")

@@ -390,7 +450,10 @@ def pre_configure_hook(self, *args, **kwargs): # pylint: disable=unused-argument


def pre_module_hook(self, *args, **kwargs): # pylint: disable=unused-argument
"""Hook at pre-module level to alter module files"""
"""
Hook at pre-module level to alter module files
WARNING: this hook triggers *after* sanity checks
"""

# Must be done this way, updating self.cfg['modextravars']
# directly doesn't work due to templating.
@@ -449,6 +512,11 @@ def pre_module_hook(self, *args, **kwargs): # pylint: disable=unused-argument
self.cfg['modextravars'].update({'SLURM_ENABLED': "1"})
self.cfg['modextravars'].update({'SCHEDULER_TIGHT_COUPLING': "1"})

if self.name == 'NVHPC':
slurm_mpi_type = 'pmix'
self.log.info("[pre-module hook] Set Slurm MPI type to: %s", slurm_mpi_type)
self.cfg['modextravars'].update({'SLURM_MPI_TYPE': slurm_mpi_type})

##########################
# ------ TUNING -------- #
##########################
@@ -545,24 +613,6 @@ def pre_module_hook(self, *args, **kwargs): # pylint: disable=unused-argument
else:
self.cfg['docurls'] = [usage_info['link']]

#################################
# ------ DUMMY MODULES -------- #
#################################

is_cuda_software = 'CUDA' in self.name or 'CUDA' in self.cfg['versionsuffix']
if is_cuda_software and LOCAL_ARCH_FULL not in GPU_ARCHS:
self.log.info("[pre-module hook] Creating dummy module for CUDA modules on non-GPU nodes")
self.cfg['modluafooter'] = """
if mode() == "load" and not os.getenv("BUILD_TOOLS_LOAD_DUMMY_MODULES") then
LmodError([[
This module is only available on nodes with a GPU.
Jobs can request GPUs with the command 'srun --gpus-per-node=1' or 'sbatch --gpus-per-node=1'.

More information in the VUB-HPC docs:
https://hpc.vub.be/docs/job-submission/gpu-job-types/#gpu-jobs
]])
end"""


def post_build_and_install_loop_hook(ecs_with_res):
"""
2 changes: 1 addition & 1 deletion src/build_tools/package.py
@@ -16,7 +16,7 @@
@author: Alex Domingo (Vrije Universiteit Brussel)
"""

VERSION = '4.3.3'
VERSION = '4.4.0'

AUTHOR = {
'wp': 'Ward Poelmans',
6 changes: 3 additions & 3 deletions tests/test_hooks_hydra.py
@@ -44,8 +44,8 @@
('fosscuda', '2023a', 'system', 'system', 'Toolchain', False),
],
)
def test_calc_tc_gen(toolchain, set_up_config):
def test_calc_tc_gen_subdir(toolchain, set_up_config):
name, version, tcname, tcversion, easyblock, expected_generation = toolchain
generation, _ = hooks_hydra.calc_tc_gen(name, version, tcname, tcversion, easyblock)
generation, _ = hooks_hydra.calc_tc_gen_subdir(name, version, tcname, tcversion, easyblock)

assert generation == expected_generation
assert generation['toolchains'] == expected_generation
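
The parametrized cases could also exercise the new 25.1 generation; a possible extra entry (hypothetical, and assuming NVHPC 25.1 easyconfigs are available to the test configuration) would be expected to resolve to the '2024a' subdir:

# possible additional parametrize case for the 25.1 generation (hypothetical)
('NVHPC', '25.1', 'system', 'system', 'Toolchain', '2024a'),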