diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf
index 7622bdffef..3ea79ca8ba 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf
@@ -28,6 +28,28 @@ locals {
   universe_domain = { "universe_domain" = var.universe_domain }
 }
 
+locals {
+  # Sum of node_count_dynamic_max across all entries of var.nodeset. The leading 0
+  # keeps sum() valid if var.nodeset is empty; sum() rejects an empty list.
+  total_nodes = sum(concat([0], [for ns in var.nodeset : ns.node_count_dynamic_max]))
+
+  # Large size cluster with dynamic nodes can cause frequent controller reconfigures if topology plugin is enabled.
+  is_large_cluster = local.total_nodes > 50
+
+  # For large clusters, fill in values the user left unset (null): topology_plugin
+  # becomes "" and suspend_timeout becomes 500. Explicitly set values are kept as-is.
+  cloud_parameters_override = {
+    topology_plugin = (var.cloud_parameters.topology_plugin == null && local.is_large_cluster) ? "" : var.cloud_parameters.topology_plugin
+    suspend_timeout = (var.cloud_parameters.suspend_timeout == null && local.is_large_cluster) ? 500 : var.cloud_parameters.suspend_timeout
+  }
+
+  # Effective cloud parameters; the override map takes precedence for the two keys it defines.
+  cloud_parameters = merge(
+    var.cloud_parameters,
+    local.cloud_parameters_override
+  )
+}
+
 # See
 # * slurm_files.tf
 # * controller.tf
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf
index 32a7be1ad4..b12ee0cd77 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf
@@ -141,7 +141,7 @@ module "slurm_files" {
   slurm_conf_tpl      = var.slurm_conf_tpl
   slurm_conf_template = var.slurm_conf_template
   cgroup_conf_tpl     = var.cgroup_conf_tpl
-  cloud_parameters    = var.cloud_parameters
+  cloud_parameters    = local.cloud_parameters
   cloudsql_secret = try(
     one(google_secret_manager_secret_version.cloudsql_version[*].id),
   null)