Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions community/examples/slurm-gke/files/cgroup.conf.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# cgroup.conf
# https://slurm.schedmd.com/cgroup.conf.html
#
# Slurm cgroup settings template. The slurm-gke example blueprint stages this
# file and hands it to the Slurm controller module through `cgroup_conf_tpl`.
# The exact semantics of every option are defined in the man page linked above.

# Let Slurm detect which cgroup plugin to use instead of pinning a version.
CgroupPlugin=autodetect
# NOTE(review): presumably set because some slurmd instances run in containers
# without systemd - confirm against the cgroup.conf man page.
IgnoreSystemd=yes
# Kept commented out; TODO confirm whether it is intentionally disabled.
# EnableControllers=yes
# Constrain jobs on cores, RAM and devices; swap is explicitly left unconstrained.
ConstrainCores=yes
ConstrainRamSpace=yes
ConstrainSwapSpace=no
ConstrainDevices=yes
18 changes: 18 additions & 0 deletions community/examples/slurm-gke/files/slurm-namespace.yaml.tftpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright 2025 "Google LLC"
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Kubernetes Namespace manifest template.
# `namespace` is filled in from the template variables supplied by the caller.
apiVersion: v1
kind: Namespace
metadata:
  # Quoted so that a numeric- or boolean-looking value still renders as a string.
  name: "${namespace}"
192 changes: 192 additions & 0 deletions community/examples/slurm-gke/slurm-gke.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

blueprint_name: slurm-gke

vars:
  # The variables below are expected to be overridden in the deployment.yaml file.

  # ID of the GCP project that hosts the deployment.
  project_id:  ## Set GCP Project ID Here ##

  # Must be unique across all of your Cluster Toolkit deployments.
  deployment_name: slurmgke

  # GCP region used for this deployment.
  region:

  # GCP zone used for this deployment.
  zone:

  # CIDR block containing the IP of the machine that calls terraform.
  #   - allow all (IAM restrictions still enforced): 0.0.0.0/0
  #   - allow only your IP address: <YOUR-IP-ADDRESS>/32
  authorized_cidr:

  # Number of nodes to create for the Slurm GKE NodeSet.
  gke_nodeset_replicas: 2

  # Pre-built Slinky image for the GKE NodeSet.
  # Follow the instructions in ./images/containers to build this image.
  slinky_image: ghcr.io/slinkyproject/slurmd-pyxis:24.11-ubuntu24.04

  # Namespace in which the Slurm GKE NodeSet is created.
  slurm_namespace: slurm

deployment_groups:
- group: primary
  modules:

  ###### Common resources ######

  # VPC with secondary ranges for GKE pods and services.
  - id: network
    source: modules/network/vpc
    settings:
      subnetwork_name: $(vars.deployment_name)-subnet
      secondary_ranges_list:
      - subnetwork_name: $(vars.deployment_name)-subnet
        ranges:
        - range_name: pods
          ip_cidr_range: 10.4.0.0/14
        - range_name: services
          ip_cidr_range: 10.0.32.0/20

  - id: private_service_access
    source: community/modules/network/private-service-access
    use: [network]

  # Filestore instance mounted at /home.
  - id: homefs
    source: modules/file-system/filestore
    use: [network, private_service_access]
    settings:
      local_mount: /home

  ###### GKE Setup ######

  - id: gke_service_account
    source: community/modules/project/service-account
    settings:
      name: slinky-gke-sa
      project_roles:
      - logging.logWriter
      - monitoring.metricWriter
      - monitoring.viewer
      - stackdriver.resourceMetadata.writer
      - storage.objectAdmin
      - artifactregistry.reader

  - id: gke_cluster
    source: modules/scheduler/gke-cluster
    use: [network, gke_service_account]
    settings:
      enable_private_endpoint: false
      enable_gcsfuse_csi: true
      enable_filestore_csi: true
      master_authorized_networks:
      # Allows your machine to run the kubectl command.
      # Required for multi network setup.
      - cidr_block: $(vars.authorized_cidr)
        display_name: "kubectl-access-network"
      system_node_pool_enabled: false
      configure_workload_identity_sa: true
      enable_dcgm_monitoring: true
    outputs: [instructions]

  # General purpose pool (used by the slinky module below).
  - id: gke_base_pool
    source: modules/compute/gke-node-pool
    use: [gke_cluster, gke_service_account]
    settings:
      initial_node_count: 1
      disk_type: pd-balanced
      machine_type: e2-standard-4
      zones: [$(vars.zone)]

  # Pool sized by gke_nodeset_replicas; backs the Slurm GKE NodeSet.
  - id: gke_compute_pool
    source: modules/compute/gke-node-pool
    use: [gke_cluster, gke_service_account]
    settings:
      name: gke-compute-pool
      initial_node_count: $(vars.gke_nodeset_replicas)
      disk_type: pd-balanced
      machine_type: c2-standard-16
      zones: [$(vars.zone)]

  # Creates the Slurm namespace from the staged manifest template.
  - id: gke_ns_manifest
    source: modules/management/kubectl-apply
    use: [gke_cluster]
    settings:
      apply_manifests:
      - source: $(ghpc_stage("./files/slurm-namespace.yaml.tftpl"))
        template_vars:
          namespace: $(vars.slurm_namespace)

  - id: slinky
    source: community/modules/scheduler/slinky
    use:
    - gke_cluster
    # Optionally specify nodepool(s) to avoid operator components running on HPC hardware
    - gke_base_pool
    settings:
      slurm_operator_namespace: $(vars.slurm_namespace)
      install_slurm_operator_chart: true
      install_slurm_chart: false

  - id: gke_compute_nodeset
    source: community/modules/compute/gke-nodeset
    use: [gke_compute_pool, slinky, homefs, slurm_controller, network]
    settings:
      slurm_cluster_name: $(vars.deployment_name)
      image: $(vars.slinky_image)

  - id: gke_compute_partition
    source: community/modules/compute/gke-partition
    use: [slurm_controller, gke_compute_nodeset]

  ###### GCE Setup ######

  - id: debug_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      node_count_dynamic_max: 4
      machine_type: n2-standard-2
      allow_automatic_updates: false

  - id: debug_partition
    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
    use: [debug_nodeset]
    settings:
      partition_name: debug
      # allows nodes to stay up after jobs are done
      exclusive: false
      is_default: true
      # prevents nodes from suspending while they are idle
      suspend_time: -1

  - id: slurm_login
    source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
    use: [network]
    settings:
      machine_type: n2-standard-4
      enable_login_public_ips: true

  - id: slurm_controller
    source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
    use: [network, slurm_login, debug_partition, homefs]
    settings:
      slurm_cluster_name: $(vars.deployment_name)
      enable_slurm_auth: true
      cgroup_conf_tpl: $(ghpc_stage("./files/cgroup.conf.tpl"))
      enable_controller_public_ips: true
55 changes: 55 additions & 0 deletions community/modules/compute/gke-nodeset/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
<!-- BEGINNING OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
## Requirements

| Name | Version |
|------|---------|
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.3 |
| <a name="requirement_google"></a> [google](#requirement\_google) | >= 4.84 |

## Providers

| Name | Version |
|------|---------|
| <a name="provider_google"></a> [google](#provider\_google) | >= 4.84 |

## Modules

| Name | Source | Version |
|------|--------|---------|
| <a name="module_home_pv"></a> [home\_pv](#module\_home\_pv) | ../../../../modules/file-system/gke-persistent-volume | n/a |
| <a name="module_kubectl_apply"></a> [kubectl\_apply](#module\_kubectl\_apply) | ../../../../modules/management/kubectl-apply | n/a |
| <a name="module_slurm_key_pv"></a> [slurm\_key\_pv](#module\_slurm\_key\_pv) | ../../../../modules/file-system/gke-persistent-volume | n/a |

## Resources

| Name | Type |
|------|------|
| [google_storage_bucket_object.gke_nodeset_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource |
| [google_storage_bucket.this](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/storage_bucket) | data source |

## Inputs

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_cluster_id"></a> [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes |
| <a name="input_filestore_id"></a> [filestore\_id](#input\_filestore\_id) | An array of identifiers for filestore instances, each in the format `projects/{{project}}/locations/{{location}}/instances/{{name}}`. | `list(string)` | n/a | yes |
| <a name="input_image"></a> [image](#input\_image) | The image for slurm daemon | `string` | n/a | yes |
| <a name="input_instance_templates"></a> [instance\_templates](#input\_instance\_templates) | The URLs of Instance Templates | `list(string)` | n/a | yes |
| <a name="input_network_storage"></a> [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on nodes. | <pre>list(object({<br/> server_ip = string,<br/> remote_mount = string,<br/> local_mount = string,<br/> fs_type = string,<br/> mount_options = string,<br/> client_install_runner = map(string)<br/> mount_runner = map(string)<br/> }))</pre> | n/a | yes |
| <a name="input_node_count_static"></a> [node\_count\_static](#input\_node\_count\_static) | The number of static nodes in node-pool | `number` | n/a | yes |
| <a name="input_node_pool_names"></a> [node\_pool\_names](#input\_node\_pool\_names) | Names of the GKE node pools backing this nodeset. Only the first entry is used. | `list(string)` | n/a | yes |
| <a name="input_nodeset_name"></a> [nodeset\_name](#input\_nodeset\_name) | The nodeset name | `string` | `"gkenodeset"` | no |
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes |
| <a name="input_slurm_bucket"></a> [slurm\_bucket](#input\_slurm\_bucket) | GCS Bucket of Slurm cluster file storage. | `any` | n/a | yes |
| <a name="input_slurm_bucket_dir"></a> [slurm\_bucket\_dir](#input\_slurm\_bucket\_dir) | Path directory within `bucket_name` for Slurm cluster file storage. | `string` | n/a | yes |
| <a name="input_slurm_cluster_name"></a> [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used in slurm controller | `string` | n/a | yes |
| <a name="input_slurm_controller_instance"></a> [slurm\_controller\_instance](#input\_slurm\_controller\_instance) | Slurm cluster controller instance | `any` | n/a | yes |
| <a name="input_slurm_namespace"></a> [slurm\_namespace](#input\_slurm\_namespace) | slurm namespace for charts | `string` | `"slurm"` | no |
| <a name="input_subnetwork"></a> [subnetwork](#input\_subnetwork) | Primary subnetwork object | `any` | n/a | yes |

## Outputs

| Name | Description |
|------|-------------|
| <a name="output_nodeset_name"></a> [nodeset\_name](#output\_nodeset\_name) | Name of the new Slinky nodeset |
<!-- END OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
64 changes: 64 additions & 0 deletions community/modules/compute/gke-nodeset/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


### GKE NodeSet
locals {
  # NodeSet manifest template that ships with this module.
  manifest_path = "${path.module}/templates/nodeset-general.yaml.tftpl"
}

# Renders the NodeSet manifest template with the values below and hands it to
# the kubectl-apply module for the target cluster.
module "kubectl_apply" {
  source = "../../../../modules/management/kubectl-apply"

  project_id = var.project_id
  cluster_id = var.cluster_id

  apply_manifests = [
    {
      source = local.manifest_path
      template_vars = {
        # Naming: everything is prefixed with the Slurm cluster name.
        nodeset_name    = "${var.slurm_cluster_name}-${var.nodeset_name}"
        nodeset_cr_name = "${var.slurm_cluster_name}-${var.nodeset_name}"
        controller_name = "${var.slurm_cluster_name}-controller"
        slurm_namespace = var.slurm_namespace

        # Placement and sizing: only the first node pool is used.
        node_pool_name = var.node_pool_names[0]
        node_count     = var.node_count_static
        image          = var.image

        # Persistent volume claims created by the sibling PV modules.
        home_pvc      = module.home_pv.pvc_name
        slurm_key_pvc = module.slurm_key_pv.pvc_name
      }
    }
  ]
}

# Looks up the Slurm bucket by the name of the first entry in var.slurm_bucket.
data "google_storage_bucket" "this" {
  # Explicit dependency on the input so the lookup waits for the bucket value.
  depends_on = [var.slurm_bucket]

  name = var.slurm_bucket[0].name
}

### Slurm NodeSet
locals {
  # Nodeset description that is YAML-encoded and uploaded to the Slurm bucket
  # by google_storage_bucket_object.gke_nodeset_config.
  nodeset = {
    nodeset_name      = var.nodeset_name
    node_count_static = var.node_count_static

    # Only the first node pool / instance template is used.
    gke_nodepool      = var.node_pool_names[0]
    instance_template = var.instance_templates[0]

    # Full self-link of the primary subnetwork.
    subnetwork = "https://www.googleapis.com/compute/v1/projects/${var.project_id}/regions/${var.subnetwork.region}/subnetworks/${var.subnetwork.name}"
  }
}

# Publishes local.nodeset as a YAML object under nodeset_configs/ in the Slurm
# bucket directory; the object is named after the nodeset.
resource "google_storage_bucket_object" "gke_nodeset_config" {
  name    = "${var.slurm_bucket_dir}/nodeset_configs/${var.nodeset_name}.yaml"
  bucket  = data.google_storage_bucket.this.name
  content = yamlencode(local.nodeset)
}
20 changes: 20 additions & 0 deletions community/modules/compute/gke-nodeset/metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Google Cloud services that this module declares as requirements.
spec:
  requirements:
    services:
    - container.googleapis.com
    - storage.googleapis.com
18 changes: 18 additions & 0 deletions community/modules/compute/gke-nodeset/output.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Exposes the nodeset name recorded in local.nodeset (i.e. var.nodeset_name).
output "nodeset_name" {
  description = "Name of the new Slinky nodeset"
  value       = local.nodeset.nodeset_name
}
Loading
Loading