From 641e784f747da3a66c6ffa3330e8b721a04f4b58 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 25 Dec 2023 11:11:40 +0000 Subject: [PATCH 001/151] Bump github.com/hashicorp/terraform-exec from 0.19.0 to 0.20.0 Bumps [github.com/hashicorp/terraform-exec](https://github.com/hashicorp/terraform-exec) from 0.19.0 to 0.20.0. - [Release notes](https://github.com/hashicorp/terraform-exec/releases) - [Changelog](https://github.com/hashicorp/terraform-exec/blob/main/CHANGELOG.md) - [Commits](https://github.com/hashicorp/terraform-exec/compare/v0.19.0...v0.20.0) --- updated-dependencies: - dependency-name: github.com/hashicorp/terraform-exec dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 4 ++-- go.sum | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/go.mod b/go.mod index b7d4b9a1bc..f47ea1ccce 100644 --- a/go.mod +++ b/go.mod @@ -25,7 +25,7 @@ require ( github.com/fatih/color v1.16.0 github.com/go-git/go-billy/v5 v5.5.0 github.com/google/go-cmp v0.6.0 - github.com/hashicorp/terraform-exec v0.19.0 + github.com/hashicorp/terraform-exec v0.20.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b google.golang.org/api v0.154.0 @@ -40,7 +40,7 @@ require ( github.com/go-logr/logr v1.3.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/googleapis/gax-go/v2 v2.12.0 // indirect - github.com/hashicorp/terraform-json v0.17.1 // indirect + github.com/hashicorp/terraform-json v0.19.0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/rogpeppe/go-internal v1.11.0 // indirect diff --git a/go.sum b/go.sum index 863f14e451..2608c8e1c7 100644 --- a/go.sum +++ b/go.sum @@ -383,17 +383,17 @@ github.com/hashicorp/go-version v1.6.0 h1:feTTfFNnjP967rlCxM/I9g701jU+RN74YKx2mO github.com/hashicorp/go-version v1.6.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/hashicorp/hc-install v0.6.0 h1:fDHnU7JNFNSQebVKYhHZ0va1bC6SrPQ8fpebsvNr2w4= +github.com/hashicorp/hc-install v0.6.2 h1:V1k+Vraqz4olgZ9UzKiAcbman9i9scg9GgSt/U3mw/M= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hashicorp/hcl/v2 v2.19.1 h1://i05Jqznmb2EXqa39Nsvyan2o5XyMowW5fnCKW5RPI= github.com/hashicorp/hcl/v2 v2.19.1/go.mod h1:ThLC89FV4p9MPW804KVbe/cEXoQ8NZEh+JtMeeGErHE= github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d h1:g6kHlvZrFPFKeWRj5q/zyJA5gu7rlJGPf17h8hX7LHY= github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d/go.mod h1:l8HcFPm9cQh6Q0KSWoYPiePqMvRFenybP1CH2MjKdlg= -github.com/hashicorp/terraform-exec v0.19.0 h1:FpqZ6n50Tk95mItTSS9BjeOVUb4eg81SpgVtZNNtFSM= -github.com/hashicorp/terraform-exec v0.19.0/go.mod h1:tbxUpe3JKruE9Cuf65mycSIT8KiNPZ0FkuTE3H4urQg= -github.com/hashicorp/terraform-json v0.17.1 h1:eMfvh/uWggKmY7Pmb3T85u86E2EQg6EQHgyRwf3RkyA= -github.com/hashicorp/terraform-json v0.17.1/go.mod h1:Huy6zt6euxaY9knPAFKjUITn8QxUFIe9VuSzb4zn/0o= +github.com/hashicorp/terraform-exec v0.20.0 h1:DIZnPsqzPGuUnq6cH8jWcPunBfY+C+M8JyYF3vpnuEo= 
+github.com/hashicorp/terraform-exec v0.20.0/go.mod h1:ckKGkJWbsNqFKV1itgMnE0hY9IYf1HoiekpuN0eWoDw= +github.com/hashicorp/terraform-json v0.19.0 h1:e9DBKC5sxDfiJT7Zoi+yRIwqLVtFur/fwK/FuE6AWsA= +github.com/hashicorp/terraform-json v0.19.0/go.mod h1:qdeBs11ovMzo5puhrRibdD6d2Dq6TyE/28JiU4tIQxk= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= From 3d1072da48450aa22b844bb5c288415b270616cc Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 2 Jan 2024 14:51:06 -0800 Subject: [PATCH 002/151] Make `subnetwork_self_link` required, don't pass `subnetwork_project` around (#2067) --- .../README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset-tpu/main.tf | 7 ++++--- .../variables.tf | 1 - .../schedmd-slurm-gcp-v6-nodeset/README.md | 3 +-- .../schedmd-slurm-gcp-v6-nodeset/main.tf | 3 +-- .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 7 ------- .../schedmd-slurm-gcp-v6-partition/README.md | 4 ++-- .../variables.tf | 5 ++--- .../schedmd-slurm-gcp-v6-controller/README.md | 9 ++++----- .../controller.tf | 4 +--- .../schedmd-slurm-gcp-v6-controller/login.tf | 1 - .../partition.tf | 6 +++--- .../variables.tf | 19 ++++++++----------- .../variables_controller_instance.tf | 9 +-------- .../schedmd-slurm-gcp-v6-login/README.md | 3 +-- .../schedmd-slurm-gcp-v6-login/main.tf | 6 ++---- .../schedmd-slurm-gcp-v6-login/variables.tf | 9 +-------- 17 files changed, 32 insertions(+), 66 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md index 7cd9719872..f5219c6831 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -68,7 +68,7 @@ No resources. | [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | | [preserve\_tpu](#input\_preserve\_tpu) | Specify whether TPU-vms will get preserve on suspend, if set to true, on suspend vm is stopped, on false it gets deleted | `bool` | `true` | no | | [service\_account](#input\_service\_account) | Service account to attach to the TPU-vm. If none is given, the default service account and scopes will be used. |
object({
email = string
scopes = set(string)
})
| `null` | no | -| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The name of the subnetwork to attach the TPU-vm of this nodeset to. | `string` | `null` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The name of the subnetwork to attach the TPU-vm of this nodeset to. | `string` | n/a | yes | | [tf\_version](#input\_tf\_version) | Nodeset Tensorflow version, see https://cloud.google.com/tpu/docs/supported-tpu-configurations#tpu_vm for details. | `string` | `"2.9.1"` | no | | [zone](#input\_zone) | Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones. | `string` | n/a | yes | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf index 18cccb4f4d..900d1d35b0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf @@ -34,8 +34,9 @@ locals { docker_image = var.docker_image enable_public_ip = !var.disable_public_ips - subnetwork = var.subnetwork_self_link - service_account = var.service_account - zone = var.zone + # TODO: rename to subnetwork_self_link, requires changes to the scripts + subnetwork = var.subnetwork_self_link + service_account = var.service_account + zone = var.zone } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf index ac13f2dc5d..0295f596a2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf @@ -104,7 +104,6 @@ variable "docker_image" { variable "subnetwork_self_link" { type = string description = "The name of the subnetwork to attach the TPU-vm of this nodeset to." - default = null } variable "service_account" { diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 829e9796da..bcc0ad87c8 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -185,8 +185,7 @@ No modules. | [service\_account](#input\_service\_account) | Service account to attach to the compute instances. If not set, the
default compute service account for the given project will be used with the
"https://www.googleapis.com/auth/cloud-platform" scope. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
- enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
- enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
- enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | | [spot\_instance\_config](#input\_spot\_instance\_config) | Configuration for spot VMs. |
object({
termination_action = string
})
| `null` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project the subnetwork belongs to. | `string` | `""` | no | -| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | `null` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | n/a | yes | | [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | | [zone](#input\_zone) | Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones. | `string` | n/a | yes | | [zone\_target\_shape](#input\_zone\_target\_shape) | Strategy for distributing VMs across zones in a region.
ANY
GCE picks zones for creating VM instances to fulfill the requested number of VMs
within present resource constraints and to maximize utilization of unused zonal
reservations.
ANY\_SINGLE\_ZONE (default)
GCE always selects a single zone for all the VMs, optimizing for resource quotas,
available reservations and general capacity.
BALANCED
GCE prioritizes acquisition of resources, scheduling VMs in zones where resources
are available while distributing VMs as evenly as possible across allowed zones
to minimize the impact of zonal failure. | `string` | `"ANY_SINGLE_ZONE"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 11acd4b963..e87e20da0a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -67,8 +67,7 @@ locals { source_image_family = local.source_image_family # requires source_image_logic.tf source_image_project = local.source_image_project_normalized # requires source_image_logic.tf source_image = local.source_image # requires source_image_logic.tf - subnetwork_project = var.subnetwork_project - subnetwork = var.subnetwork_self_link + subnetwork_self_link = var.subnetwork_self_link tags = var.tags spot = var.enable_spot_vm termination_action = try(var.spot_instance_config.termination_action, null) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index d00219cf73..d8c70b4802 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -378,11 +378,4 @@ EOD variable "subnetwork_self_link" { type = string description = "Subnet to deploy to." - default = null -} - -variable "subnetwork_project" { - description = "The project the subnetwork belongs to." - type = string - default = "" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md index c482f55f35..7b68a670fd 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md @@ -85,8 +85,8 @@ No resources. | [exclusive](#input\_exclusive) | Exclusive job access to nodes. | `bool` | `true` | no | | [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.
If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on the partition compute nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_project = optional(string)
subnetwork = optional(string)
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
}))
| `[]` | no | -| [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, true)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
subnetwork = optional(string, "")
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
}))
| `[]` | no | +| [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, true)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
}))
| `[]` | no | | [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no | | [partition\_name](#input\_partition\_name) | The name of the slurm partition. | `string` | n/a | yes | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf index 56fda6e4d6..1cc821a878 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf @@ -113,8 +113,7 @@ variable "nodeset" { source_image_family = optional(string) source_image_project = optional(string) source_image = optional(string) - subnetwork_project = optional(string) - subnetwork = optional(string) + subnetwork_self_link = string spot = optional(bool, false) tags = optional(list(string), []) termination_action = optional(string) @@ -150,7 +149,7 @@ variable "nodeset_tpu" { zone = string data_disks = optional(list(string), []) docker_image = optional(string, "") - subnetwork = optional(string, "") + subnetwork = string service_account = optional(object({ email = optional(string) scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"]) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 2f161702bd..0249667267 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -190,15 +190,15 @@ limitations under the License. | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | | [login\_network\_storage](#input\_login\_network\_storage) | An array of network attached storage mounts to be configured on all login nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string) # TODO: is it used? should remove it?
mount_runner = map(string)
}))
| `[]` | no | -| [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. |
list(object({
name_prefix = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string, "n1-standard-1")
enable_confidential_vm = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
num_instances = optional(number, 1)
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
static_ips = optional(list(string), [])
subnetwork_project = optional(string)
subnetwork = optional(string)
spot = optional(bool, false)
tags = optional(list(string), [])
zone = optional(string)
termination_action = optional(string)
}))
| `[]` | no | +| [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. |
list(object({
name_prefix = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string, "n1-standard-1")
enable_confidential_vm = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
num_instances = optional(number, 1)
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
static_ips = optional(list(string), [])
subnetwork = string
spot = optional(bool, false)
tags = optional(list(string), [])
zone = optional(string)
termination_action = optional(string)
}))
| `[]` | no | | [login\_startup\_script](#input\_login\_startup\_script) | Startup script used by the login VMs. | `string` | `"# no-op"` | no | | [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"c2-standard-4"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string) # TODO: is it used? should remove it?
mount_runner = map(string)
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_project = optional(string)
# TODO: rename to subnetwork_self_link
subnetwork = optional(string)
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
}))
| `[]` | no | -| [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, true)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
subnetwork = optional(string, "")
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
}))
| `[]` | no | +| [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, true)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | | [partitions](#input\_partitions) | Cluster partitions as a list. See module slurm\_partition. |
list(object({
default = optional(bool, false)
enable_job_exclusive = optional(bool, false)
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
})), [])
partition_conf = optional(map(string), {})
partition_name = string
partition_nodeset = optional(list(string), [])
partition_nodeset_dyn = optional(list(string), [])
partition_nodeset_tpu = optional(list(string), [])
resume_timeout = optional(number)
suspend_time = optional(number, 300)
suspend_timeout = optional(number)
}))
| n/a | yes | | [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | @@ -211,8 +211,7 @@ limitations under the License. | [slurm\_conf\_tpl](#input\_slurm\_conf\_tpl) | Slurm slurm.conf template file path. | `string` | `null` | no | | [slurmdbd\_conf\_tpl](#input\_slurmdbd\_conf\_tpl) | Slurm slurmdbd.conf template file path. | `string` | `null` | no | | [static\_ips](#input\_static\_ips) | List of static IPs for VM instances. | `list(string)` | `[]` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project that subnetwork belongs to. | `string` | `null` | no | -| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. Either network\_self\_link or subnetwork\_self\_link must be specified. | `string` | `null` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | n/a | yes | | [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | | [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 030b433f18..6802694342 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -76,8 +76,7 @@ module "slurm_controller_template" { source_image = local.source_image # requires source_image_logic.tf # spot = TODO: add support for spot (?) - subnetwork_project = var.subnetwork_project - subnetwork = var.subnetwork_self_link + subnetwork = var.subnetwork_self_link tags = concat([local.slurm_cluster_name], var.tags) # termination_action = TODO: add support for termination_action (?) @@ -105,7 +104,6 @@ module "slurm_controller_instance" { slurm_cluster_name = local.slurm_cluster_name slurm_instance_role = "controller" static_ips = var.static_ips - subnetwork_project = var.subnetwork_project subnetwork = var.subnetwork_self_link zone = var.zone diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index ee8d354670..839a6c238c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -52,7 +52,6 @@ module "slurm_login_template" { source_image_project = each.value.source_image_project source_image = each.value.source_image spot = each.value.spot - subnetwork_project = each.value.subnetwork_project subnetwork = each.value.subnetwork tags = concat([local.slurm_cluster_name], each.value.tags) termination_action = each.value.termination_action diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 68ddbabcc4..d8a581335b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -53,7 +53,7 @@ module "slurm_nodeset_template" { source_image_family = each.value.source_image_family source_image_project = each.value.source_image_project source_image = each.value.source_image - subnetwork = each.value.subnetwork + subnetwork = each.value.subnetwork_self_link tags = concat([local.slurm_cluster_name], each.value.tags) } @@ -70,7 +70,7 @@ module "slurm_nodeset" { node_count_static = each.value.node_count_static nodeset_name = each.value.nodeset_name node_conf = each.value.node_conf - subnetwork_self_link = each.value.subnetwork + subnetwork_self_link = each.value.subnetwork_self_link zones = each.value.zones zone_target_shape = each.value.zone_target_shape } @@ -94,7 +94,7 @@ module "slurm_nodeset_tpu" { service_account = each.value.service_account data_disks = each.value.data_disks docker_image = each.value.docker_image - subnetwork = each.value.subnetwork + subnetwork = each.value.subnetwork_self_link } # PARTITION diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 9fe20fac1e..38c9f03546 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -142,8 +142,7 @@ variable "login_nodes" { source_image_project = 
optional(string) source_image = optional(string) static_ips = optional(list(string), []) - subnetwork_project = optional(string) - subnetwork = optional(string) + subnetwork = string spot = optional(bool, false) tags = optional(list(string), []) zone = optional(string) @@ -213,14 +212,12 @@ variable "nodeset" { source_image_family = optional(string) source_image_project = optional(string) source_image = optional(string) - subnetwork_project = optional(string) - # TODO: rename to subnetwork_self_link - subnetwork = optional(string) - spot = optional(bool, false) - tags = optional(list(string), []) - termination_action = optional(string) - zones = optional(list(string), []) - zone_target_shape = optional(string, "ANY_SINGLE_ZONE") + subnetwork_self_link = string + spot = optional(bool, false) + tags = optional(list(string), []) + termination_action = optional(string) + zones = optional(list(string), []) + zone_target_shape = optional(string, "ANY_SINGLE_ZONE") })) default = [] @@ -252,7 +249,7 @@ variable "nodeset_tpu" { zone = string data_disks = optional(list(string), []) docker_image = optional(string, "") - subnetwork = optional(string, "") + subnetwork = string service_account = optional(object({ email = optional(string) scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"]) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf index 2d643f79de..9a3f937557 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf @@ -284,12 +284,5 @@ variable "tags" { variable "subnetwork_self_link" { type = string - description = "Subnet to deploy to. Either network_self_link or subnetwork_self_link must be specified." - default = null -} - -variable "subnetwork_project" { - type = string - description = "The project that subnetwork belongs to." - default = null + description = "Subnet to deploy to." } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index 3849dcb8f6..ceca9d9365 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -113,8 +113,7 @@ No modules. | [service\_account](#input\_service\_account) | Service account to attach to the controller instance. If not set, the
default compute service account for the given project will be used with the
"https://www.googleapis.com/auth/cloud-platform" scope. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | | [static\_ips](#input\_static\_ips) | List of static IPs for VM instances. | `list(string)` | `[]` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project that subnetwork belongs to. | `string` | `null` | no | -| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. Either network\_self\_link or subnetwork\_self\_link must be specified. | `string` | `null` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | n/a | yes | | [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | | [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no |

diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf
index 1964828cad..e894aed64f 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf
@@ -77,9 +77,7 @@ locals {
     static_ips     = var.static_ips
     bandwidth_tier = var.bandwidth_tier

-    subnetwork_project = var.subnetwork_project
-    subnetwork         = var.subnetwork_self_link
-
-    tags = var.tags
+    subnetwork = var.subnetwork_self_link
+    tags       = var.tags
   }
 }

diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf
index 76957f5680..f628791750 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf
@@ -315,12 +315,5 @@ variable "tags" {

 variable "subnetwork_self_link" {
   type        = string
-  description = "Subnet to deploy to. Either network_self_link or subnetwork_self_link must be specified."
-  default     = null
-}
-
-variable "subnetwork_project" {
-  type        = string
-  description = "The project that subnetwork belongs to."
-  default     = null
+  description = "Subnet to deploy to."
 }

From f9086a7ff9374d0f9e5db8c88e0703fc10bf620e Mon Sep 17 00:00:00 2001
From: Ivan Orlov
Date: Tue, 2 Jan 2024 16:51:06 -0800
Subject: [PATCH 003/151] Slurm6. Automagically set `nodeset.name` from module
 id. (#2068)

Slurm6. Automagically set `nodeset.name` from module id.
---
 community/examples/hpc-slurm6.yaml                       | 1 -
 .../compute/schedmd-slurm-gcp-v6-nodeset/README.md       | 2 +-
 .../modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf | 4 +++-
 .../compute/schedmd-slurm-gcp-v6-nodeset/metadata.yaml   | 2 ++
 .../compute/schedmd-slurm-gcp-v6-nodeset/variables.tf    | 8 +-------
 5 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/community/examples/hpc-slurm6.yaml b/community/examples/hpc-slurm6.yaml
index 923691eda4..cf6a15b072 100644
--- a/community/examples/hpc-slurm6.yaml
+++ b/community/examples/hpc-slurm6.yaml
@@ -41,7 +41,6 @@ deployment_groups:
     source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
     use: [network]
     settings:
-      name: ns1
       node_count_dynamic_max: 4
       machine_type: n2-standard-2
       enable_placement: false # the default is: true

diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md
index bcc0ad87c8..c754ba63ab 100644
--- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md
@@ -175,7 +175,7 @@ No modules.
 | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no |
 | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no |
 | [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no |
-| [name](#input\_name) | Name of the nodeset. | `string` | `"ghpc"` | no |
+| [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set | `string` | n/a | yes |
 | [node\_conf](#input\_node\_conf) | Map of Slurm node line configuration. 
| `map(any)` | `{}` | no | | [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of dynamic nodes allowed in this partition. | `number` | `1` | no | | [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index e87e20da0a..01eb6cb240 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -18,6 +18,8 @@ locals { } locals { + name = substr(replace(var.name, "/[^a-z0-9]/", ""), 0, 6) + additional_disks = [ for ad in var.additional_disks : { disk_name = ad.disk_name @@ -34,7 +36,7 @@ locals { node_count_static = var.node_count_static node_count_dynamic_max = var.node_count_dynamic_max node_conf = var.node_conf - nodeset_name = var.name + nodeset_name = local.name disk_auto_delete = var.disk_auto_delete disk_labels = merge(local.labels, var.disk_labels) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/metadata.yaml b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/metadata.yaml index 641832182d..929eeecaf0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/metadata.yaml +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/metadata.yaml @@ -16,3 +16,5 @@ spec: requirements: services: [] +ghpc: + inject_module_id: name diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index d8c70b4802..7d7d964840 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -13,14 +13,8 @@ # limitations under the License. variable "name" { - description = "Name of the nodeset." + description = "Name of the nodeset. Automatically populated by the module id if not set" type = string - default = "ghpc" - - validation { - condition = can(regex("^[a-z](?:[a-z0-9]{0,5})$", var.name)) - error_message = "Nodeset name (var.name) must begin with a letter, be fully alphanumeric and be 6 characters or less. Regexp: '^[a-z](?:[a-z0-9]{0,5})$'." - } } variable "node_conf" { From bf1cebb1d7c832f4d87cfb2515476b01bfa500c2 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 2 Jan 2024 22:10:18 -0800 Subject: [PATCH 004/151] VPC. Replace `s/_/-/` in `deployment_name` to avoid deploy-time error (#2083) --- modules/network/vpc/main.tf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/network/vpc/main.tf b/modules/network/vpc/main.tf index ce40c0f399..a5322c8f02 100644 --- a/modules/network/vpc/main.tf +++ b/modules/network/vpc/main.tf @@ -15,8 +15,9 @@ */ locals { - network_name = var.network_name == null ? "${var.deployment_name}-net" : var.network_name - subnetwork_name = var.subnetwork_name == null ? "${var.deployment_name}-primary-subnet" : var.subnetwork_name + autoname = replace(var.deployment_name, "_", "-") + network_name = var.network_name == null ? "${local.autoname}-net" : var.network_name + subnetwork_name = var.subnetwork_name == null ? 
"${local.autoname}-primary-subnet" : var.subnetwork_name # define a default subnetwork for cases in which no explicit subnetworks are # defined in var.subnetworks From 04027758cde0e6115cdc7ca5759e1e79ea00871b Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 3 Jan 2024 10:39:37 -0800 Subject: [PATCH 005/151] Add `hpc-slurm6.yaml` to `examples/README` (#2084) --- .pre-commit-config.yaml | 3 ++- examples/README.md | 30 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f7e9acd152..3501b4095f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -101,7 +101,8 @@ repos: # MD034 - Bare URL used # MD041 - First line in file should be a top level header # MD046 - Code block style - args: [--disable-rules, "MD013,MD022,MD033,MD034,MD041,MD046", scan] + # MD024 - Multiple headings cannot contain the same content. + args: [--disable-rules, "MD013,MD022,MD033,MD034,MD041,MD046,MD024", scan] - repo: https://github.com/jumanjihouse/pre-commit-hooks rev: "3.0.0" hooks: diff --git a/examples/README.md b/examples/README.md index 6acd823bde..e7125276f4 100644 --- a/examples/README.md +++ b/examples/README.md @@ -13,6 +13,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [Blueprint Descriptions](#blueprint-descriptions) * [hpc-slurm.yaml](#hpc-slurmyaml-) ![core-badge] * [hpc-enterprise-slurm.yaml](#hpc-enterprise-slurmyaml-) ![core-badge] + * [hpc-slurm6.yaml](#hpc-slurm6yaml-) ![community-badge] ![experimental-badge] * [ml-slurm.yaml](#ml-slurmyaml-) ![core-badge] * [image-builder.yaml](#image-builderyaml-) ![core-badge] * [serverless-batch.yaml](#serverless-batchyaml-) ![core-badge] @@ -264,6 +265,35 @@ to 256 [hpc-enterprise-slurm.yaml]: ./hpc-enterprise-slurm.yaml +### [hpc-slurm6.yaml] ![community-badge] ![experimental-badge] + +> **Warning**: Requires additional dependencies **to be installed on the system deploying the infrastructure**. +> +> ```shell +> # Install Python3 and run +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/6.2.1/scripts/requirements.txt +> ``` + +Creates a basic auto-scaling Slurm cluster with mostly default settings. The +blueprint also creates a new VPC network, and a filestore instance mounted to +`/home`. + +There are 2 partitions in this example: `debug`, and `compute`. The `debug` +partition uses `n2-standard-2` VMs, which should work out of the box without +needing to request additional quota. The purpose of the `debug` partition is to +make sure that first time users are not immediately blocked by quota +limitations. + +[hpc-slurm6.yaml]: ../community/examples/hpc-slurm6.yaml + +#### Compute Partition + +There is a `compute` partition that achieves higher performance. Any +performance analysis should be done on the `compute` partition. By default it +uses `c2-standard-60` VMs with placement groups enabled. You may need to request +additional quota for `C2 CPUs` in the region you are deploying in. You can +select the compute partition using the `-p compute` argument when running `srun`. + ### [ml-slurm.yaml] ![core-badge] This blueprint provisions an HPC cluster running the Slurm scheduler with the From df1e26619f5ba9edaa5e91b42527d7fbfe4f40d9 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 3 Jan 2024 11:43:21 -0800 Subject: [PATCH 006/151] Slurm6. 
Quick fix for broken TPU nodeset usage (#2086)

---
 .../schedmd-slurm-gcp-v6-controller/partition.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf
index d8a581335b..7d53d50327 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf
@@ -94,7 +94,7 @@ module "slurm_nodeset_tpu" {
   service_account = each.value.service_account
   data_disks      = each.value.data_disks
   docker_image    = each.value.docker_image
-  subnetwork      = each.value.subnetwork_self_link
+  subnetwork      = each.value.subnetwork
 }

 # PARTITION

From a8759da9505bdd4e75bb5e601d87952765ebd2e3 Mon Sep 17 00:00:00 2001
From: Ivan Orlov
Date: Wed, 3 Jan 2024 14:19:24 -0800
Subject: [PATCH 007/151] Slurm6. Reference TPU example in `examples/README`
 (#2087)

---
 examples/README.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/examples/README.md b/examples/README.md
index e7125276f4..a677b8ffb5 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -14,6 +14,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /"
   * [hpc-slurm.yaml](#hpc-slurmyaml-) ![core-badge]
   * [hpc-enterprise-slurm.yaml](#hpc-enterprise-slurmyaml-) ![core-badge]
   * [hpc-slurm6.yaml](#hpc-slurm6yaml-) ![community-badge] ![experimental-badge]
+  * [hpc-slurm6-tpu.yaml](#hpc-slurm6-tpuyaml-) ![community-badge] ![experimental-badge]
   * [ml-slurm.yaml](#ml-slurmyaml-) ![core-badge]
   * [image-builder.yaml](#image-builderyaml-) ![core-badge]
   * [serverless-batch.yaml](#serverless-batchyaml-) ![core-badge]
@@ -294,6 +295,19 @@ uses `c2-standard-60` VMs with placement groups enabled. You may need to request
 additional quota for `C2 CPUs` in the region you are deploying in. You can
 select the compute partition using the `-p compute` argument when running `srun`.

+### [hpc-slurm6-tpu.yaml] ![community-badge] ![experimental-badge]
+
+> **Warning**: Requires additional dependencies **to be installed on the system deploying the infrastructure**.
+>
+> ```shell
+> # Install Python3 and run
+> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/6.2.1/scripts/requirements.txt
+> ```
+
+Creates an auto-scaling Slurm cluster with TPU nodes.
+
+[hpc-slurm6-tpu.yaml]: ../community/examples/hpc-slurm6-tpu.yaml
+
 ### [ml-slurm.yaml] ![core-badge]

 This blueprint provisions an HPC cluster running the Slurm scheduler with the

From 2ad353bb6c5d4073a1e3a50a39d5432ef7c36ab2 Mon Sep 17 00:00:00 2001
From: Ivan Orlov
Date: Thu, 4 Jan 2024 12:14:19 -0800
Subject: [PATCH 008/151] Use `cty.Type` instead of `string` to represent the
 type of variables. (#2088)

NOTE: after this change, `list(any)` will be used instead of `list`.
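For illustration only: the following sketch is not part of the original commit message. It shows the rendering change described in the NOTE above, using the `github.com/zclconf/go-cty/cty` and HCL `typeexpr` packages that this patch already imports in `pkg/inspect/modules_test.go`; the standalone `main` wrapper is an assumption added for the demo.

```go
package main

import (
	"fmt"

	"github.com/hashicorp/hcl/v2/ext/typeexpr"
	"github.com/zclconf/go-cty/cty"
)

func main() {
	// A bare `list` type constraint is now stored as a cty.Type whose element
	// type is `any`, so it renders back as "list(any)" rather than "list".
	fmt.Println(typeexpr.TypeString(cty.List(cty.DynamicPseudoType))) // list(any)

	// Fully specified constraints are unaffected by the change.
	fmt.Println(typeexpr.TypeString(cty.List(cty.Number))) // list(number)
}
```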
--- pkg/config/config_test.go | 19 +---- pkg/config/expand.go | 7 +- pkg/config/expand_test.go | 19 ++--- pkg/inspect/modules_test.go | 8 +- pkg/modulereader/hcl_utils.go | 30 ++++++-- pkg/modulereader/resreader.go | 3 +- pkg/modulereader/resreader_test.go | 9 ++- pkg/modulewriter/modulewriter_test.go | 102 +++++++------------------- pkg/modulewriter/tfwriter.go | 24 +++--- 9 files changed, 87 insertions(+), 134 deletions(-) diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 80602b5269..c1aa89d8a7 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -179,14 +179,8 @@ func (s *MySuite) getMultiGroupDeploymentConfig() DeploymentConfig { testModuleInfo0 := modulereader.ModuleInfo{ Inputs: []modulereader.VarInfo{ - { - Name: "deployment_name", - Type: "string", - }, - { - Name: altProjectIDSetting, - Type: "string", - }, + {Name: "deployment_name", Type: cty.String}, + {Name: altProjectIDSetting, Type: cty.String}, }, Outputs: []modulereader.OutputInfo{ { @@ -220,13 +214,8 @@ func (s *MySuite) getMultiGroupDeploymentConfig() DeploymentConfig { testModuleInfo2 := modulereader.ModuleInfo{ Inputs: []modulereader.VarInfo{ - { - Name: "deployment_name", - Type: "string", - }, - { - Name: matchingIntergroupName, - }, + {Name: "deployment_name", Type: cty.String}, + {Name: matchingIntergroupName}, }, Outputs: []modulereader.OutputInfo{}, } diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 665d37b616..cd57bc9e5e 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -18,7 +18,6 @@ import ( "errors" "fmt" "regexp" - "strings" "hpc-toolkit/pkg/modulereader" @@ -121,8 +120,8 @@ func (dc *DeploymentConfig) expandBackends() { } } -func getModuleInputMap(inputs []modulereader.VarInfo) map[string]string { - modInputs := make(map[string]string) +func getModuleInputMap(inputs []modulereader.VarInfo) map[string]cty.Type { + modInputs := make(map[string]cty.Type) for _, input := range inputs { modInputs[input.Name] = input.Type } @@ -178,7 +177,7 @@ func useModule(mod *Module, use Module) { // skip settings that are not of list type, but already have a value // these were probably added by a previous call to this function - isList := strings.HasPrefix(inputType, "list") + isList := inputType.IsListType() if alreadySet && !isList { continue } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 4b91c0f80f..69a27d7aa7 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -77,10 +77,7 @@ func (s *zeroSuite) TestUseModule(c *C) { ID: "UsedModule", Source: "usedSource", } - varInfoNumber := modulereader.VarInfo{ - Name: "val1", - Type: "number", - } + varInfoNumber := modulereader.VarInfo{Name: "val1", Type: cty.Number} ref := ModuleRef("UsedModule", "val1").AsExpression().AsValue() { // Pass: No Inputs, No Outputs @@ -152,7 +149,7 @@ func (s *zeroSuite) TestUseModule(c *C) { { // Pass: Single Input/Output match, input is list, not already set mod := Module{ID: "lime", Source: "limeTree"} setTestModuleInfo(mod, modulereader.ModuleInfo{ - Inputs: []modulereader.VarInfo{{Name: "val1", Type: "list"}}, + Inputs: []modulereader.VarInfo{{Name: "val1", Type: cty.List(cty.Number)}}, }) setTestModuleInfo(used, modulereader.ModuleInfo{ Outputs: []modulereader.OutputInfo{{Name: "val1"}}, @@ -169,7 +166,7 @@ func (s *zeroSuite) TestUseModule(c *C) { mod := Module{ID: "lime", Source: "limeTree"} mod.Settings.Set("val1", AsProductOfModuleUse(cty.TupleVal([]cty.Value{ref}), "other")) setTestModuleInfo(mod, 
modulereader.ModuleInfo{ - Inputs: []modulereader.VarInfo{{Name: "val1", Type: "list"}}, + Inputs: []modulereader.VarInfo{{Name: "val1", Type: cty.List(cty.Number)}}, }) setTestModuleInfo(used, modulereader.ModuleInfo{ Outputs: []modulereader.OutputInfo{{Name: "val1"}}, @@ -187,7 +184,7 @@ func (s *zeroSuite) TestUseModule(c *C) { mod := Module{ID: "lime", Source: "limeTree"} mod.Settings.Set("val1", cty.TupleVal([]cty.Value{ref})) setTestModuleInfo(mod, modulereader.ModuleInfo{ - Inputs: []modulereader.VarInfo{{Name: "val1", Type: "list"}}, + Inputs: []modulereader.VarInfo{{Name: "val1", Type: cty.List(cty.Number)}}, }) setTestModuleInfo(used, modulereader.ModuleInfo{ Outputs: []modulereader.OutputInfo{{Name: "val1"}}, @@ -220,9 +217,7 @@ func (s *MySuite) TestApplyUseModules(c *C) { setTestModuleInfo(using, modulereader.ModuleInfo{ Inputs: []modulereader.VarInfo{{ - Name: "potato", - Type: "number", - }}}) + Name: "potato", Type: cty.Number}}}) setTestModuleInfo(used, modulereader.ModuleInfo{ Outputs: []modulereader.OutputInfo{ {Name: "potato"}}}) @@ -332,7 +327,7 @@ func (s *MySuite) TestApplyGlobalVariables(c *C) { setTestModuleInfo(*mod, modulereader.ModuleInfo{ Inputs: []modulereader.VarInfo{{ Name: "gold", - Type: "string", + Type: cty.String, Required: true, }}, }) @@ -353,7 +348,7 @@ func (s *MySuite) TestApplyGlobalVariables(c *C) { setTestModuleInfo(*mod, modulereader.ModuleInfo{ Inputs: []modulereader.VarInfo{{ Name: "gold", - Type: "string", + Type: cty.String, Required: false, }}, }) diff --git a/pkg/inspect/modules_test.go b/pkg/inspect/modules_test.go index 8e5c1bbba6..4d6f373aeb 100644 --- a/pkg/inspect/modules_test.go +++ b/pkg/inspect/modules_test.go @@ -23,6 +23,8 @@ import ( "strings" "testing" + "github.com/hashicorp/hcl/v2/ext/typeexpr" + "github.com/zclconf/go-cty/cty" "golang.org/x/exp/slices" ) @@ -121,7 +123,7 @@ func checkInputType(t *testing.T, mod modInfo, input string, expected string) { t.Errorf("%s does not have input %s", mod.Source, input) } expected = modulereader.NormalizeType(expected) - got := modulereader.NormalizeType(i.Type) + got := typeexpr.TypeString(i.Type) if expected != got { t.Errorf("%s %s has unexpected type expected:\n%#v\ngot:\n%#v", mod.Source, input, expected, got) @@ -148,7 +150,7 @@ func TestNetworkStorage(t *testing.T) { for _, mod := range notEmpty(query(hasInput("network_storage")), t) { i, _ := mod.Input("network_storage") - got := modulereader.NormalizeType(i.Type) + got := typeexpr.TypeString(i.Type) if got != obj && got != lst { t.Errorf("%s `network_storage` has unexpected type expected:\n%#v\nor\n%#v\ngot:\n%#v", mod.Source, obj, lst, got) @@ -189,7 +191,7 @@ func TestMetadataInjectModuleId(t *testing.T) { if !ok { t.Fatalf("has no input %q", gm.InjectModuleId) } - if in.Type != "string" { + if in.Type != cty.String { t.Errorf("%q type is not a string, but %q", gm.InjectModuleId, in.Type) } }) diff --git a/pkg/modulereader/hcl_utils.go b/pkg/modulereader/hcl_utils.go index 892f5c35e3..5119aba662 100644 --- a/pkg/modulereader/hcl_utils.go +++ b/pkg/modulereader/hcl_utils.go @@ -61,9 +61,13 @@ func getHCLInfo(source string) (ModuleInfo, error) { var vars []VarInfo var outs []OutputInfo for _, v := range module.Variables { + ty, err := GetCtyType(v.Type) + if err != nil { + return ModuleInfo{}, fmt.Errorf("failed to parse type of variable %q: %w", v.Name, err) + } vInfo := VarInfo{ Name: v.Name, - Type: v.Type, + Type: ty, Description: v.Description, Default: v.Default, Required: v.Required, @@ -82,15 +86,27 @@ func 
getHCLInfo(source string) (ModuleInfo, error) { return ret, nil } -// Transforms HCL type string into cty.Type -func getCtyType(hclType string) (cty.Type, error) { +// Transforms Terraform type string into cty.Type +func GetCtyType(hclType string) (cty.Type, error) { + if hclType == "" { // treat empty type as `any` + // see https://developer.hashicorp.com/terraform/language/values/variables#type-constraints + return cty.DynamicPseudoType, nil + } expr, diags := hclsyntax.ParseExpression([]byte(hclType), "", hcl.Pos{Line: 1, Column: 1}) if diags.HasErrors() { - return cty.Type{}, diags + return cty.NilType, diags } - typ, diags := typeexpr.TypeConstraint(expr) + + switch hcl.ExprAsKeyword(expr) { + case "list": + return cty.List(cty.DynamicPseudoType), nil + case "map": + return cty.Map(cty.DynamicPseudoType), nil + } + + typ, _, diags := typeexpr.TypeConstraintWithDefaults(expr) if diags.HasErrors() { - return cty.Type{}, diags + return cty.NilType, diags } return typ, nil } @@ -104,7 +120,7 @@ func getCtyType(hclType string) (cty.Type, error) { // // This method is fail-safe, if error arises passed type will be returned without changes. func NormalizeType(hclType string) string { - ctyType, err := getCtyType(hclType) + ctyType, err := GetCtyType(hclType) if err != nil { logging.Error("Failed to parse HCL type='%s', got %v", hclType, err) return hclType diff --git a/pkg/modulereader/resreader.go b/pkg/modulereader/resreader.go index 1390756e66..be054563a6 100644 --- a/pkg/modulereader/resreader.go +++ b/pkg/modulereader/resreader.go @@ -25,13 +25,14 @@ import ( "path" "github.com/hashicorp/go-getter" + "github.com/zclconf/go-cty/cty" "gopkg.in/yaml.v3" ) // VarInfo stores information about a module input variables type VarInfo struct { Name string - Type string + Type cty.Type Description string Default interface{} Required bool diff --git a/pkg/modulereader/resreader_test.go b/pkg/modulereader/resreader_test.go index a33a17da74..2db972b52f 100644 --- a/pkg/modulereader/resreader_test.go +++ b/pkg/modulereader/resreader_test.go @@ -21,6 +21,7 @@ import ( "path/filepath" "testing" + "github.com/zclconf/go-cty/cty" . 
"gopkg.in/check.v1" "gopkg.in/yaml.v3" ) @@ -88,7 +89,7 @@ func (s *MySuite) TestGetModuleInfo_Embedded(c *C) { c.Check(mi, DeepEquals, ModuleInfo{ Inputs: []VarInfo{{ Name: "test_variable", - Type: "string", + Type: cty.String, Description: "This is just a test", Required: true}}, Outputs: []OutputInfo{{ @@ -136,7 +137,7 @@ func (s *MySuite) TestGetModuleInfo_Local(c *C) { c.Check(mi, DeepEquals, ModuleInfo{ Inputs: []VarInfo{{ Name: "test_variable", - Type: "string", + Type: cty.String, Description: "This is just a test", Required: true}}, Outputs: []OutputInfo{{ @@ -190,7 +191,7 @@ func (s *MySuite) TestGetInfo_TFReder(c *C) { info, err := reader.GetInfo(s.terraformDir) c.Assert(err, IsNil) c.Check(info, DeepEquals, ModuleInfo{ - Inputs: []VarInfo{{Name: "test_variable", Type: "string", Description: "This is just a test", Required: true}}, + Inputs: []VarInfo{{Name: "test_variable", Type: cty.String, Description: "This is just a test", Required: true}}, Outputs: []OutputInfo{{Name: "test_output", Description: "This is just a test"}}, }) @@ -201,7 +202,7 @@ func (s *MySuite) TestGetInfo_PackerReader(c *C) { exp := ModuleInfo{ Inputs: []VarInfo{{ Name: "test_variable", - Type: "string", + Type: cty.String, Description: "This is just a test", Required: true}}} diff --git a/pkg/modulewriter/modulewriter_test.go b/pkg/modulewriter/modulewriter_test.go index 2c85fe36b6..1d63ac989b 100644 --- a/pkg/modulewriter/modulewriter_test.go +++ b/pkg/modulewriter/modulewriter_test.go @@ -27,6 +27,8 @@ import ( "strings" "testing" + "github.com/google/go-cmp/cmp" + "github.com/hashicorp/hcl/v2/ext/typeexpr" "github.com/hashicorp/hcl/v2/hclwrite" "github.com/spf13/afero" "github.com/zclconf/go-cty/cty" @@ -234,81 +236,31 @@ func (s *MySuite) TestRestoreTfState(c *C) { c.Check(err, IsNil) } -func (s *zeroSuite) TestGetTypeTokens(c *C) { - // Success Integer - tok := getTypeTokens(cty.NumberIntVal(-1)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("number"))) - - tok = getTypeTokens(cty.NumberIntVal(0)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("number"))) - - tok = getTypeTokens(cty.NumberIntVal(1)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("number"))) - - // Success Float - tok = getTypeTokens(cty.NumberFloatVal(-99.9)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("number"))) - - tok = getTypeTokens(cty.NumberFloatVal(99.9)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("number"))) - - // Success String - tok = getTypeTokens(cty.StringVal("Lorum")) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("string"))) - - tok = getTypeTokens(cty.StringVal("")) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("string"))) - - // Success Bool - tok = getTypeTokens(cty.BoolVal(true)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("bool"))) - - tok = getTypeTokens(cty.BoolVal(false)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("bool"))) - - // Success tuple - tok = getTypeTokens(cty.TupleVal([]cty.Value{})) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("list"))) - - tok = getTypeTokens(cty.TupleVal([]cty.Value{cty.StringVal("Lorum")})) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("list"))) - - // Success list - tok = 
getTypeTokens(cty.ListVal([]cty.Value{cty.StringVal("Lorum")})) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("list"))) - - // Success object - tok = getTypeTokens(cty.ObjectVal(map[string]cty.Value{})) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("any"))) - - val := cty.ObjectVal(map[string]cty.Value{"Lorum": cty.StringVal("Ipsum")}) - tok = getTypeTokens(val) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("any"))) - - // Success Map - val = cty.MapVal(map[string]cty.Value{"Lorum": cty.StringVal("Ipsum")}) - tok = getTypeTokens(val) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("any"))) - - // Success any - tok = getTypeTokens(cty.NullVal(cty.DynamicPseudoType)) - c.Assert(tok, HasLen, 1) - c.Assert(string(tok[0].Bytes), Equals, string([]byte("any"))) +func TestGetTypeTokensRelaxed(t *testing.T) { + type test struct { + input cty.Type + want string + } + tests := []test{ + {cty.Number, "number"}, + {cty.String, "string"}, + {cty.Bool, "bool"}, + {cty.Tuple([]cty.Type{}), "list(any)"}, + {cty.Tuple([]cty.Type{cty.String}), "list(any)"}, + {cty.List(cty.String), "list(any)"}, + {cty.Object(map[string]cty.Type{}), "any"}, + {cty.Object(map[string]cty.Type{"Lorum": cty.String}), "any"}, + {cty.Map(cty.String), "any"}, + {cty.DynamicPseudoType, "any"}, + } + for _, tc := range tests { + t.Run(typeexpr.TypeString(tc.input), func(t *testing.T) { + got := string(getTypeTokens(relaxVarType(tc.input)).Bytes()) + if diff := cmp.Diff(tc.want, got); diff != "" { + t.Errorf("diff (-want +got):\n%s", diff) + } + }) + } } func (s *MySuite) TestCreateBaseFile(c *C) { diff --git a/pkg/modulewriter/tfwriter.go b/pkg/modulewriter/tfwriter.go index 3db6713d6d..563b15d015 100644 --- a/pkg/modulewriter/tfwriter.go +++ b/pkg/modulewriter/tfwriter.go @@ -120,18 +120,18 @@ func writeTfvars(vars map[string]cty.Value, dst string) error { return err } -func getHclType(t cty.Type) string { +func relaxVarType(t cty.Type) cty.Type { if t.IsPrimitiveType() { - return typeexpr.TypeString(t) + return t } if t.IsListType() || t.IsTupleType() || t.IsSetType() { - return "list" + return cty.List(cty.DynamicPseudoType) // list of any } - return typeexpr.TypeString(cty.DynamicPseudoType) // any + return cty.DynamicPseudoType // any } -func getTypeTokens(v cty.Value) hclwrite.Tokens { - return simpleTokens(getHclType(v.Type())) +func getTypeTokens(ty cty.Type) hclwrite.Tokens { + return simpleTokens(typeexpr.TypeString(ty)) } func writeVariables(vars map[string]cty.Value, extraVars []modulereader.VarInfo, dst string) error { @@ -143,13 +143,11 @@ func writeVariables(vars map[string]cty.Value, extraVars []modulereader.VarInfo, var inputs []modulereader.VarInfo for k, v := range vars { - typeStr := getHclType(v.Type()) - newInput := modulereader.VarInfo{ + inputs = append(inputs, modulereader.VarInfo{ Name: k, - Type: typeStr, + Type: relaxVarType(v.Type()), Description: fmt.Sprintf("Toolkit deployment variable: %s", k), - } - inputs = append(inputs, newInput) + }) } inputs = append(inputs, extraVars...) 
slices.SortFunc(inputs, func(i, j modulereader.VarInfo) int { return strings.Compare(i.Name, j.Name) }) @@ -164,7 +162,7 @@ func writeVariables(vars map[string]cty.Value, extraVars []modulereader.VarInfo, hclBlock := hclBody.AppendNewBlock("variable", []string{k.Name}) blockBody := hclBlock.Body() blockBody.SetAttributeValue("description", cty.StringVal(k.Description)) - blockBody.SetAttributeRaw("type", simpleTokens(k.Type)) + blockBody.SetAttributeRaw("type", getTypeTokens(k.Type)) } // Write file @@ -446,7 +444,7 @@ func FindIntergroupVariables(group config.DeploymentGroup, bp config.Blueprint) n := config.AutomaticOutputName(r.Name, r.Module) res[r] = modulereader.VarInfo{ Name: n, - Type: getHclType(cty.DynamicPseudoType), + Type: cty.DynamicPseudoType, Description: "Automatically generated input from previous groups (ghpc import-inputs --help)", Required: true, } From 57303a96ce0e91bca83ae28d9d15b428f21f6504 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 4 Jan 2024 12:56:58 -0800 Subject: [PATCH 009/151] fix: header was over-indented --- modules/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/README.md b/modules/README.md index 769920aa4e..a12d92fcb6 100644 --- a/modules/README.md +++ b/modules/README.md @@ -437,7 +437,7 @@ are supported, `git::https://` for HTTPS or `git::git@github.com` for SSH. Additional formatting and features after `git::` are identical to that of the [GitHub Modules](#github-modules) described above. -##### Google Cloud Storage Modules +#### Google Cloud Storage Modules To use a Terraform module available in a Google Cloud Storage bucket, set the source to a URL with the special `gcs::` prefix, followed by a [GCS bucket object URL](https://cloud.google.com/storage/docs/request-endpoints#typical). From e0ebadde651a79764178bee2c3ae7408215700bf Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 4 Jan 2024 20:19:46 -0800 Subject: [PATCH 010/151] Check if supplied value matches module variable type (#2089) --- pkg/config/expand.go | 30 +++++++++++++++++++++- pkg/config/expression_test.go | 1 + pkg/config/validator_test.go | 6 +++++ pkg/config/yaml.go | 37 +++++++++++++++++---------- pkg/config/yaml_test.go | 7 ++--- pkg/modulereader/metadata.go | 1 - pkg/modulewriter/modulewriter_test.go | 4 +-- pkg/validators/validators_test.go | 24 +++++++++-------- 8 files changed, 79 insertions(+), 31 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index cd57bc9e5e..e1750cb183 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -23,6 +23,7 @@ import ( "github.com/agext/levenshtein" "github.com/zclconf/go-cty/cty" + "github.com/zclconf/go-cty/cty/convert" "golang.org/x/exp/maps" "golang.org/x/exp/slices" ) @@ -87,11 +88,38 @@ func validateModuleInputs(mp modulePath, m Module, bp Blueprint) error { continue } - // TODO: Check set value and input dtypes convertability + errs.At(ip, checkInputValueMatchesType(m.Settings.Get(input.Name), input, bp)) } return errs.OrNil() } +func attemptEvalModuleInput(val cty.Value, bp Blueprint) (cty.Value, bool) { + v, err := evalValue(val, bp) + // there could be legitimate reasons for it. + // e.g. use of module outputs or unsupported (by ghpc) functions + // TODO: + // * substitute module outputs with an UnknownValue + // * skip if uses functions with side-effects, e.g.
`file` + // * add implementation of all pure terraform functions + // * add positive selection for eval-errors to bubble up + return v, err == nil +} + +func checkInputValueMatchesType(val cty.Value, input modulereader.VarInfo, bp Blueprint) error { + v, ok := attemptEvalModuleInput(val, bp) + if !ok || input.Type == cty.NilType { + return nil // skip, can do nothing + } + // cty does panic on some edge cases, e.g. (cty.NilVal) + // we don't anticipate any of those, but just in case, catch panic and swallow it + defer func() { recover() }() + // TODO: consider returning error (not panic) or logging warning + if _, err := convert.Convert(v, input.Type); err != nil { + return fmt.Errorf("unsuitable value for %q: %w", input.Name, err) + } + return nil +} + func (dc *DeploymentConfig) expandBackends() { // 1. DEFAULT: use TerraformBackend configuration (if supplied) in each // resource group diff --git a/pkg/config/expression_test.go b/pkg/config/expression_test.go index be6206a1ee..74703a67ff 100644 --- a/pkg/config/expression_test.go +++ b/pkg/config/expression_test.go @@ -154,6 +154,7 @@ func TestTokensForValueNoLiteral(t *testing.T) { "ba": cty.NumberIntVal(56), })}), "pony.zebra": cty.NilVal, + "zanzibar": cty.NullVal(cty.DynamicPseudoType), }) want := hclwrite.NewEmptyFile() want.Body().AppendUnstructuredTokens(hclwrite.TokensForValue(val)) diff --git a/pkg/config/validator_test.go b/pkg/config/validator_test.go index 95d6e5498a..c79df42096 100644 --- a/pkg/config/validator_test.go +++ b/pkg/config/validator_test.go @@ -37,6 +37,12 @@ func (s *zeroSuite) TestValidateVars(c *C) { c.Check(validateVars(vars), NotNil) } + { // Fail: Null value + vars := Dict{base} + vars.Set("fork", cty.NullVal(cty.String)) + c.Check(validateVars(vars), NotNil) + } + { // Fail: labels not a map vars := Dict{base} vars.Set("labels", cty.StringVal("a_string")) diff --git a/pkg/config/yaml.go b/pkg/config/yaml.go index c3d912335b..4d0fde2d7b 100644 --- a/pkg/config/yaml.go +++ b/pkg/config/yaml.go @@ -219,14 +219,23 @@ func (ms *ModuleIDs) UnmarshalYAML(n *yaml.Node) error { // YamlValue is wrapper around cty.Value to handle YAML unmarshal. type YamlValue struct { - v cty.Value + v cty.Value // do not use this field directly, use Wrap() and Unwrap() instead } // Unwrap returns wrapped cty.Value. func (y YamlValue) Unwrap() cty.Value { + if y.v == cty.NilVal { + // we can't use 0-value of cty.Value (NilVal) + // instead it should be a proper null(any) value + return cty.NullVal(cty.DynamicPseudoType) + } return y.v } +func (y *YamlValue) Wrap(v cty.Value) { + y.v = v +} + // UnmarshalYAML implements custom YAML unmarshaling. 
func (y *YamlValue) UnmarshalYAML(n *yaml.Node) error { var err error @@ -252,24 +261,26 @@ func (y *YamlValue) unmarshalScalar(n *yaml.Node) error { if err != nil { return fmt.Errorf("line %d: %w", n.Line, err) } - if y.v, err = gocty.ToCtyValue(s, ty); err != nil { + v, err := gocty.ToCtyValue(s, ty) + if err != nil { return err } + y.Wrap(v) - if l, is := IsYamlExpressionLiteral(y.v); is { // HCL literal + if l, is := IsYamlExpressionLiteral(y.Unwrap()); is { // HCL literal var e Expression if e, err = ParseExpression(l); err != nil { // TODO: point to exact location within expression, see Diagnostic.Subject return fmt.Errorf("line %d: %w", n.Line, err) } - y.v = e.AsValue() - } else if y.v.Type() == cty.String && hasVariable(y.v.AsString()) { // "simple" variable - e, err := SimpleVarToExpression(y.v.AsString()) + y.Wrap(e.AsValue()) + } else if y.Unwrap().Type() == cty.String && hasVariable(y.Unwrap().AsString()) { // "simple" variable + e, err := SimpleVarToExpression(y.Unwrap().AsString()) if err != nil { // TODO: point to exact location within expression, see Diagnostic.Subject return fmt.Errorf("line %d: %w", n.Line, err) } - y.v = e.AsValue() + y.Wrap(e.AsValue()) } return nil } @@ -281,9 +292,9 @@ func (y *YamlValue) unmarshalObject(n *yaml.Node) error { } mv := map[string]cty.Value{} for k, y := range my { - mv[k] = y.v + mv[k] = y.Unwrap() } - y.v = cty.ObjectVal(mv) + y.Wrap(cty.ObjectVal(mv)) return nil } @@ -294,9 +305,9 @@ func (y *YamlValue) unmarshalTuple(n *yaml.Node) error { } lv := []cty.Value{} for _, y := range ly { - lv = append(lv, y.v) + lv = append(lv, y.Unwrap()) } - y.v = cty.TupleVal(lv) + y.Wrap(cty.TupleVal(lv)) return nil } @@ -306,11 +317,11 @@ func (d *Dict) UnmarshalYAML(n *yaml.Node) error { if err := n.Decode(&v); err != nil { return err } - ty := v.v.Type() + ty := v.Unwrap().Type() if !ty.IsObjectType() { return fmt.Errorf("line %d: must be a mapping, got %s", n.Line, ty.FriendlyName()) } - for k, w := range v.v.AsValueMap() { + for k, w := range v.Unwrap().AsValueMap() { d.Set(k, w) } return nil diff --git a/pkg/config/yaml_test.go b/pkg/config/yaml_test.go index f65f4d6cc3..75aa8242c7 100644 --- a/pkg/config/yaml_test.go +++ b/pkg/config/yaml_test.go @@ -337,10 +337,11 @@ b: null c: ~ d: "null" ` + anyNull := cty.NullVal(cty.DynamicPseudoType) want := cty.ObjectVal(map[string]cty.Value{ - "a": cty.NilVal, - "b": cty.NilVal, - "c": cty.NilVal, + "a": anyNull, + "b": anyNull, + "c": anyNull, "d": cty.StringVal("null"), }) diff --git a/pkg/modulereader/metadata.go b/pkg/modulereader/metadata.go index 3748d3872d..15069f6c11 100644 --- a/pkg/modulereader/metadata.go +++ b/pkg/modulereader/metadata.go @@ -54,7 +54,6 @@ type MetadataGhpc struct { func GetMetadata(source string) (Metadata, error) { var err error var data []byte - // TODO: use bpmetadata.UnmarshalMetadata, it performs some additional checks filePath := filepath.Join(source, "metadata.yaml") switch { diff --git a/pkg/modulewriter/modulewriter_test.go b/pkg/modulewriter/modulewriter_test.go index 1d63ac989b..39e599ccc4 100644 --- a/pkg/modulewriter/modulewriter_test.go +++ b/pkg/modulewriter/modulewriter_test.go @@ -58,8 +58,8 @@ func (s *MySuite) getDeploymentConfigForTest() config.DeploymentConfig { Kind: config.TerraformKind, ID: "testModule", Settings: config.NewDict(map[string]cty.Value{ - "deployment_name": cty.NilVal, - "project_id": cty.NilVal, + "deployment_name": cty.NullVal(cty.String), + "project_id": cty.NullVal(cty.String), }), Outputs: []modulereader.OutputInfo{ { diff --git 
a/pkg/validators/validators_test.go b/pkg/validators/validators_test.go index d9389f19a6..16d46ca519 100644 --- a/pkg/validators/validators_test.go +++ b/pkg/validators/validators_test.go @@ -32,37 +32,39 @@ func Test(t *testing.T) { } func (s *MySuite) TestCheckInputs(c *C) { + dummy := cty.NullVal(cty.String) + { // OK: Inputs is equal to required inputs without regard to ordering i := config.NewDict(map[string]cty.Value{ - "in0": cty.NilVal, - "in1": cty.NilVal}) + "in0": dummy, + "in1": dummy}) c.Check(checkInputs(i, []string{"in0", "in1"}), IsNil) c.Check(checkInputs(i, []string{"in1", "in0"}), IsNil) } { // FAIL: inputs are a proper subset of required inputs i := config.NewDict(map[string]cty.Value{ - "in0": cty.NilVal, - "in1": cty.NilVal}) + "in0": dummy, + "in1": dummy}) err := checkInputs(i, []string{"in0", "in1", "in2"}) c.Check(err, NotNil) } { // FAIL: inputs intersect with required inputs but are not a proper subset i := config.NewDict(map[string]cty.Value{ - "in0": cty.NilVal, - "in1": cty.NilVal, - "in3": cty.NilVal}) + "in0": dummy, + "in1": dummy, + "in3": dummy}) err := checkInputs(i, []string{"in0", "in1", "in2"}) c.Check(err, NotNil) } { // FAIL inputs are a proper superset of required inputs i := config.NewDict(map[string]cty.Value{ - "in0": cty.NilVal, - "in1": cty.NilVal, - "in2": cty.NilVal, - "in3": cty.NilVal}) + "in0": dummy, + "in1": dummy, + "in2": dummy, + "in3": dummy}) err := checkInputs(i, []string{"in0", "in1", "in2"}) c.Check(err, ErrorMatches, "only 3 inputs \\[in0 in1 in2\\] should be provided") } From ae7a00808f36ea7d6a010d3b8dffb79340a13664 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 5 Jan 2024 10:02:46 -0800 Subject: [PATCH 011/151] Add spelling hints for global vars and outputs (#2082) --- pkg/config/config.go | 26 ++++++++++---------------- pkg/config/config_test.go | 6 +++--- pkg/config/errors.go | 1 - pkg/config/expand.go | 36 +++++++++++++++++++++++++----------- 4 files changed, 38 insertions(+), 31 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index c5bab10c64..40809bd7a9 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -101,25 +101,19 @@ func (bp *Blueprint) Module(id ModuleID) (*Module, error) { return mod, nil } -// SuggestModuleIDHint return a correct spelling of given ModuleID id if one -// is close enough (based on maxHintDist) -func (bp Blueprint) SuggestModuleIDHint(id ModuleID) (string, bool) { - clMod := "" - minDist := -1 - bp.WalkModules(func(m *Module) error { - dist := levenshtein.Distance(string(m.ID), string(id), nil) - if minDist == -1.0 || dist < minDist { - minDist = dist - clMod = string(m.ID) +func hintSpelling(s string, dict []string, err error) error { + best, minDist := "", maxHintDist+1 + for _, w := range dict { + d := levenshtein.Distance(s, w, nil) + if d < minDist { + best, minDist = w, d } - return nil - }) - - if clMod != "" && minDist <= maxHintDist { - return clMod, true } + if minDist <= maxHintDist { + return HintError{fmt.Sprintf("did you mean %q?", best), err} + } + return err - return "", false } // ModuleGroup returns the group containing the module diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index c1aa89d8a7..03a374ef4d 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -831,17 +831,17 @@ func (s *zeroSuite) TestValidateModuleSettingReference(c *C) { // FAIL. 
get global hint mod := ModuleID("var") unkModErr := UnknownModuleError{mod} - c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{"Did you mean \"vars\"?", unkModErr}), Equals, true) + c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{`did you mean "vars"?`, unkModErr}), Equals, true) // FAIL. get module ID hint mod = ModuleID("pkp") unkModErr = UnknownModuleError{mod} - c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{fmt.Sprintf("Did you mean \"%s\"?", string(pkr.ID)), unkModErr}), Equals, true) + c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{fmt.Sprintf("did you mean %q?", string(pkr.ID)), unkModErr}), Equals, true) // FAIL. get no hint mod = ModuleID("test") unkModErr = UnknownModuleError{mod} - c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{fmt.Sprintf("Did you mean \"%s\"?", string(pkr.ID)), unkModErr}), Equals, false) + c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{fmt.Sprintf("did you mean %q?", string(pkr.ID)), unkModErr}), Equals, false) c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), unkModErr), Equals, true) } diff --git a/pkg/config/errors.go b/pkg/config/errors.go index d4dcdeb248..18d7bb9be6 100644 --- a/pkg/config/errors.go +++ b/pkg/config/errors.go @@ -153,7 +153,6 @@ const ( errMsgInvalidVar = string("invalid variable definition in") errMsgVarNotFound = string("could not find source of variable") errMsgIntergroupOrder = string("references to outputs from other groups must be to earlier groups") - errMsgNoOutput = string("output not found for a variable") errMsgCannotUsePacker = string("Packer modules cannot be used by other modules") errMsgDuplicateGroup = string("group names must be unique") errMsgDuplicateID = string("module IDs must be unique") diff --git a/pkg/config/expand.go b/pkg/config/expand.go index e1750cb183..bd6001e376 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -21,7 +21,6 @@ import ( "hpc-toolkit/pkg/modulereader" - "github.com/agext/levenshtein" "github.com/zclconf/go-cty/cty" "github.com/zclconf/go-cty/cty/convert" "golang.org/x/exp/maps" @@ -340,10 +339,12 @@ func AutomaticOutputName(outputName string, moduleID ModuleID) string { func validateModuleReference(bp Blueprint, from Module, toID ModuleID) error { to, err := bp.Module(toID) if err != nil { - if hint, ok := bp.SuggestModuleIDHint(toID); ok { - return HintError{fmt.Sprintf("Did you mean \"%s\"?", hint), err} - } - return err + mods := []string{} + bp.WalkModules(func(m *Module) error { + mods = append(mods, string(m.ID)) + return nil + }) + return hintSpelling(string(toID), mods, err) } if to.Kind == PackerKind { @@ -368,15 +369,22 @@ func validateModuleSettingReference(bp Blueprint, mod Module, r Reference) error // simplest case to evaluate is a deployment variable's existence if r.GlobalVar { if !bp.Vars.Has(r.Name) { - return fmt.Errorf("module %#v references unknown global variable %#v", mod.ID, r.Name) + err := fmt.Errorf("module %#v references unknown global variable %#v", mod.ID, r.Name) + vars := maps.Keys(bp.Vars.Items()) + return hintSpelling(r.Name, vars, err) } return nil } if err := validateModuleReference(bp, mod, r.Module); err != nil { var unkModErr UnknownModuleError - if errors.As(err, &unkModErr) && levenshtein.Distance(string(unkModErr.ID), "vars", nil) <= 2 { - return HintError{"Did you mean \"vars\"?", unkModErr} + if errors.As(err, &unkModErr) { + hints := []string{"vars"} + bp.WalkModules(func(m *Module) error { + hints = 
append(hints, string(m.ID)) + return nil + }) + return hintSpelling(string(unkModErr.ID), hints, unkModErr) } return err } @@ -385,9 +393,15 @@ func validateModuleSettingReference(bp Blueprint, mod Module, r Reference) error if err != nil { return err } - found := slices.ContainsFunc(mi.Outputs, func(o modulereader.OutputInfo) bool { return o.Name == r.Name }) - if !found { - return fmt.Errorf("%s: module %s did not have output %s", errMsgNoOutput, tm.ID, r.Name) + + outputs := []string{} + for _, o := range mi.Outputs { + outputs = append(outputs, o.Name) + } + + if !slices.Contains(outputs, r.Name) { + err := fmt.Errorf("module %q does not have output %q", tm.ID, r.Name) + return hintSpelling(r.Name, outputs, err) } return nil } From 8895b44e7c7269a4ad18a5e7114cbae0d30f843d Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 5 Jan 2024 10:05:55 -0800 Subject: [PATCH 012/151] Point ref errors to a location within nested object (#2081) --- pkg/config/config.go | 22 +++++++++------------- pkg/config/expand.go | 2 +- pkg/config/expression.go | 11 +++++------ 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index 40809bd7a9..87b8d30319 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -302,7 +302,7 @@ func (m Module) ListUnusedModules() ModuleIDs { // GetUsedDeploymentVars returns a list of deployment vars used in the given value func GetUsedDeploymentVars(val cty.Value) []string { res := []string{} - for _, ref := range valueReferences(val) { + for ref := range valueReferences(val) { if ref.GlobalVar { res = append(res, ref.Name) } @@ -647,9 +647,10 @@ func (bp *Blueprint) WalkModules(walker func(*Module) error) error { func validateModuleSettingReferences(p modulePath, m Module, bp Blueprint) error { errs := Errors{} for k, v := range m.Settings.Items() { - for _, r := range valueReferences(v) { - // TODO: add a cty.Path suffix to the errors path for better location - errs.At(p.Settings.Dot(k), validateModuleSettingReference(bp, m, r)) + for r, rp := range valueReferences(v) { + errs.At( + p.Settings.Dot(k).Cty(rp), + validateModuleSettingReference(bp, m, r)) } } return errs.OrNil() @@ -678,18 +679,13 @@ func (bp *Blueprint) evalVars() (Dict, error) { dfs = func(n string) error { used[n] = 1 // put on stack v := bp.Vars.Get(n) - for _, ref := range valueReferences(v) { + for ref, rp := range valueReferences(v) { + p := Root.Vars.Dot(n).Cty(rp) if !ref.GlobalVar { - return BpError{ - Root.Vars.Dot(n), - fmt.Errorf("non-global variable %q referenced in expression", ref.Name), - } + return BpError{p, fmt.Errorf("non-global variable %q referenced in expression", ref.Name)} } if used[ref.Name] == 1 { - return BpError{ - Root.Vars.Dot(n), - fmt.Errorf("cyclic dependency detected: %q -> %q", n, ref.Name), - } + return BpError{p, fmt.Errorf("cyclic dependency detected: %q -> %q", n, ref.Name)} } if used[ref.Name] == 0 { if err := dfs(ref.Name); err != nil { diff --git a/pkg/config/expand.go b/pkg/config/expand.go index bd6001e376..2a0c22a34b 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -431,7 +431,7 @@ func (dg DeploymentGroup) FindAllIntergroupReferences(bp Blueprint) []Reference func FindIntergroupReferences(v cty.Value, mod Module, bp Blueprint) []Reference { g := bp.ModuleGroupOrDie(mod.ID) res := []Reference{} - for _, r := range valueReferences(v) { + for r := range valueReferences(v) { if !r.GlobalVar && bp.ModuleGroupOrDie(r.Module).Name != g.Name { res = append(res, r) } diff --git 
a/pkg/config/expression.go b/pkg/config/expression.go index 85a9e58217..2bf4be5ddb 100644 --- a/pkg/config/expression.go +++ b/pkg/config/expression.go @@ -25,7 +25,6 @@ import ( "github.com/zclconf/go-cty/cty" "github.com/zclconf/go-cty/cty/function" "github.com/zclconf/go-cty/cty/function/stdlib" - "golang.org/x/exp/maps" ) // Reference is data struct that represents a reference to a variable. @@ -422,17 +421,17 @@ func functions() map[string]function.Function { } } -func valueReferences(v cty.Value) []Reference { - r := map[Reference]bool{} - cty.Walk(v, func(_ cty.Path, v cty.Value) (bool, error) { +func valueReferences(v cty.Value) map[Reference]cty.Path { + r := map[Reference]cty.Path{} + cty.Walk(v, func(p cty.Path, v cty.Value) (bool, error) { if e, is := IsExpressionValue(v); is { for _, ref := range e.References() { - r[ref] = true + r[ref] = p } } return true, nil }) - return maps.Keys(r) + return r } func evalValue(v cty.Value, bp Blueprint) (cty.Value, error) { From cfd92e7318a9bcec0de57ee56db7633251e67087 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 5 Jan 2024 13:39:43 -0800 Subject: [PATCH 013/151] Refactor `Blueprint.WalkModules` (#2094) * Add a safe version to avoid a useless `return nil`; * Supply `ModulePath` to the "dangerous" version; * Use `WalkModules` instead of nested for-loops in a few cases. --- pkg/config/config.go | 25 +++++++++++++---------- pkg/config/expand.go | 41 ++++++++++++++------------------------ pkg/config/expand_test.go | 19 +----------------- pkg/config/path.go | 4 ++-- pkg/config/validate.go | 6 +++--- pkg/validators/cloud.go | 3 +-- pkg/validators/semantic.go | 17 ++++++-----------
 7 files changed, 43 insertions(+), 72 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index 87b8d30319..a6a2578a18 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -89,11 +89,10 @@ func (g DeploymentGroup) Kind() ModuleKind { // Module return the module with the given ID func (bp *Blueprint) Module(id ModuleID) (*Module, error) { var mod *Module - bp.WalkModules(func(m *Module) error { + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { if m.ID == id { mod = m } - return nil }) if mod == nil { return nil, UnknownModuleError{id} @@ -316,9 +315,8 @@ func (bp Blueprint) ListUnusedVariables() []string { ns := map[string]cty.Value{ "vars": bp.Vars.AsObject(), } - bp.WalkModules(func(m *Module) error { + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { ns["module_"+string(m.ID)] = m.Settings.AsObject() - return nil }) for _, v := range bp.Validators { ns["validator_"+v.Validator] = v.Inputs.AsObject() @@ -392,11 +390,10 @@ func (dc DeploymentConfig) ExportBlueprint(outputFilename string) error { // addKindToModules sets the kind to 'terraform' when empty.
func (bp *Blueprint) addKindToModules() { - bp.WalkModules(func(m *Module) error { + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { if m.Kind == UnknownKind { m.Kind = TerraformKind } - return nil }) } @@ -434,7 +431,7 @@ func checkModulesAndGroups(bp Blueprint) error { // validateModuleUseReferences verifies that any used modules exist and // are in the correct group -func validateModuleUseReferences(p modulePath, mod Module, bp Blueprint) error { +func validateModuleUseReferences(p ModulePath, mod Module, bp Blueprint) error { errs := Errors{} for iu, used := range mod.Use { errs.At(p.Use.At(iu), validateModuleReference(bp, mod, used)) @@ -630,12 +627,13 @@ func IsProductOfModuleUse(v cty.Value) []ModuleID { } // WalkModules walks all modules in the blueprint and calls the walker function -func (bp *Blueprint) WalkModules(walker func(*Module) error) error { +func (bp *Blueprint) WalkModules(walker func(ModulePath, *Module) error) error { for ig := range bp.DeploymentGroups { g := &bp.DeploymentGroups[ig] for im := range g.Modules { + p := Root.Groups.At(ig).Modules.At(im) m := &g.Modules[im] - if err := walker(m); err != nil { + if err := walker(p, m); err != nil { return err } } @@ -643,8 +641,15 @@ func (bp *Blueprint) WalkModules(walker func(*Module) error) error { return nil } +func (bp *Blueprint) WalkModulesSafe(walker func(ModulePath, *Module)) { + bp.WalkModules(func(p ModulePath, m *Module) error { + walker(p, m) + return nil + }) +} + // validate every module setting in the blueprint containing a reference -func validateModuleSettingReferences(p modulePath, m Module, bp Blueprint) error { +func validateModuleSettingReferences(p ModulePath, m Module, bp Blueprint) error { errs := Errors{} for k, v := range m.Settings.Items() { for r, rp := range valueReferences(v) { diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 2a0c22a34b..deccfe8e93 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -50,9 +50,7 @@ func (dc *DeploymentConfig) expand() error { return err } - if err := dc.applyGlobalVariables(); err != nil { - return err - } + dc.applyGlobalVariables() if err := validateInputsAllModules(dc.Config); err != nil { return err @@ -64,16 +62,13 @@ func (dc *DeploymentConfig) expand() error { func validateInputsAllModules(bp Blueprint) error { errs := Errors{} - for ig, g := range bp.DeploymentGroups { - for im, m := range g.Modules { - p := Root.Groups.At(ig).Modules.At(im) - errs.Add(validateModuleInputs(p, m, bp)) - } - } + bp.WalkModulesSafe(func(p ModulePath, m *Module) { + errs.Add(validateModuleInputs(p, *m, bp)) + }) return errs.OrNil() } -func validateModuleInputs(mp modulePath, m Module, bp Blueprint) error { +func validateModuleInputs(mp ModulePath, m Module, bp Blueprint) error { mi := m.InfoOrDie() errs := Errors{} for _, input := range mi.Inputs { @@ -224,7 +219,7 @@ func useModule(mod *Module, use Module) { // applyUseModules applies variables from modules listed in the "use" field // when/if applicable func (dc *DeploymentConfig) applyUseModules() error { - return dc.Config.WalkModules(func(m *Module) error { + return dc.Config.WalkModules(func(_ ModulePath, m *Module) error { for _, u := range m.Use { used, err := dc.Config.Module(u) if err != nil { // should never happen @@ -260,9 +255,8 @@ func (dc *DeploymentConfig) combineLabels() { gl := mergeMaps(defaults, vars.Get(labels).AsValueMap()) vars.Set(labels, cty.ObjectVal(gl)) - dc.Config.WalkModules(func(mod *Module) error { + dc.Config.WalkModulesSafe(func(_ ModulePath, mod *Module) { 
combineModuleLabels(mod, *dc) - return nil }) } @@ -297,7 +291,7 @@ func mergeMaps(ms ...map[string]cty.Value) map[string]cty.Value { return r } -func (bp Blueprint) applyGlobalVarsInModule(mod *Module) error { +func (bp Blueprint) applyGlobalVarsInModule(mod *Module) { mi := mod.InfoOrDie() for _, input := range mi.Inputs { // Module setting exists? Nothing more needs to be done. @@ -316,14 +310,13 @@ func (bp Blueprint) applyGlobalVarsInModule(mod *Module) error { mod.Settings.Set(input.Name, cty.StringVal(string(mod.ID))) } } - return nil } // applyGlobalVariables takes any variables defined at the global level and // applies them to module settings if not already set. -func (dc *DeploymentConfig) applyGlobalVariables() error { - return dc.Config.WalkModules(func(mod *Module) error { - return dc.Config.applyGlobalVarsInModule(mod) +func (dc *DeploymentConfig) applyGlobalVariables() { + dc.Config.WalkModulesSafe(func(_ ModulePath, m *Module) { + dc.Config.applyGlobalVarsInModule(m) }) } @@ -340,9 +333,8 @@ func validateModuleReference(bp Blueprint, from Module, toID ModuleID) error { to, err := bp.Module(toID) if err != nil { mods := []string{} - bp.WalkModules(func(m *Module) error { + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { mods = append(mods, string(m.ID)) - return nil }) return hintSpelling(string(toID), mods, err) } @@ -380,9 +372,8 @@ func validateModuleSettingReference(bp Blueprint, mod Module, r Reference) error var unkModErr UnknownModuleError if errors.As(err, &unkModErr) { hints := []string{"vars"} - bp.WalkModules(func(m *Module) error { + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { hints = append(hints, string(m.ID)) - return nil }) return hintSpelling(string(unkModErr.ID), hints, unkModErr) } @@ -442,15 +433,14 @@ func FindIntergroupReferences(v cty.Value, mod Module, bp Blueprint) []Reference // find all intergroup references and add them to source Module.Outputs func (bp *Blueprint) populateOutputs() { refs := map[Reference]bool{} - bp.WalkModules(func(m *Module) error { + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { rs := FindIntergroupReferences(m.Settings.AsObject(), *m, *bp) for _, r := range rs { refs[r] = true } - return nil }) - bp.WalkModules(func(m *Module) error { + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { for r := range refs { if r.Module != m.ID { continue // find IGC references pointing to this module @@ -465,7 +455,6 @@ func (bp *Blueprint) populateOutputs() { }) } - return nil }) } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 69a27d7aa7..8becc02c5e 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -320,9 +320,6 @@ func (s *MySuite) TestApplyGlobalVariables(c *C) { dc := s.getDeploymentConfigForTest() mod := &dc.Config.DeploymentGroups[0].Modules[0] - // Test no inputs, none required - c.Check(dc.applyGlobalVariables(), IsNil) - // Test no inputs, one required, doesn't exist in globals setTestModuleInfo(*mod, modulereader.ModuleInfo{ Inputs: []modulereader.VarInfo{{ @@ -334,25 +331,11 @@ func (s *MySuite) TestApplyGlobalVariables(c *C) { // Test no input, one required, exists in globals dc.Config.Vars.Set("gold", cty.StringVal("val")) - c.Check(dc.applyGlobalVariables(), IsNil) + dc.applyGlobalVariables() c.Assert( mod.Settings.Get("gold"), DeepEquals, GlobalRef("gold").AsExpression().AsValue()) - - // Test one input, one required - mod.Settings.Set("reqVar", cty.StringVal("val")) - c.Assert(dc.applyGlobalVariables(), IsNil) - - // Test one input, none required, exists in 
globals - setTestModuleInfo(*mod, modulereader.ModuleInfo{ - Inputs: []modulereader.VarInfo{{ - Name: "gold", - Type: cty.String, - Required: false, - }}, - }) - c.Assert(dc.applyGlobalVariables(), IsNil) } func (s *zeroSuite) TestIsSimpleVariable(c *C) { diff --git a/pkg/config/path.go b/pkg/config/path.go index fef57d7b56..cf1a1df58b 100644 --- a/pkg/config/path.go +++ b/pkg/config/path.go @@ -156,10 +156,10 @@ type groupPath struct { basePath Name basePath `path:".group"` Backend backendPath `path:".terraform_backend"` - Modules arrayPath[modulePath] `path:".modules"` + Modules arrayPath[ModulePath] `path:".modules"` } -type modulePath struct { +type ModulePath struct { basePath Source basePath `path:".source"` Kind basePath `path:".kind"` diff --git a/pkg/config/validate.go b/pkg/config/validate.go index 79435eec43..16929056a4 100644 --- a/pkg/config/validate.go +++ b/pkg/config/validate.go @@ -82,7 +82,7 @@ func validateVars(vars Dict) error { return errs.OrNil() } -func validateModule(p modulePath, m Module, bp Blueprint) error { +func validateModule(p ModulePath, m Module, bp Blueprint) error { // Source/Kind validations are required to pass to perform other validations if m.Source == "" { return BpError{p.Source, EmptyModuleSource} @@ -113,7 +113,7 @@ func validateModule(p modulePath, m Module, bp Blueprint) error { OrNil() } -func validateOutputs(p modulePath, mod Module, info modulereader.ModuleInfo) error { +func validateOutputs(p ModulePath, mod Module, info modulereader.ModuleInfo) error { errs := Errors{} outputs := info.GetOutputsAsMap() @@ -133,7 +133,7 @@ type moduleVariables struct { } func validateSettings( - p modulePath, + p ModulePath, mod Module, info modulereader.ModuleInfo) error { diff --git a/pkg/validators/cloud.go b/pkg/validators/cloud.go index 957e034d7a..7b5f6576f8 100644 --- a/pkg/validators/cloud.go +++ b/pkg/validators/cloud.go @@ -188,12 +188,11 @@ func testApisEnabled(bp config.Blueprint, inputs config.Dict) error { return err } apis := map[string]bool{} - bp.WalkModules(func(m *config.Module) error { + bp.WalkModulesSafe(func(_ config.ModulePath, m *config.Module) { services := m.InfoOrDie().Metadata.Spec.Requirements.Services for _, api := range services { apis[api] = true } - return nil }) return TestApisEnabled(p, maps.Keys(apis)) } diff --git a/pkg/validators/semantic.go b/pkg/validators/semantic.go index c0192465a1..b6fc18eae1 100644 --- a/pkg/validators/semantic.go +++ b/pkg/validators/semantic.go @@ -26,19 +26,14 @@ func testModuleNotUsed(bp config.Blueprint, inputs config.Dict) error { return err } errs := config.Errors{} - for ig, g := range bp.DeploymentGroups { - for im, m := range g.Modules { - ums := m.ListUnusedModules() - p := config.Root.Groups.At(ig).Modules.At(im).Use - - for iu, u := range m.Use { - if slices.Contains(ums, u) { - errs.At(p.At(iu), fmt.Errorf(unusedModuleMsg, m.ID, u)) - } + bp.WalkModulesSafe(func(p config.ModulePath, m *config.Module) { + ums := m.ListUnusedModules() + for iu, u := range m.Use { + if slices.Contains(ums, u) { + errs.At(p.Use.At(iu), fmt.Errorf(unusedModuleMsg, m.ID, u)) } } - } - + }) return errs.OrNil() } From 0f4a5598db666946947b8281108fefcf155bc886 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jan 2024 11:10:28 +0000 Subject: [PATCH 014/151] Bump golang.org/x/sys from 0.15.0 to 0.16.0 Bumps [golang.org/x/sys](https://github.com/golang/sys) from 0.15.0 to 0.16.0. 
- [Commits](https://github.com/golang/sys/compare/v0.15.0...v0.16.0) --- updated-dependencies: - dependency-name: golang.org/x/sys dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index f47ea1ccce..ed4f891b8e 100644 --- a/go.mod +++ b/go.mod @@ -98,7 +98,7 @@ require ( golang.org/x/crypto v0.17.0 // indirect golang.org/x/net v0.19.0 // indirect golang.org/x/oauth2 v0.15.0 // indirect - golang.org/x/sys v0.15.0 + golang.org/x/sys v0.16.0 golang.org/x/text v0.14.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.8 // indirect diff --git a/go.sum b/go.sum index 2608c8e1c7..92f5bbfa52 100644 --- a/go.sum +++ b/go.sum @@ -737,8 +737,8 @@ golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= -golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= +golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= From 0da9a3803da070a873411765f9a010c1981b6719 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jan 2024 11:10:41 +0000 Subject: [PATCH 015/151] Bump google.golang.org/api from 0.154.0 to 0.155.0 Bumps [google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.154.0 to 0.155.0. - [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.154.0...v0.155.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 12 ++++++------ go.sum | 24 ++++++++++++------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/go.mod b/go.mod index f47ea1ccce..a9b1b8434d 100644 --- a/go.mod +++ b/go.mod @@ -16,7 +16,7 @@ require ( github.com/spf13/cobra v1.8.0 github.com/zclconf/go-cty v1.14.1 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa - google.golang.org/genproto v0.0.0-20231120223509-83a465c0220f // indirect + google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) @@ -28,7 +28,7 @@ require ( github.com/hashicorp/terraform-exec v0.20.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b - google.golang.org/api v0.154.0 + google.golang.org/api v0.155.0 ) require ( @@ -53,8 +53,8 @@ require ( golang.org/x/sync v0.5.0 // indirect golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.15.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20231120223509-83a465c0220f // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20231127180814-3a041ad873d4 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20231211222908-989df2bf70f3 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20231212172506-995d672761c0 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect ) @@ -73,7 +73,7 @@ require ( github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/google/s2a-go v0.1.7 // indirect - github.com/google/uuid v1.4.0 // indirect + github.com/google/uuid v1.5.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect github.com/hashicorp/go-cleanhttp v0.5.2 // indirect github.com/hashicorp/go-safetemp v1.0.0 // indirect @@ -102,7 +102,7 @@ require ( golang.org/x/text v0.14.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.8 // indirect - google.golang.org/grpc v1.59.0 // indirect + google.golang.org/grpc v1.60.1 // indirect google.golang.org/protobuf v1.31.0 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect ) diff --git a/go.sum b/go.sum index 2608c8e1c7..96321651ef 100644 --- a/go.sum +++ b/go.sum @@ -353,8 +353,8 @@ github.com/google/s2a-go v0.1.7 h1:60BLSyTrOV4/haCDW4zb1guZItoSq8foHCXrAnjBo/o= github.com/google/s2a-go v0.1.7/go.mod h1:50CgR4k1jNlWBu4UfS4AcfhVe1r6pdZPygJ3R8F0Qdw= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/google/uuid v1.4.0 h1:MtMxsa51/r9yyhkyLsVeVt0B+BGQZzpQiTQ4eHZ8bc4= -github.com/google/uuid v1.4.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU= +github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/enterprise-certificate-proxy v0.0.0-20220520183353-fd19c99a87aa/go.mod h1:17drOmN3MwGY7t0e+Ei9b45FFGA3fBs3x36SsCg1hq8= github.com/googleapis/enterprise-certificate-proxy v0.1.0/go.mod h1:17drOmN3MwGY7t0e+Ei9b45FFGA3fBs3x36SsCg1hq8= github.com/googleapis/enterprise-certificate-proxy v0.2.0/go.mod h1:8C0jb7/mgJe/9KK8Lm7X9ctZC2t60YyIpYEI16jx0Qg= @@ -878,8 +878,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod 
h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.154.0 h1:X7QkVKZBskztmpPKWQXgjJRPA2dJYrL6r+sYPRLj050= -google.golang.org/api v0.154.0/go.mod h1:qhSMkM85hgqiokIYsrRyKxrjfBeIhgl4Z2JmeRkYylc= +google.golang.org/api v0.155.0 h1:vBmGhCYs0djJttDNynWo44zosHlPvHmA0XiN2zP2DtA= +google.golang.org/api v0.155.0/go.mod h1:GI5qK5f40kCpHfPn6+YzGAByIKWv8ujFnmoWm7Igduk= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -990,12 +990,12 @@ google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqw google.golang.org/genproto v0.0.0-20221014173430-6e2ab493f96b/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221014213838-99cd37c6964a/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71/go.mod h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s= -google.golang.org/genproto v0.0.0-20231120223509-83a465c0220f h1:Vn+VyHU5guc9KjB5KrjI2q0wCOWEOIh0OEsleqakHJg= -google.golang.org/genproto v0.0.0-20231120223509-83a465c0220f/go.mod h1:nWSwAFPb+qfNJXsoeO3Io7zf4tMSfN8EA8RlDA04GhY= -google.golang.org/genproto/googleapis/api v0.0.0-20231120223509-83a465c0220f h1:2yNACc1O40tTnrsbk9Cv6oxiW8pxI/pXj0wRtdlYmgY= -google.golang.org/genproto/googleapis/api v0.0.0-20231120223509-83a465c0220f/go.mod h1:Uy9bTZJqmfrw2rIBxgGLnamc78euZULUBrLZ9XTITKI= -google.golang.org/genproto/googleapis/rpc v0.0.0-20231127180814-3a041ad873d4 h1:DC7wcm+i+P1rN3Ff07vL+OndGg5OhNddHyTA+ocPqYE= -google.golang.org/genproto/googleapis/rpc v0.0.0-20231127180814-3a041ad873d4/go.mod h1:eJVxU6o+4G1PSczBr85xmyvSNYAKvAYgkub40YGomFM= +google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3 h1:1hfbdAfFbkmpg41000wDVqr7jUpK/Yo+LPnIxxGzmkg= +google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3/go.mod h1:5RBcpGRxr25RbDzY5w+dmaqpSEvl8Gwl1x2CICf60ic= +google.golang.org/genproto/googleapis/api v0.0.0-20231211222908-989df2bf70f3 h1:EWIeHfGuUf00zrVZGEgYFxok7plSAXBGcH7NNdMAWvA= +google.golang.org/genproto/googleapis/api v0.0.0-20231211222908-989df2bf70f3/go.mod h1:k2dtGpRrbsSyKcNPKKI5sstZkrNCZwpU/ns96JoHbGg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20231212172506-995d672761c0 h1:/jFB8jK5R3Sq3i/lmeZO0cATSzFfZaJq1J2Euan3XKU= +google.golang.org/genproto/googleapis/rpc v0.0.0-20231212172506-995d672761c0/go.mod h1:FUoWkonphQm3RhTS+kOEhF8h0iDpm4tdXolVCeZ9KKA= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -1031,8 +1031,8 @@ google.golang.org/grpc v1.48.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACu google.golang.org/grpc v1.49.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.1/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= -google.golang.org/grpc v1.59.0 h1:Z5Iec2pjwb+LEOqzpB2MR12/eKFhDPhuqW91O+4bwUk= -google.golang.org/grpc v1.59.0/go.mod 
h1:aUPDwccQo6OTjy7Hct4AfBPD1GptF4fyUjIkQ9YtF98= +google.golang.org/grpc v1.60.1 h1:26+wFr+cNqSGFcOXcabYC0lUVJVRa2Sb2ortSK7VrEU= +google.golang.org/grpc v1.60.1/go.mod h1:OlCHIeLYqSSsLi6i49B5QGdzaMZK9+M7LXN2FKz4eGM= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= From 1b2175f0de9e0d9b92f3ba3fa831590697eee4b3 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Thu, 4 Jan 2024 07:06:48 +0000 Subject: [PATCH 016/151] Update spack openfoam example to Slurm V6 --- docs/tutorials/openfoam/spack-openfoam.md | 59 +++++++++------------ docs/tutorials/openfoam/spack-openfoam.yaml | 41 +++++++------- 2 files changed, 48 insertions(+), 52 deletions(-) diff --git a/docs/tutorials/openfoam/spack-openfoam.md b/docs/tutorials/openfoam/spack-openfoam.md index b408ee15d4..4342ab8a5e 100644 --- a/docs/tutorials/openfoam/spack-openfoam.md +++ b/docs/tutorials/openfoam/spack-openfoam.md @@ -5,7 +5,7 @@ easy for customers to deploy HPC environments on Google Cloud. In this tutorial you will use the HPC Toolkit to: -* Deploy a [Slurm](https://github.com/SchedMD/slurm-gcp#readme) HPC cluster on +* Deploy a [Slurm](https://github.com/GoogleCloudPlatform/slurm-gcp#readme) HPC cluster on Google Cloud * Use [Spack](https://spack.io/) to install the OpenFOAM application and all of its dependencies @@ -13,10 +13,10 @@ In this tutorial you will use the HPC Toolkit to: cluster * Tear down the cluster -Estimated time to complete: -The tutorial takes 3 hr. to complete, -of which 2.5 hr is for installing software -(without cache). +Estimated time to complete: +The tutorial takes 3 hr. to complete, +of which 2.5 hr is for installing software +(without cache). > **_NOTE:_** With a complete Spack cache, the tutorial takes 30 min. @@ -31,7 +31,7 @@ Once you have selected a project, click START. ## Enable APIs & Permissions In a new Google Cloud project there are several apis that must be enabled to -deploy your HPC cluster. These will be caught when you perform `terraform apply` +deploy your HPC cluster. These will be caught when you perform `./ghpc create` but you can save time by enabling them now by running: @@ -84,13 +84,9 @@ This file describes the cluster you will deploy. It defines: * sets up a Spack environment including downloading an example input deck * places a submission script on a shared drive * a Slurm cluster - * a Slurm login node * a Slurm controller * An auto-scaling Slurm partition -[This diagram](https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/application_demo/docs/tutorials/application_demo.md#blueprint-diagram) -shows how the different modules relate to each other. - After you have inspected the file, use the ghpc binary to create a deployment folder by running: @@ -106,24 +102,19 @@ contains the terraform needed to deploy your cluster. ## Deploy the Cluster -Use the following commands to run terraform and deploy your cluster. +Use the command below to deploy your cluster. ```bash -terraform -chdir=spack-openfoam/primary init -terraform -chdir=spack-openfoam/primary apply +./ghpc deploy spack-openfoam ``` -The `terraform apply` command will generate a _plan_ that describes the Google +You can also use the command below to generate a _plan_ that describes the Google Cloud resources that will be deployed.
-You can review the plan and then start the deployment by typing
-**`yes [enter]`**.
-
-The deployment will take about 30 seconds. There should be regular status updates
-in the terminal.
-
-If the `apply` is successful, a message similar to the following will be
-displayed:
+```bash
+terraform -chdir=spack-openfoam/primary init
+terraform -chdir=spack-openfoam/primary apply
+```

@@ -153,21 +144,21 @@ the final output from the above command:

Optionally while you wait, you can see your deployed VMs on Google Cloud
Console. Open the link below in a new window. Look for
-`slurm-spack-openfoam-controller` and `slurm-spack-openfoam-login0`. If you don't
+`slurm-spack-openfoam-controller`. If you don't
see your VMs make sure you have the correct project selected (top left).

```text
https://console.cloud.google.com/compute?project=
```

-## Connecting to the login node
+## Connecting to the controller node

-Once the startup script has completed, connect to the login node.
+Once the startup script has completed, connect to the controller node.

-Use the following command to ssh into the login node from cloud shell:
+Use the following command to ssh into the controller node from cloud shell:

```bash
-gcloud compute ssh slurm-spack-openfoam-login0 --zone us-central1-c --project
+gcloud compute ssh spackopenf-controller --zone us-central1-c --project
```

You may be prompted to set up SSH. If so follow the prompts and if asked for a
@@ -191,15 +182,15 @@ following instructions:

```text
https://console.cloud.google.com/compute?project=
```

-1. Click on the `SSH` button associated with the `slurm-spack-openfoam-login0`
+1. Click on the `SSH` button associated with the `spackopenf-controller`
   instance.

   This will open a separate pop up window with a terminal into our newly
-   created Slurm login VM.
+   created Slurm controller VM.

## Run a Job on the Cluster

-   **The commands below should be run on the Slurm login node.**
+   **The commands below should be run on the Slurm controller node.**

We will use the submission script (see line 122 of the blueprint) to submit
an OpenFOAM job.
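The exact submission commands live outside the hunks of this diff; as a rough sketch (assuming the script path staged by the blueprint, `/apps/openfoam/submit_openfoam.sh`, and stock Slurm tooling), the flow on the controller node looks like:

```bash
# Create the working directory referenced below as test_run/.
mkdir -p test_run && cd test_run

# Submit the OpenFOAM job using the script the blueprint placed on shared storage.
sbatch /apps/openfoam/submit_openfoam.sh

# Watch the queue; autoscaled compute nodes take a few minutes to come up.
squeue
```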
@@ -247,7 +238,7 @@ about 5 minutes to run.

Several files will have been generated in the `test_run/` folder you created.

The `slurm-1.out` file has information on the run such as performance. You can
-view this file by running the following command on the login node:
+view this file by running the following command on the controller node:

```bash
cat slurm-*.out
```

@@ -268,9 +259,9 @@ https://console.cloud.google.com/monitoring/dashboards?project=

## Destroy the Cluster

-> **_NOTE:_** If you are accessing the login node terminal via a separate pop-up
+> **_NOTE:_** If you are accessing the controller node terminal via a separate pop-up
> then make sure to call `exit` in the pop-up window.

```bash
@@ -280,7 +271,7 @@ exit
```

Run the following command in the cloud shell terminal to destroy the cluster:

```bash
-terraform -chdir=spack-openfoam/primary destroy -auto-approve
+./ghpc destroy spack-openfoam
```

When complete you should see something like:

diff --git a/docs/tutorials/openfoam/spack-openfoam.yaml b/docs/tutorials/openfoam/spack-openfoam.yaml
index 4512c0446d..c15851fe17 100644
--- a/docs/tutorials/openfoam/spack-openfoam.yaml
+++ b/docs/tutorials/openfoam/spack-openfoam.yaml
@@ -26,7 +26,7 @@ deployment_groups:
 - group: primary
   modules:
   - id: network1
-    source: modules/network/pre-existing-vpc
+    source: modules/network/vpc

   - id: hpc_dash
     source: modules/monitoring/dashboard
@@ -36,7 +36,7 @@ deployment_groups:
     source: community/modules/scripts/spack-setup
     settings:
       install_dir: /apps/spack
-      spack_ref: v0.19.0
+      spack_ref: v0.20.0

   - id: spack-execute
     source: community/modules/scripts/spack-execute
@@ -99,8 +99,8 @@ deployment_groups:
       spack config --scope defaults add -f /tmp/projections-config.yaml
       spack config --scope site add -f /tmp/slurm-external-config.yaml

-      spack install gcc@9.3.0 %gcc@4.8.5 target=x86_64
-      spack load gcc@9.3.0 %gcc@4.8.5 target=x86_64
+      spack install gcc@9.3.0 %gcc@8.5.0 target=x86_64
+      spack load gcc@9.3.0 %gcc@8.5.0 target=x86_64
       spack compiler find --scope site

       if ! spack env list | grep -q openfoam; then
@@ -114,6 +114,11 @@ deployment_groups:
     source: modules/scripts/startup-script
     settings:
       runners:
+      - type: shell
+        destination: remove_lustre_client.sh
+        content: |
+          #!/bin/bash
+          rm /etc/yum.repos.d/lustre-client.repo
       - $(spack-execute.spack_runner)
       - type: shell
         destination: setup_openfoam.sh
@@ -122,8 +127,6 @@ deployment_groups:
           source /apps/spack/share/spack/setup-env.sh
           spack env activate openfoam
           chmod -R a+rwX /apps/spack/var/spack/environments/openfoam
-          mkdir -p /apps/openfoam
-          chmod a+rwx /apps/openfoam
       - type: data
         destination: /apps/openfoam/submit_openfoam.sh
         content: |
@@ -153,25 +156,27 @@ deployment_groups:
           mpirun -n 60 -npernode 30 -hostfile hostfile snappyHexMesh -overwrite -parallel
           mpirun -n 60 -npernode 30 -hostfile hostfile potentialFoam -parallel
           mpirun -n 60 -npernode 30 -hostfile hostfile simpleFoam -parallel
+
+  - id: compute_nodeset
+    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
+    use: [network1]
+    settings:
+      node_count_dynamic_max: 2
+      bandwidth_tier: gvnic_enabled
+
   - id: compute_partition
-    source: community/modules/compute/SchedMD-slurm-on-gcp-partition
-    use:
-    - network1
+    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
+    use: [compute_nodeset]
     settings:
       partition_name: compute
-      max_node_count: 20
+      is_default: true

   - id: slurm_controller
-    source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller
+    source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
     use:
     - network1
     - compute_partition
     settings:
       controller_startup_script: $(controller-setup.startup_script)
-      login_node_count: 1
-
-  - id: slurm_login
-    source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node
-    use:
-    - network1
-    - slurm_controller
+      controller_startup_scripts_timeout: 21600
+      disable_controller_public_ips: false

From cd16da6afec21bbd2fe9f6fedbcfa11d8807b69b Mon Sep 17 00:00:00 2001
From: Ivan Orlov
Date: Mon, 8 Jan 2024 13:27:03 -0800
Subject: [PATCH 017/151] Add test that checks that modules don't output forbidden names (#2091)

---
 pkg/inspect/modules_test.go | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/pkg/inspect/modules_test.go 
b/pkg/inspect/modules_test.go index 4d6f373aeb..e0558c1de6 100644 --- a/pkg/inspect/modules_test.go +++ b/pkg/inspect/modules_test.go @@ -197,3 +197,24 @@ func TestMetadataInjectModuleId(t *testing.T) { }) } } + +func TestOutputForbiddenNames(t *testing.T) { + nowhere := []string{} + allowed := map[string][]string{ + // Global blueprint variables we don't want to get overwritten. + "project_id": {"community/modules/project/new-project"}, + "labels": nowhere, + "region": nowhere, + "zone": nowhere, + "deployment_name": nowhere, + } + for _, mod := range query(all()) { + t.Run(mod.Source, func(t *testing.T) { + for _, out := range mod.Outputs { + if where, ok := allowed[out.Name]; ok && !slices.Contains(where, mod.Source) { + t.Errorf("forbidden name for output %q", out.Name) + } + } + }) + } +} From 18a060b01aef0358db304772737a6c9b928ff34d Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 9 Jan 2024 19:04:41 +0000 Subject: [PATCH 018/151] Remove hpc-slurm-legacy example and references --- community/examples/hpc-slurm-legacy.yaml | 113 ------------------ examples/README.md | 49 -------- .../daily-tests/builds/hpc-high-io-v4.yaml | 53 -------- .../daily-tests/tests/hpc-high-io.yml | 36 ------ 4 files changed, 251 deletions(-) delete mode 100644 community/examples/hpc-slurm-legacy.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/hpc-high-io-v4.yaml delete mode 100644 tools/cloud-build/daily-tests/tests/hpc-high-io.yml diff --git a/community/examples/hpc-slurm-legacy.yaml b/community/examples/hpc-slurm-legacy.yaml deleted file mode 100644 index d8eafeb4f5..0000000000 --- a/community/examples/hpc-slurm-legacy.yaml +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-slurm-legacy - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-high-io - region: us-west4 - zone: us-west4-c - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, / - # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/pre-existing-vpc - - id: network1 - source: modules/network/pre-existing-vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: projectsfs - source: modules/file-system/filestore - use: [network1] - settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 - local_mount: /projects - - # This file system has an associated license cost. 
- # https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud - - id: scratchfs - source: community/modules/file-system/DDN-EXAScaler - use: [network1] - settings: - local_mount: /scratch - - - id: low_cost_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - - scratchfs - - projectsfs - settings: - partition_name: low_cost - max_node_count: 10 - enable_placement: false - exclusive: false - machine_type: n2-standard-4 - bandwidth_tier: gvnic_enabled - - # This compute_partition is far more performant than low_cost_partition. - - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - - scratchfs - - projectsfs - settings: - max_node_count: 200 - partition_name: compute - bandwidth_tier: gvnic_enabled - - - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - scratchfs - - projectsfs - - low_cost_partition # low cost partition will be default as it is listed first - - compute_partition - settings: - controller_machine_type: c2-standard-8 - suspend_time: 60 - - - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - homefs - - scratchfs - - projectsfs - - slurm_controller - settings: - login_machine_type: n2-standard-4 - - - id: hpc_dashboard - source: modules/monitoring/dashboard - outputs: [instructions] diff --git a/examples/README.md b/examples/README.md index a677b8ffb5..8c4365bde7 100644 --- a/examples/README.md +++ b/examples/README.md @@ -43,7 +43,6 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [hpc-slurm-chromedesktop.yaml](#hpc-slurm-chromedesktopyaml--) ![community-badge] ![experimental-badge] * [flux-cluster](#flux-clusteryaml--) ![community-badge] ![experimental-badge] * [tutorial-fluent.yaml](#tutorial-fluentyaml--) ![community-badge] ![experimental-badge] - * [hpc-slurm-legacy.yaml](#hpc-slurm-legacyyaml--) ![community-badge] ![deprecated-badge] * [hpc-slurm-legacy-sharedvpc.yaml](#hpc-slurm-legacy-sharedvpcyaml--) ![community-badge] ![deprecated-badge] * [Blueprint Schema](#blueprint-schema) * [Writing an HPC Blueprint](#writing-an-hpc-blueprint) @@ -1025,54 +1024,6 @@ See [README](../community/examples/flux-framework/README.md) [flux-cluster.yaml]: ../community/examples/flux-framework/flux-cluster.yaml -### [hpc-slurm-legacy.yaml] ![community-badge] ![deprecated-badge] - -Creates a Slurm cluster with tiered file systems for higher performance. It -connects to the default VPC of the project and creates two partitions and a -login node. - -File systems: - -* The homefs mounted at `/home` is a default "BASIC_HDD" tier filestore with - 1 TiB of capacity -* The projectsfs is mounted at `/projects` and is a high scale SSD filestore - instance with 10TiB of capacity. -* The scratchfs is mounted at `/scratch` and is a - [DDN Exascaler Lustre](../community/modules/file-system/DDN-EXAScaler/README.md) - file system designed for high IO performance. The capacity is ~10TiB. - -> **Warning**: The DDN Exascaler Lustre file system has a license cost as -> described in the pricing section of the -> [DDN EXAScaler Cloud Marketplace Solution](https://console.developers.google.com/marketplace/product/ddnstorage/). - -There are two partitions in this example: `low_cost` and `compute`. The -`low_cost` partition uses `n2-standard-4` VMs. 
This partition can be used for -debugging and workloads that do not require high performance. - -Similar to the small example, there is a -[compute partition](#compute-partition) that should be used for any performance -analysis. - -#### Quota Requirements for hpc-slurm-legacy.yaml - -For this example the following is needed in the selected region: - -* Cloud Filestore API: Basic HDD (Standard) capacity (GB) per region: **1,024 GB** -* Cloud Filestore API: High Scale SSD capacity (GB) per region: **10,240 GiB** - _min - quota request is 61,440 GiB_ -* Compute Engine API: Persistent Disk SSD (GB): **~14,050 GB** -* Compute Engine API: Persistent Disk Standard (GB): **~396 GB static + 20 - GB/node** up to 4596 GB -* Compute Engine API: N2 CPUs: **158** -* Compute Engine API: C2 CPUs: **8** for controller node and **60/node** active - in `compute` partition up to 12,008 -* Compute Engine API: Affinity Groups: **one for each job in parallel** - _only - needed for `compute` partition_ -* Compute Engine API: Resource policies: **one for each job in parallel** - - _only needed for `compute` partition_ - -[hpc-slurm-legacy.yaml]: ../community/examples/hpc-slurm-legacy.yaml - ### [hpc-slurm-legacy-sharedvpc.yaml] ![community-badge] ![deprecated-badge] This blueprint demonstrates the use of the Slurm and Filestore modules in diff --git a/tools/cloud-build/daily-tests/builds/hpc-high-io-v4.yaml b/tools/cloud-build/daily-tests/builds/hpc-high-io-v4.yaml deleted file mode 100644 index 6d53f54539..0000000000 --- a/tools/cloud-build/daily-tests/builds/hpc-high-io-v4.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -timeout: 14400s # 4hr -steps: -## Test simple golang build -- id: build_ghpc - waitFor: ["-"] - name: "golang:bullseye" - entrypoint: /bin/bash - args: - - -c - - | - cd /workspace - make -- id: fetch_builder - waitFor: ["-"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - -c - - echo "done fetching builder" -## Test Slurm High IO Example (Slurm on GCP v4) -- id: hpc-high-io-v4 - waitFor: ["fetch_builder", "build_ghpc"] - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/hpc-high-io.yml" diff --git a/tools/cloud-build/daily-tests/tests/hpc-high-io.yml b/tools/cloud-build/daily-tests/tests/hpc-high-io.yml deleted file mode 100644 index 5131d565b9..0000000000 --- a/tools/cloud-build/daily-tests/tests/hpc-high-io.yml +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -test_name: hpc-slurm-legacy -deployment_name: "hpc-high-io-{{ build }}" -zone: us-west4-c -workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm-legacy.yaml" -network: "default" -max_nodes: 5 -login_node: "slurm-{{ deployment_name }}-login0" -controller_node: "slurm-{{ deployment_name }}-controller" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -custom_vars: - partitions: - - compute - - low_cost - mounts: - - /home - - /scratch - - /projects From f332eb85745dccb78f27041ed8a8cd1bf1c6a0e5 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 9 Jan 2024 19:22:39 +0000 Subject: [PATCH 019/151] Remove pre existing fs example and references --- .../test_configs/pre-existing-fs.yaml | 63 ------------------- 1 file changed, 63 deletions(-) delete mode 100644 tools/validate_configs/test_configs/pre-existing-fs.yaml diff --git a/tools/validate_configs/test_configs/pre-existing-fs.yaml b/tools/validate_configs/test_configs/pre-existing-fs.yaml deleted file mode 100644 index 252ac2a207..0000000000 --- a/tools/validate_configs/test_configs/pre-existing-fs.yaml +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: pre-existing-fs - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: pre-fs-slurm - region: europe-west4 - zone: europe-west4-a - local_mount: /home - network_name: default - -deployment_groups: -- group: storage - modules: - - id: network0 - source: modules/network/pre-existing-vpc - - - id: homefs-filestore - source: modules/file-system/filestore - use: [network0] - -- group: compute - modules: - - id: network1 - source: modules/network/pre-existing-vpc - - - id: homefs - source: modules/file-system/pre-existing-network-storage - settings: - server_ip: "" # for now, must be completed manually in compute/main.tf - remote_mount: nfsshare - local_mount: $(vars.local_mount) # automatic, added here for clarity - fs_type: nfs - - - id: compute-partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - homefs - - network1 - settings: - partition_name: compute - - - id: slurm - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - homefs - - compute-partition - - network1 From 8b9c783cc2f6fb8ebeed637bb08bda96217607c6 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 9 Jan 2024 19:30:21 +0000 Subject: [PATCH 020/151] Remove slurm-two-partitions-workstation example and references --- .../slurm-two-partitions-workstation.yaml | 68 ------------------- 1 file changed, 68 deletions(-) delete mode 100644 tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml diff --git a/tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml b/tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml deleted file mode 100644 index 0dbb3ad57c..0000000000 --- a/tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -blueprint_name: hpc-slurm - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm - region: europe-west4 - zone: europe-west4-a - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: workstation - source: modules/compute/vm-instance - use: - - network1 - - homefs - settings: - name_prefix: workstation - machine_type: e2-standard-8 - - - id: compute-partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - homefs - - network1 - settings: - partition_name: compute - - - id: debug-partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - homefs - - network1 - settings: - partition_name: debug - - - id: slurm - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - homefs - - compute-partition - - debug-partition - - network1 From ae543b79e3bce09a8ecf5ae002dd3d70afb41e0e Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 9 Jan 2024 20:02:01 +0000 Subject: [PATCH 021/151] Remove use-resources example and references --- .../test_configs/use-resources.yaml | 79 ------------------- 1 file changed, 79 deletions(-) delete mode 100644 tools/validate_configs/test_configs/use-resources.yaml diff --git a/tools/validate_configs/test_configs/use-resources.yaml b/tools/validate_configs/test_configs/use-resources.yaml deleted file mode 100644 index d2e39eeeb2..0000000000 --- a/tools/validate_configs/test_configs/use-resources.yaml +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: use-modules - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm-use-modules - region: us-central1 - zone: us-central1-a - -deployment_groups: -- group: primary - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, - # / as a prefix. 
To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/pre-existing-vpc - - id: network1 - source: modules/network/pre-existing-vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - - id: projectsfs - source: community/modules/file-system/nfs-server - use: [network1] - - - id: scratchfs - source: community/modules/file-system/DDN-EXAScaler - settings: - local_mount: /scratch - network_self_link: $(network1.network_self_link) - subnetwork_self_link: $(network1.subnetwork_self_link) - subnetwork_address: $(network1.subnetwork_address) - - - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - homefs - - scratchfs - - network1 - settings: - max_node_count: 200 - partition_name: compute - - - id: slurm_controller - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - projectsfs - - compute_partition - - network1 - - - id: slurm_login - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - homefs - - scratchfs - - projectsfs - - slurm_controller - - network1 - settings: - login_machine_type: n2-standard-4 From 6a5008b56d41af2301ed838a7c2276558dfe082b Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 9 Jan 2024 20:16:45 +0000 Subject: [PATCH 022/151] Remove lustre-new-vpc example and references --- .../blueprints/lustre-with-new-vpc.yaml | 129 ------------------ .../daily-tests/builds/lustre-new-vpc.yaml | 54 -------- .../daily-tests/tests/lustre-new-vpc.yml | 34 ----- 3 files changed, 217 deletions(-) delete mode 100644 tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/lustre-new-vpc.yaml delete mode 100644 tools/cloud-build/daily-tests/tests/lustre-new-vpc.yml diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml deleted file mode 100644 index 5d5d39db71..0000000000 --- a/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: lustre-new-vpc - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: lustre-new-vpc - region: us-west4 - zone: us-west4-c - -deployment_groups: -- group: primary - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, / - # as a prefix. 
To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/pre-existing-vpc - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: scratchfs - source: community/modules/file-system/DDN-EXAScaler - use: [network1] - settings: - local_mount: /scratch - - # these runners are no longer necessary, but it is important that we test it still works - # even when added twice - - id: mount-exascaler - source: modules/scripts/startup-script - settings: - runners: - - $(scratchfs.install_ddn_lustre_client_runner) - - $(scratchfs.mount_runner) - - # Create a separate workstation to catch regressions in vm-instance - - id: workstation - source: ./modules/compute/vm-instance - use: - - network1 - - homefs - - mount-exascaler - settings: - name_prefix: test-workstation1 - add_deployment_name_before_prefix: true - machine_type: c2-standard-4 - - - id: wait0 - source: ./community/modules/scripts/wait-for-startup - settings: - instance_name: $(workstation.name[0]) - - # test installing lustre from pre-existing-network-storage - - id: pre-fs - source: modules/file-system/pre-existing-network-storage - settings: - server_ip: $(scratchfs.network_storage.server_ip) - remote_mount: $(scratchfs.network_storage.remote_mount) - local_mount: $(scratchfs.network_storage.local_mount) - fs_type: $(scratchfs.network_storage.fs_type) - - - id: mount-exascaler-from-pre-existing - source: modules/scripts/startup-script - settings: - runners: - - $(pre-fs.client_install_runner) - - $(pre-fs.mount_runner) - - - id: install-luster-from-pre-existing - source: modules/compute/vm-instance - use: - - network1 - - mount-exascaler-from-pre-existing - settings: - name_prefix: test-workstation2 - add_deployment_name_before_prefix: false - machine_type: n2-standard-4 - - - id: wait1 - source: ./community/modules/scripts/wait-for-startup - settings: - instance_name: $(install-luster-from-pre-existing.name[0]) - - - id: compute_partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - - scratchfs - settings: - max_node_count: 2 - partition_name: compute - - - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - scratchfs - - compute_partition - - - id: slurm_login - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - homefs - - scratchfs - - slurm_controller - settings: - login_machine_type: n2-standard-4 diff --git a/tools/cloud-build/daily-tests/builds/lustre-new-vpc.yaml b/tools/cloud-build/daily-tests/builds/lustre-new-vpc.yaml deleted file mode 100644 index 6d73ff0da8..0000000000 --- a/tools/cloud-build/daily-tests/builds/lustre-new-vpc.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -timeout: 14400s # 4hr -steps: -## Test simple golang build -- id: build_ghpc - waitFor: ["-"] - name: "golang:bullseye" - entrypoint: /bin/bash - args: - - -c - - | - cd /workspace - make -- id: fetch_builder - waitFor: ["-"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - -c - - echo "done fetching builder" - -## Test DDN Lustre with new VPC -- id: lustre-new-vpc - waitFor: ["fetch_builder", "build_ghpc"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" --extra-vars="@tools/cloud-build/daily-tests/tests/lustre-new-vpc.yml" diff --git a/tools/cloud-build/daily-tests/tests/lustre-new-vpc.yml b/tools/cloud-build/daily-tests/tests/lustre-new-vpc.yml deleted file mode 100644 index 8daf8bace0..0000000000 --- a/tools/cloud-build/daily-tests/tests/lustre-new-vpc.yml +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -test_name: lustre-new-vpc -deployment_name: "lustre-new-vpc-{{ build }}" -zone: us-west4-c -workspace: /workspace -blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/lustre-with-new-vpc.yaml" -network: "{{deployment_name}}-net" -max_nodes: 5 -login_node: "slurm-{{ deployment_name }}-login0" -controller_node: "slurm-{{ deployment_name }}-controller" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -custom_vars: - partitions: - - compute - mounts: - - /home - - /scratch From 3d92fc45392dd8f0ca32d8c660ec6461ced1358b Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 9 Jan 2024 20:34:06 +0000 Subject: [PATCH 023/151] Remove test-gcs-fuse example and references --- .../test_configs/test-gcs-fuse.yaml | 146 ------------------ 1 file changed, 146 deletions(-) delete mode 100644 tools/validate_configs/test_configs/test-gcs-fuse.yaml diff --git a/tools/validate_configs/test_configs/test-gcs-fuse.yaml b/tools/validate_configs/test_configs/test-gcs-fuse.yaml deleted file mode 100644 index 40d1228d1e..0000000000 --- a/tools/validate_configs/test_configs/test-gcs-fuse.yaml +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: gcs-fuse - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: gcs-fuse - region: us-central1 - zone: us-central1-c - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: gcs - source: ./modules/file-system/pre-existing-network-storage - settings: - remote_mount: hpc-toolkit-service-catalog-solutions - local_mount: /catalog - fs_type: gcsfuse - - - id: gcs2 - source: ./modules/file-system/pre-existing-network-storage - settings: - server_ip: foobar - remote_mount: gs://hpc-toolkit-demo-tf-state - local_mount: /tfstate - fs_type: gcsfuse - mount_options: implicit_dirs,defaults,allow_other - -# find images with: gcloud compute images list - - id: compute-hpc-image - source: ./modules/compute/vm-instance - use: [network1, gcs2, gcs] - settings: - machine_type: n2-standard-2 - name_prefix: hpc-image - - - id: compute-ubuntu2204 - source: ./modules/compute/vm-instance - use: [network1, gcs2, gcs] - settings: - machine_type: n2-standard-2 - name_prefix: ubuntu2204 - instance_image: - family: ubuntu-2204-lts - project: ubuntu-os-cloud - - - id: compute-ubuntu2004 - source: ./modules/compute/vm-instance - use: [network1, gcs2, gcs] - settings: - machine_type: n2-standard-2 - name_prefix: ubuntu2004 - instance_image: - family: ubuntu-2004-lts - project: ubuntu-os-cloud - - - id: compute-debian - source: ./modules/compute/vm-instance - use: [network1, gcs2, gcs] - settings: - machine_type: n2-standard-2 - name_prefix: debian11 - instance_image: - family: debian-11 - project: debian-cloud - - - id: centos08 - source: ./modules/compute/vm-instance - use: [network1, gcs2, gcs] - settings: - machine_type: n2-standard-2 - name_prefix: centos08 - instance_image: - family: centos-stream-9 - project: centos-cloud - - - id: centos07 - source: ./modules/compute/vm-instance - use: [network1, gcs2, gcs] - settings: - machine_type: n2-standard-2 - name_prefix: centos07 - instance_image: - family: centos-7 - project: centos-cloud - - - id: rocky - source: ./modules/compute/vm-instance - use: [network1, gcs2, gcs] - settings: - machine_type: n2-standard-2 - name_prefix: rocky-linux-8 - instance_image: - family: rocky-linux-8 - project: rocky-linux-cloud - - - id: compute-partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - gcs - - gcs2 - - network1 - settings: - partition_name: compute - machine_type: n2-standard-4 - - - id: slurm-controller - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - gcs - - gcs2 - - compute-partition - - network1 - settings: - login_node_count: 1 - compute_node_scopes: - - https://www.googleapis.com/auth/cloud-platform - - https://www.googleapis.com/auth/devstorage.read_only - disable_compute_public_ips: false - - - id: slurm-login - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - slurm-controller - - network1 From 
b848d0d64f3fde75a463ef50c8fe8b03c5dcab0e Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 9 Jan 2024 20:39:31 +0000 Subject: [PATCH 024/151] Remove hpc-cluster-service-acct example and references --- .../hpc-cluster-service-acct.yaml | 63 ------------------- 1 file changed, 63 deletions(-) delete mode 100644 tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml diff --git a/tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml b/tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml deleted file mode 100644 index f849d01ef9..0000000000 --- a/tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-slurm-sa - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm - region: europe-west4 - zone: europe-west4-a - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/pre-existing-network-storage - settings: - server_ip: '$controller' - remote_mount: /home - local_mount: /home - fs_type: nfs - - - id: service_acct - source: ./community/modules/project/service-account - settings: - project_id: $(vars.project_id) - name: hpc-service-acct - project_roles: - - compute.instanceAdmin.v1 - - - id: compute-partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: [network1] - settings: - partition_name: compute - network_storage: - - $(homefs.network_storage) - - - id: slurm - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: [network1] - settings: - network_storage: - - $(homefs.network_storage) - partition: - - $(compute-partition.partition) - controller_service_account: $(service_acct.service_account_email) From 6debc3ac0c9e7bdcde944c330e07b7ccab545e94 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 9 Jan 2024 20:46:46 +0000 Subject: [PATCH 025/151] Remove hpc-cluste-slurm-with-startup example and references --- .../hpc-cluster-slurm-with-startup.yaml | 75 ------------------- 1 file changed, 75 deletions(-) delete mode 100644 tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml diff --git a/tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml b/tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml deleted file mode 100644 index 3da2f9fc07..0000000000 --- a/tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-cluster-small - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm-small - region: europe-west4 - zone: europe-west4-a - -deployment_groups: -- group: primary - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, / - # as a prefix. To refer to a local or community module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: startup - source: modules/scripts/startup-script - settings: - install_ansible: true - - - id: compute_partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - settings: - enable_placement: false - machine_type: n2-standard-4 - cpu_platform: Intel Ice Lake - partition_name: compute - max_node_count: 20 - - - id: slurm_controller - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - compute_partition - settings: - login_node_count: 1 - controller_startup_script: $(startup.startup_script) - compute_startup_script: $(startup.startup_script) - - - id: slurm_login - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - homefs - - slurm_controller - settings: - login_startup_script: $(startup.startup_script) From 52faa796a430eee95e6b57c58891fbdecb01f251 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 9 Jan 2024 20:51:35 +0000 Subject: [PATCH 026/151] Remove hpc-cluster-project exampke and references --- .../test_configs/hpc-cluster-project.yaml | 89 ------------------- 1 file changed, 89 deletions(-) delete mode 100644 tools/validate_configs/test_configs/hpc-cluster-project.yaml diff --git a/tools/validate_configs/test_configs/hpc-cluster-project.yaml b/tools/validate_configs/test_configs/hpc-cluster-project.yaml deleted file mode 100644 index b900649f2a..0000000000 --- a/tools/validate_configs/test_configs/hpc-cluster-project.yaml +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -blueprint_name: hpc-cluster-project - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm-project - region: europe-west4 - zone: europe-west4-a - -terraform_backend_defaults: - type: gcs - configuration: - bucket: a_bucket - impersonate_service_account: a_bucket_reader@project.iam.gserviceaccount.com - -deployment_groups: -- group: onboarding - modules: - - id: project - source: ./community/modules/project/new-project - settings: - project_id: $(vars.project_id) - folder_id: 334688113020 # random number - billing_account: "111110-M2N704-854685" # random billing number - org_id: 123456789 # random org id - - - id: enable-apis - source: ./community/modules/project/service-enablement - use: [project] - settings: - gcp_service_list: - - "file.googleapis.com" - - "compute.googleapis.com" - -- group: primary - modules: - # Source is an embedded module, denoted by "modules/*" without ./, ../, / - # as a prefix. To refer to a local module, prefix with ./, ../ or / - # Example - ./modules/network/vpc - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: compute_partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - settings: - partition_name: compute - machine_type: n1-standard-2 - enable_placement: false - max_node_count: 20 - - - id: slurm_controller - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - compute_partition - settings: - login_node_count: 1 - - - id: slurm_login - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - homefs - - slurm_controller From 798c7ed303863d34546cd525f0c8c3361f590c72 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 9 Jan 2024 20:59:17 +0000 Subject: [PATCH 027/151] Remove hpc-cluster-high-io-remote-state example and references --- tools/validate_configs/test_configs/README.md | 3 - .../hpc-cluster-high-io-remote-state.yaml | 84 ------------------- 2 files changed, 87 deletions(-) delete mode 100644 tools/validate_configs/test_configs/hpc-cluster-high-io-remote-state.yaml diff --git a/tools/validate_configs/test_configs/README.md b/tools/validate_configs/test_configs/README.md index 5849fe3c97..cabaa06f23 100644 --- a/tools/validate_configs/test_configs/README.md +++ b/tools/validate_configs/test_configs/README.md @@ -21,9 +21,6 @@ supplied guest accelerators are adding to the VM instances. filestore as a /home directory and a network. This has been used as a demo blueprint when presenting the toolkit. -**hpc-cluster-high-io-remote-state.yaml**: Creates a cluster with high -performance IO system with all Terraform state stored remotely. - **hpc-cluster-2filestore-4s_instance.yaml**: A slightly more complicated HPC cluster that includes 2 filestore (/home and /shared), two license servers, a head-node and 2 compute vms diff --git a/tools/validate_configs/test_configs/hpc-cluster-high-io-remote-state.yaml b/tools/validate_configs/test_configs/hpc-cluster-high-io-remote-state.yaml deleted file mode 100644 index 1b28a603a1..0000000000 --- a/tools/validate_configs/test_configs/hpc-cluster-high-io-remote-state.yaml +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-cluster-high-io-remote-state - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm-io - region: us-central1 - zone: us-central1-a - -terraform_backend_defaults: - type: gcs - configuration: - bucket: a_bucket - impersonate_service_account: a_bucket_reader@project.iam.gserviceaccount.com - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/pre-existing-vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: projectsfs - source: modules/file-system/filestore - use: [network1] - settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 - local_mount: /projects - - - id: scratchfs - source: community/modules/file-system/DDN-EXAScaler - use: [network1] - settings: - local_mount: /scratch - - - id: compute_partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - homefs - - scratchfs - - projectsfs - - network1 - settings: - max_node_count: 200 - partition_name: compute - - - id: slurm_controller - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - homefs - - scratchfs - - projectsfs - - compute_partition - - network1 - - - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - homefs - - scratchfs - - projectsfs - - slurm_controller - - network1 From 968d0b3306141191127e7962b525f5abf686bed2 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 9 Jan 2024 20:26:11 +0000 Subject: [PATCH 028/151] Update test outputs example and remove slurm partition and controller --- .../test_configs/test_outputs.yaml | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/tools/validate_configs/test_configs/test_outputs.yaml b/tools/validate_configs/test_configs/test_outputs.yaml index 3e7832efa0..5de0b7bc21 100644 --- a/tools/validate_configs/test_configs/test_outputs.yaml +++ b/tools/validate_configs/test_configs/test_outputs.yaml @@ -130,14 +130,6 @@ deployment_groups: outputs: - startup_script - - id: partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - use: [vpc] - outputs: - - partition - settings: - partition_name: compute - - id: lustre source: ./community/modules/file-system/DDN-EXAScaler outputs: @@ -146,11 +138,3 @@ deployment_groups: - mount_command - http_console - network_storage - - - id: controller - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - partition - - vpc - outputs: - - controller_name From 5f3b59e6e2688c8089c5f8c8aef70a021fcdf1d8 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 9 Jan 2024 15:28:26 -0800 Subject: [PATCH 029/151] Slurm6. Support `additional_networks`,`reservation_name` & `access_config` (#2062) * Add support for `additional_networks` & `reservation_name`; * Nodeset. 
Pass `access_config`, do not use `enable_public_ips` --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 3 ++ .../schedmd-slurm-gcp-v6-nodeset/main.tf | 7 +++- .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 42 +++++++++++++++++++ .../schedmd-slurm-gcp-v6-partition/README.md | 2 +- .../variables.tf | 19 ++++++++- .../schedmd-slurm-gcp-v6-controller/README.md | 28 ++++++------- .../controller.tf | 8 ++-- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 +- .../partition.tf | 12 +++--- .../slurm_files.tf | 2 +- .../variables.tf | 35 ++++++++++++---- 11 files changed, 124 insertions(+), 38 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index c754ba63ab..3ba74f0637 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -153,7 +153,9 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | | [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. (do not use "disk\_type: local-ssd"; known issue being addressed) |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | If set to false. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | @@ -182,6 +184,7 @@ No modules. | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy.

Note: Placement groups are not supported when on\_host\_maintenance is set to
"MIGRATE" and will be deactivated regardless of the value of
enable\_placement. To support enable\_placement, ensure on\_host\_maintenance is
set to "TERMINATE". | `string` | `"TERMINATE"` | no | | [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | | [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | +| [reservation\_name](#input\_reservation\_name) | Sets reservation affinity for instances created from this nodeset. | `string` | `null` | no | | [service\_account](#input\_service\_account) | Service account to attach to the compute instances. If not set, the
default compute service account for the given project will be used with the
"https://www.googleapis.com/auth/cloud-platform" scope. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
- enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
- enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
- enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | | [spot\_instance\_config](#input\_spot\_instance\_config) | Configuration for spot VMs. |
object({
termination_action = string
})
| `null` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 01eb6cb240..810795864e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -32,6 +32,9 @@ locals { } ] + public_access_config = var.disable_public_ips ? [] : [{ nat_ip = null, network_tier = null }] + access_config = length(var.access_config) == 0 ? local.public_access_config : var.access_config + nodeset = { node_count_static = var.node_count_static node_count_dynamic_max = var.node_count_dynamic_max @@ -50,7 +53,6 @@ locals { enable_confidential_vm = var.enable_confidential_vm enable_placement = var.enable_placement - enable_public_ip = !var.disable_public_ips enable_oslogin = var.enable_oslogin enable_shielded_vm = var.enable_shielded_vm gpu = one(local.guest_accelerator) @@ -70,9 +72,12 @@ locals { source_image_project = local.source_image_project_normalized # requires source_image_logic.tf source_image = local.source_image # requires source_image_logic.tf subnetwork_self_link = var.subnetwork_self_link + additional_networks = var.additional_networks + access_config = local.access_config tags = var.tags spot = var.enable_spot_vm termination_action = try(var.spot_instance_config.termination_action, null) + reservation_name = var.reservation_name zones = toset(concat([var.zone], tolist(var.zones))) zone_target_shape = var.zone_target_shape diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 7d7d964840..a947c6a441 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -373,3 +373,45 @@ variable "subnetwork_self_link" { type = string description = "Subnet to deploy to." } + +variable "additional_networks" { + description = "Additional network interface details for GCE, if any." + default = [] + type = list(object({ + network = string + subnetwork = string + subnetwork_project = string + network_ip = string + nic_type = string + stack_type = string + queue_count = number + access_config = list(object({ + nat_ip = string + network_tier = string + })) + ipv6_access_config = list(object({ + network_tier = string + })) + alias_ip_range = list(object({ + ip_cidr_range = string + subnetwork_range_name = string + })) + })) +} + +variable "access_config" { + description = "Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet." + type = list(object({ + nat_ip = string + network_tier = string + })) + default = [] +} + +variable "reservation_name" { + description = <<-EOD + Sets reservation affinity for instances created from this nodeset. + EOD + type = string + default = null +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md index 7b68a670fd..523bf0d997 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md @@ -85,7 +85,7 @@ No resources. | [exclusive](#input\_exclusive) | Exclusive job access to nodes. | `bool` | `true` | no | | [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.
If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on the partition compute nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
subnetwork_self_link = string
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
reservation_name = optional(string)
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, true)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
}))
| `[]` | no | | [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no |
| [partition\_name](#input\_partition\_name) | The name of the slurm partition. | `string` | n/a | yes |

diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf
index 1cc821a878..2190907599 100644
--- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf
@@ -85,7 +85,6 @@ variable "nodeset" {
     disk_type              = optional(string)
     enable_confidential_vm = optional(bool, false)
     enable_placement       = optional(bool, false)
-    enable_public_ip       = optional(bool, false)
     enable_oslogin         = optional(bool, true)
     enable_shielded_vm     = optional(bool, false)
     gpu = optional(object({
@@ -113,12 +112,30 @@ variable "nodeset" {
     source_image_family  = optional(string)
     source_image_project = optional(string)
     source_image         = optional(string)
+    additional_networks = optional(list(object({
+      network            = string
+      subnetwork         = string
+      subnetwork_project = string
+      network_ip         = string
+      access_config = list(object({
+        nat_ip       = string
+        network_tier = string
+      }))
+      ipv6_access_config = list(object({
+        network_tier = string
+      }))
+    })))
+    access_config = optional(list(object({
+      nat_ip       = string
+      network_tier = string
+    })))
     subnetwork_self_link = string
     spot                 = optional(bool, false)
     tags                 = optional(list(string), [])
     termination_action   = optional(string)
     zones                = optional(list(string), [])
     zone_target_shape    = optional(string, "ANY_SINGLE_ZONE")
+    reservation_name     = optional(string)
   }))
 
   default = []

diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md
index 0249667267..05d8feee3f 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md
@@ -125,17 +125,17 @@ limitations under the License.
| Name | Source | Version |
|------|--------|---------|
| [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 3.0 |
-| [cleanup\_compute\_nodes](#module\_cleanup\_compute\_nodes) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes | 6.2.0 |
-| [cleanup\_resource\_policies](#module\_cleanup\_resource\_policies) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies | 6.2.0 |
-| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.2.0 |
-| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.2.0 |
-| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.2.0 |
-| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 6.2.0 |
-| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.2.0 |
-| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.2.0 |
-| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.2.0 |
-| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.2.0 |
-| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 6.2.0 |
+| [cleanup\_compute\_nodes](#module\_cleanup\_compute\_nodes) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes | 6.2.1 |
+| [cleanup\_resource\_policies](#module\_cleanup\_resource\_policies) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies | 6.2.1 |
+| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.2.1 |
+| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.2.1 |
+| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.2.1 |
+| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 6.2.1 |
+| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.2.1 |
+| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.2.1 |
+| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.2.1 |
+| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.2.1 |
+| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 6.2.1 |

## Resources

@@ -177,8 +177,8 @@ limitations under the License.
| [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into BigQuery.

NOTE: Requires the Google BigQuery API. | `bool` | `false` | no |
| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when the cluster is destroyed.

NOTE: Requires Python and script dependencies.
*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no |
| [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support this option. | `bool` | `false` | no |
-| [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. Not for production use. | `bool` | `false` | no |
-| [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no |
+| [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. | `bool` | `false` | no |
+| [enable\_devel](#input\_enable\_devel) | Enables development mode. | `bool` | `true` | no |
| [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no |
| [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support this option. | `bool` | `false` | no |
| [enable\_slurm\_gcp\_plugins](#input\_enable\_slurm\_gcp\_plugins) | Enables calling hooks in scripts/slurm\_gcp\_plugins during cluster resume and suspend. | `any` | `false` | no |
@@ -197,7 +197,7 @@ limitations under the License.
| [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no |
| [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no |
| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string) # TODO: is this used? If not, remove it.
mount_runner = map(string)
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
reservation_name = optional(string)
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, true)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | | [partitions](#input\_partitions) | Cluster partitions as a list. See module slurm\_partition. |
list(object({
default = optional(bool, false)
enable_job_exclusive = optional(bool, false)
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
})), [])
partition_conf = optional(map(string), {})
partition_name = string
partition_nodeset = optional(list(string), [])
partition_nodeset_dyn = optional(list(string), [])
partition_nodeset_tpu = optional(list(string), [])
resume_timeout = optional(number)
suspend_time = optional(number, 300)
suspend_timeout = optional(number)
}))
| n/a | yes | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 6802694342..a31c765135 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -35,7 +35,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.2.1" count = local.have_template ? 0 : 1 project_id = var.project_id @@ -92,7 +92,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.2.1" access_config = !var.disable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false @@ -149,7 +149,7 @@ resource "google_secret_manager_secret_iam_member" "cloudsql_secret_accessor" { # Destroy all compute nodes on `terraform destroy` module "cleanup_compute_nodes" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes?ref=6.2.1" count = var.enable_cleanup_compute ? 1 : 0 slurm_cluster_name = local.slurm_cluster_name @@ -160,7 +160,7 @@ module "cleanup_compute_nodes" { # Destroy all resource policies on `terraform destroy` module "cleanup_resource_policies" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies?ref=6.2.1" count = var.enable_cleanup_compute ? 
1 : 0 slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 839a6c238c..09d1a0a5d2 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.2.1" for_each = { for x in var.login_nodes : x.name_prefix => x @@ -59,7 +59,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=6.2.1" for_each = { for x in var.login_nodes : x.name_prefix => x } project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 7d53d50327..b2e030cf59 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -21,7 +21,7 @@ locals { # NODESET module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.2.1" for_each = local.nodeset_map project_id = var.project_id @@ -54,17 +54,18 @@ module "slurm_nodeset_template" { source_image_project = each.value.source_image_project source_image = each.value.source_image subnetwork = each.value.subnetwork_self_link + additional_networks = each.value.additional_networks + access_config = each.value.access_config tags = concat([local.slurm_cluster_name], each.value.tags) } module "slurm_nodeset" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.2.1" for_each = local.nodeset_map instance_template_self_link = module.slurm_nodeset_template[each.key].self_link enable_placement = each.value.enable_placement - enable_public_ip = each.value.enable_public_ip network_tier = each.value.network_tier node_count_dynamic_max = each.value.node_count_dynamic_max node_count_static = each.value.node_count_static @@ -73,11 +74,12 @@ module "slurm_nodeset" { subnetwork_self_link = each.value.subnetwork_self_link zones = each.value.zones zone_target_shape = each.value.zone_target_shape + reservation_name = each.value.reservation_name } # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.2.1" for_each = local.nodeset_tpu_map project_id = var.project_id @@ -99,7 +101,7 @@ module "slurm_nodeset_tpu" { # PARTITION module "slurm_partition" { - source = 
"github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=6.2.1" for_each = local.partition_map partition_nodeset = [for x in each.value.partition_nodeset : module.slurm_nodeset[x].nodeset_name if try(module.slurm_nodeset[x], null) != null] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index 3593858220..f6a264198b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -87,7 +87,7 @@ locals { } module "slurm_files" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.2.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.2.1" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 38c9f03546..80e0c996b9 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -184,7 +184,6 @@ variable "nodeset" { disk_type = optional(string) enable_confidential_vm = optional(bool, false) enable_placement = optional(bool, false) - enable_public_ip = optional(bool, false) enable_oslogin = optional(bool, true) enable_shielded_vm = optional(bool, false) gpu = optional(object({ @@ -213,11 +212,29 @@ variable "nodeset" { source_image_project = optional(string) source_image = optional(string) subnetwork_self_link = string - spot = optional(bool, false) - tags = optional(list(string), []) - termination_action = optional(string) - zones = optional(list(string), []) - zone_target_shape = optional(string, "ANY_SINGLE_ZONE") + additional_networks = optional(list(object({ + network = string + subnetwork = string + subnetwork_project = string + network_ip = string + access_config = list(object({ + nat_ip = string + network_tier = string + })) + ipv6_access_config = list(object({ + network_tier = string + })) + }))) + access_config = optional(list(object({ + nat_ip = string + network_tier = string + }))) + spot = optional(bool, false) + tags = optional(list(string), []) + termination_action = optional(string) + zones = optional(list(string), []) + zone_target_shape = optional(string, "ANY_SINGLE_ZONE") + reservation_name = optional(string) })) default = [] @@ -303,13 +320,13 @@ EOD variable "enable_devel" { type = bool - description = "Enables development mode. Not for production use." - default = false + description = "Enables development mode." + default = true } variable "enable_debug_logging" { type = bool - description = "Enables debug logging mode. Not for production use." + description = "Enables debug logging mode." 
default = false } From e332bf2bcd7ee2dac0376af0f1e1d6b3c8c9db93 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Tue, 9 Jan 2024 18:10:36 -0800 Subject: [PATCH 030/151] Add zone finding for cpu partitions in hpc-enterprise-slurm test --- tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml b/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml index 9b2b39d64a..2d1596cb0c 100644 --- a/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml @@ -22,6 +22,7 @@ zone: europe-west1-d cli_deployment_vars: region: europe-west1 zone: "{{ zone }}" + zones: "[europe-west1-b,europe-west1-c,europe-west1-d]" workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/hpc-enterprise-slurm.yaml" network: "default" From d807c50ea8e9bb968fd03427d2a028d817b90efb Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Tue, 9 Jan 2024 18:25:38 -0800 Subject: [PATCH 031/151] Rename GKE subnet with build id to avoid conflicts --- tools/cloud-build/daily-tests/builds/gke-storage.yaml | 4 ++-- tools/cloud-build/daily-tests/builds/gke.yaml | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/cloud-build/daily-tests/builds/gke-storage.yaml b/tools/cloud-build/daily-tests/builds/gke-storage.yaml index 16809991aa..a861704b3d 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage.yaml @@ -59,8 +59,8 @@ steps: echo ' machine_type: e2-standard-2' >> $${SG_EXAMPLE} echo ' zone: us-central1-a' >> $${SG_EXAMPLE} - # avoids conflict if both gke tests are run at the same time - sed -i "s/gke-subnet/gke-storage-subnet/" $${SG_EXAMPLE} + # avoids conflict with other tests + sed -i "s/gke-subnet/gke-subnet-$${BUILD_ID_SHORT}/" $${SG_EXAMPLE} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/cloud-build/daily-tests/builds/gke.yaml b/tools/cloud-build/daily-tests/builds/gke.yaml index 5826223fb0..5281446e6a 100644 --- a/tools/cloud-build/daily-tests/builds/gke.yaml +++ b/tools/cloud-build/daily-tests/builds/gke.yaml @@ -64,6 +64,9 @@ steps: echo ' use: [gke_cluster]' >> $${SG_EXAMPLE} echo ' settings: {name: ubuntu, image_type: UBUNTU_CONTAINERD}' >> $${SG_EXAMPLE} + # avoids conflict with other tests + sed -i "s/gke-subnet/gke-subnet-$${BUILD_ID_SHORT}/" $${SG_EXAMPLE} + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ --extra-vars="@tools/cloud-build/daily-tests/tests/gke.yml" From 9522d15734580c7c0eb8732691a8c3221418e046 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Tue, 9 Jan 2024 20:51:52 -0800 Subject: [PATCH 032/151] Fix slurm v6 links in example README --- examples/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index a677b8ffb5..b0356df70b 100644 --- a/examples/README.md +++ b/examples/README.md @@ -13,8 +13,8 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [Blueprint Descriptions](#blueprint-descriptions) * [hpc-slurm.yaml](#hpc-slurmyaml-) ![core-badge] * [hpc-enterprise-slurm.yaml](#hpc-enterprise-slurmyaml-) ![core-badge] - * 
[hpc-slurm6.yaml](#hpc-slurm6yaml-) ![community-badge] ![experimental-badge] - * [hpc-slurm6-tpu.yaml](#hpc-slurm6-tpuyaml-) ![community-badge] ![experimental-badge] + * [hpc-slurm6.yaml](#hpc-slurm6yaml--) ![community-badge] ![experimental-badge] + * [hpc-slurm6-tpu.yaml](#hpc-slurm6-tpuyaml--) ![community-badge] ![experimental-badge] * [ml-slurm.yaml](#ml-slurmyaml-) ![core-badge] * [image-builder.yaml](#image-builderyaml-) ![core-badge] * [serverless-batch.yaml](#serverless-batchyaml-) ![core-badge] From 6df534e3430747546cc7edef508b1193a4a2a533 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Tue, 9 Jan 2024 23:56:03 -0800 Subject: [PATCH 033/151] Add startup script option to install stackdriver agent --- modules/scripts/startup-script/README.md | 16 ++++++- ...s_agent.sh => install_monitoring_agent.sh} | 44 +++++++++++++++---- modules/scripts/startup-script/main.tf | 24 +++++++--- modules/scripts/startup-script/variables.tf | 8 +++- 4 files changed, 75 insertions(+), 17 deletions(-) rename modules/scripts/startup-script/files/{install_cloud_ops_agent.sh => install_monitoring_agent.sh} (65%) diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index dbad630070..c27bba68ed 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -130,6 +130,17 @@ To view outputs from a Linux startup script, run: sudo journalctl -u google-startup-scripts.service ``` +### Monitoring Agent Installation + +This `startup-script` module has several options for installing a Google +monitoring agent. There are two relevant settings: `install_stackdriver_agent` +and `install_cloud_ops_agent`. + +The _Stackdriver Agent_ also called the _Legacy Cloud Monitoring Agent_ provides +better performance under some HPC workloads. While official documentation +recommends using the _Cloud Ops Agent_, it is recommended to use +`install_stackdriver_agent` when performance is important. + ### Example ```yaml @@ -171,7 +182,7 @@ they are able to do so by using the `gcs_bucket_path` as shown in the below exam source: ./modules/scripts/startup-script settings: gcs_bucket_path: gs://user-test-bucket/folder1/folder2 - install_cloud_ops_agent: true + install_stackdriver_agent: true - id: compute-cluster source: ./modules/compute/vm-instance @@ -238,7 +249,8 @@ No modules. | [gcs\_bucket\_path](#input\_gcs\_bucket\_path) | The GCS path for storage bucket and the object, starting with `gs://`. | `string` | `null` | no | | [http\_proxy](#input\_http\_proxy) | Web (http and https) proxy configuration for pip, apt, and yum/dnf | `string` | `""` | no | | [install\_ansible](#input\_install\_ansible) | Run Ansible installation script if either set to true or unset and runner of type 'ansible-local' are used. | `bool` | `null` | no | -| [install\_cloud\_ops\_agent](#input\_install\_cloud\_ops\_agent) | Run Google Ops Agent installation script if set to true. | `bool` | `false` | no | +| [install\_cloud\_ops\_agent](#input\_install\_cloud\_ops\_agent) | Warning: Consider using `install_stackdriver_agent` for better performance. Run Google Ops Agent installation script if set to true. | `bool` | `false` | no | +| [install\_stackdriver\_agent](#input\_install\_stackdriver\_agent) | Run Google Stackdriver Agent installation script if set to true. Preferred over ops agent for performance. | `bool` | `false` | no | | [labels](#input\_labels) | Labels for the created GCS bucket. Key-value pairs. 
| `map(string)` | n/a | yes | | [prepend\_ansible\_installer](#input\_prepend\_ansible\_installer) | DEPRECATED. Use `install_ansible=false` to prevent ansible installation. | `bool` | `null` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created | `string` | n/a | yes | diff --git a/modules/scripts/startup-script/files/install_cloud_ops_agent.sh b/modules/scripts/startup-script/files/install_monitoring_agent.sh similarity index 65% rename from modules/scripts/startup-script/files/install_cloud_ops_agent.sh rename to modules/scripts/startup-script/files/install_monitoring_agent.sh index fbafe17110..2a48f310f5 100644 --- a/modules/scripts/startup-script/files/install_cloud_ops_agent.sh +++ b/modules/scripts/startup-script/files/install_monitoring_agent.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2022 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,10 +14,15 @@ # limitations under the License. LEGACY_MONITORING_PACKAGE='stackdriver-agent' +LEGACY_MONITORING_SCRIPT_URL='https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh' LEGACY_LOGGING_PACKAGE='google-fluentd' +LEGACY_LOGGING_SCRIPT_URL='https://dl.google.com/cloudagents/add-logging-agent-repo.sh' + OPSAGENT_PACKAGE='google-cloud-ops-agent' OPSAGENT_SCRIPT_URL='https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh' +install_legacy="${1:-true}" + fail() { echo >&2 "[$(date +'%Y-%m-%dT%H:%M:%S%z')] $*" exit 1 @@ -43,19 +48,29 @@ handle_debian() { grep "${OPSAGENT_PACKAGE} is installed" } - install_opsagent() { + install_with_retry() { MAX_RETRY=50 RETRY=0 - until [ ${RETRY} -eq ${MAX_RETRY} ] || curl -s "${OPSAGENT_SCRIPT_URL}" | bash -s -- --also-install; do + until [ ${RETRY} -eq ${MAX_RETRY} ] || curl -s "${1}" | bash -s -- --also-install; do RETRY=$((RETRY + 1)) - echo "WARNING: Cloud ops installation failed on try ${RETRY} of ${MAX_RETRY}" + echo "WARNING: Installation of ${1} failed on try ${RETRY} of ${MAX_RETRY}" sleep 5 done if [ $RETRY -eq $MAX_RETRY ]; then - echo "ERROR: Cloud ops installation was not successful after ${MAX_RETRY} attempts." + echo "ERROR: Installation of ${1} was not successful after ${MAX_RETRY} attempts." exit 1 fi } + + install_opsagent() { + install_with_retry "${OPSAGENT_SCRIPT_URL}" + } + + install_stackdriver_agent() { + install_with_retry "${LEGACY_MONITORING_SCRIPT_URL}" + install_with_retry "${LEGACY_LOGGING_SCRIPT_URL}" + service stackdriver-agent start + } } handle_redhat() { @@ -79,7 +94,13 @@ handle_redhat() { } install_opsagent() { - curl -s https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh | bash -s -- --also-install + curl -s "${OPSAGENT_SCRIPT_URL}" | bash -s -- --also-install + } + + install_stackdriver_agent() { + curl -sS "${LEGACY_MONITORING_SCRIPT_URL}" | bash -s -- --also-install + curl -sS "${LEGACY_LOGGING_SCRIPT_URL}" | bash -s -- --also-install + service stackdriver-agent start } } @@ -93,10 +114,17 @@ main() { fi if is_legacy_installed || is_opsagent_installed; then - fail "Legacy or Ops Agent is already installed." + fail "Legacy (stackdriver) or Ops Agent is already installed." fi - install_opsagent + if [[ "${install_legacy}" == true ]]; then + echo "Installing legacy monitoring agent (stackdriver)" + install_stackdriver_agent + else + echo "Installing cloud ops agent" + echo "WARNING: cloud ops agent may have a performance impact. 
Consider using legacy monitoring agent (stackdriver)." + install_opsagent + fi } main diff --git a/modules/scripts/startup-script/main.tf b/modules/scripts/startup-script/main.tf index 152d63ed67..dc6b07af2a 100644 --- a/modules/scripts/startup-script/main.tf +++ b/modules/scripts/startup-script/main.tf @@ -20,11 +20,16 @@ locals { } locals { - ops_agent_installer = var.install_cloud_ops_agent ? [{ - type = "shell" - source = "${path.module}/files/install_cloud_ops_agent.sh" - destination = "install_cloud_ops_agent_automatic.sh" - }] : [] + monitoring_agent_installer = ( + var.install_cloud_ops_agent || var.install_stackdriver_agent ? + [{ + type = "shell" + source = "${path.module}/files/install_monitoring_agent.sh" + destination = "install_monitoring_agent_automatic.sh" + args = var.install_cloud_ops_agent ? "false" : "true" # install legacy (stackdriver) + }] : + [] + ) warnings = [ { @@ -84,7 +89,7 @@ locals { runners = concat( local.warnings, local.proxy_runner, - local.ops_agent_installer, + local.monitoring_agent_installer, local.ansible_installer, local.configure_ssh_runners, var.runners @@ -167,6 +172,13 @@ resource "google_storage_bucket_object" "scripts" { create = "10m" update = "10m" } + + lifecycle { + precondition { + condition = !var.install_cloud_ops_agent || !var.install_stackdriver_agent + error_message = "Only one of var.install_stackdriver_agent or var.install_cloud_ops_agent can be set. Stackdriver is recommended for best performance." + } + } } resource "local_file" "debug_file" { diff --git a/modules/scripts/startup-script/variables.tf b/modules/scripts/startup-script/variables.tf index 17d609aba7..e4d56a5999 100644 --- a/modules/scripts/startup-script/variables.tf +++ b/modules/scripts/startup-script/variables.tf @@ -113,7 +113,13 @@ EOT } variable "install_cloud_ops_agent" { - description = "Run Google Ops Agent installation script if set to true." + description = "Warning: Consider using `install_stackdriver_agent` for better performance. Run Google Ops Agent installation script if set to true." + type = bool + default = false +} + +variable "install_stackdriver_agent" { + description = "Run Google Stackdriver Agent installation script if set to true. Preferred over ops agent for performance." 
type = bool default = false } From 468225afcb693db571d7e0be2b931a4114f4d8e4 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Tue, 9 Jan 2024 23:56:58 -0800 Subject: [PATCH 034/151] Update tests to focus on stackdriver while still testing ops agent --- .../test-validation/test-monitoring.yml | 12 ++++-- .../daily-tests/blueprints/monitoring.yaml | 41 +++++++++++++++---- .../daily-tests/tests/monitoring.yml | 2 +- .../test_configs/apt-collision.yaml | 2 +- .../test_configs/ubuntu-ss.yaml | 2 +- 5 files changed, 44 insertions(+), 15 deletions(-) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-monitoring.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-monitoring.yml index 00d3ca7676..c5c34701a7 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-monitoring.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-monitoring.yml @@ -19,24 +19,28 @@ vars: vm_name: "{{ remote_node }}" timeout_seconds: 600 + - name: Gather service facts become: true ansible.builtin.service_facts: -- name: Fail if ops agent is not running + +- name: Fail if stackdriver agent is not running ansible.builtin.assert: that: - - ansible_facts.services["google-cloud-ops-agent.service"].status == "enabled" - - ansible_facts.services["google-cloud-ops-agent-fluent-bit.service"].state == "running" - - ansible_facts.services["google-cloud-ops-agent-opentelemetry-collector.service"].state == "running" + - ansible_facts.services["stackdriver-agent"].status == "enabled" + - ansible_facts.services["stackdriver-agent"].state == "running" + - name: Check that monitoring dashboard has been created changed_when: false ansible.builtin.command: gcloud monitoring dashboards list --format="get(displayName)" run_once: true delegate_to: localhost register: dashboards + - name: Print dashboard information ansible.builtin.debug: var: dashboards + - name: Fail if the HPC Dashboard hasn't been created ansible.builtin.fail: msg: Failed to create dashboard diff --git a/tools/cloud-build/daily-tests/blueprints/monitoring.yaml b/tools/cloud-build/daily-tests/blueprints/monitoring.yaml index c36a55425a..4704259c1c 100644 --- a/tools/cloud-build/daily-tests/blueprints/monitoring.yaml +++ b/tools/cloud-build/daily-tests/blueprints/monitoring.yaml @@ -21,6 +21,8 @@ vars: deployment_name: monitoring region: us-central1 zone: us-central1-c + add_deployment_name_before_prefix: true + machine_type: c2-standard-4 deployment_groups: - group: primary @@ -38,24 +40,47 @@ deployment_groups: - id: bucket-for-startup-script source: community/modules/file-system/cloud-storage-bucket - - id: startup + - id: startup-ops source: modules/scripts/startup-script use: [bucket-for-startup-script] settings: install_cloud_ops_agent: true - - id: workstation - source: ./modules/compute/vm-instance + - id: workstation-ops + source: modules/compute/vm-instance use: - network - homefs - - startup + - startup-ops settings: - machine_type: c2-standard-4 - metadata: - enable-oslogin: true + name_prefix: workstation-ops + + - id: startup-stack + source: modules/scripts/startup-script + use: [bucket-for-startup-script] + settings: + install_stackdriver_agent: true + + - id: workstation-stack + source: modules/compute/vm-instance + use: + - network + - homefs + - startup-stack + settings: + name_prefix: workstation-stackdriver + + - id: wait0 + source: community/modules/scripts/wait-for-startup + settings: + instance_name: $(workstation-ops.name[0]) + + - id: wait1 + 
source: community/modules/scripts/wait-for-startup + settings: + instance_name: $(workstation-stack.name[0]) - id: hpc-dash - source: ./modules/monitoring/dashboard + source: modules/monitoring/dashboard settings: title: $(vars.deployment_name) diff --git a/tools/cloud-build/daily-tests/tests/monitoring.yml b/tools/cloud-build/daily-tests/tests/monitoring.yml index b459f5f9b5..21866db4c0 100644 --- a/tools/cloud-build/daily-tests/tests/monitoring.yml +++ b/tools/cloud-build/daily-tests/tests/monitoring.yml @@ -20,6 +20,6 @@ zone: us-central1-c workspace: /workspace blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/monitoring.yaml" network: "{{ deployment_name }}-net" -remote_node: "{{ deployment_name }}-0" +remote_node: "{{ deployment_name }}-workstation-stackdriver-0" post_deploy_tests: - test-validation/test-monitoring.yml diff --git a/tools/validate_configs/test_configs/apt-collision.yaml b/tools/validate_configs/test_configs/apt-collision.yaml index 0043ea941a..9ab7a7e8a3 100644 --- a/tools/validate_configs/test_configs/apt-collision.yaml +++ b/tools/validate_configs/test_configs/apt-collision.yaml @@ -42,7 +42,7 @@ deployment_groups: kind: terraform id: startup settings: - install_cloud_ops_agent: true + install_stackdriver_agent: true install_ansible: true - source: modules/compute/vm-instance diff --git a/tools/validate_configs/test_configs/ubuntu-ss.yaml b/tools/validate_configs/test_configs/ubuntu-ss.yaml index 35657939d8..67a15a8437 100644 --- a/tools/validate_configs/test_configs/ubuntu-ss.yaml +++ b/tools/validate_configs/test_configs/ubuntu-ss.yaml @@ -63,7 +63,7 @@ deployment_groups: - id: startup source: ./modules/scripts/startup-script settings: - install_cloud_ops_agent: true + install_stackdriver_agent: true runners: - type: data source: /tmp/foo.tgz From e6896fdb7ba606a56561b04657c267e07d015813 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 10 Jan 2024 10:37:08 -0800 Subject: [PATCH 035/151] Unify usage and rendering of `HintError` (#2095) --- cmd/create.go | 52 --------------------- cmd/create_test.go | 27 ----------- cmd/render.go | 87 ++++++++++++++++++++++++++++++++++++ cmd/render_test.go | 80 +++++++++++++++++++++++++++++++++ pkg/validators/cloud.go | 6 +-- pkg/validators/validators.go | 13 +++--- 6 files changed, 175 insertions(+), 90 deletions(-) create mode 100644 cmd/render.go create mode 100644 cmd/render_test.go diff --git a/cmd/create.go b/cmd/create.go index 21ff70dbbe..c76c640a2a 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -159,58 +159,6 @@ func validateMaybeDie(bp config.Blueprint, ctx config.YamlCtx) { } -func findPos(path config.Path, ctx config.YamlCtx) (config.Pos, bool) { - pos, ok := ctx.Pos(path) - for !ok && path.Parent() != nil { - path = path.Parent() - pos, ok = ctx.Pos(path) - } - return pos, ok -} - -func renderError(err error, ctx config.YamlCtx) string { - switch te := err.(type) { - case config.Errors: - var sb strings.Builder - for _, e := range te.Errors { - sb.WriteString(renderError(e, ctx)) - sb.WriteString("\n") - } - return sb.String() - case validators.ValidatorError: - title := boldRed(fmt.Sprintf("validator %q failed:", te.Validator)) - return fmt.Sprintf("%s\n%v\n", title, renderError(te.Err, ctx)) - case config.BpError: - if pos, ok := findPos(te.Path, ctx); ok { - return renderRichError(te.Err, pos, ctx) - } - return renderError(te.Err, ctx) - default: - return err.Error() - } -} - -func renderRichError(err error, pos config.Pos, ctx config.YamlCtx) string { - line := pos.Line - 1 - if line < 
0 { - line = 0 - } - if line >= len(ctx.Lines) { - line = len(ctx.Lines) - 1 - } - - pref := fmt.Sprintf("%d: ", pos.Line) - arrow := " " - if pos.Column > 0 { - spaces := strings.Repeat(" ", len(pref)+pos.Column-1) - arrow = spaces + "^" - } - - return fmt.Sprintf(`%s: %s -%s%s -%s`, boldRed("Error"), err, pref, ctx.Lines[line], arrow) -} - func setCLIVariables(bp *config.Blueprint, s []string) error { for _, cliVar := range s { arr := strings.SplitN(cliVar, "=", 2) diff --git a/cmd/create_test.go b/cmd/create_test.go index 94454fe26a..b3f11714ee 100644 --- a/cmd/create_test.go +++ b/cmd/create_test.go @@ -15,7 +15,6 @@ package cmd import ( - "errors" "hpc-toolkit/pkg/config" "hpc-toolkit/pkg/modulewriter" "os" @@ -133,32 +132,6 @@ func (s *MySuite) TestValidationLevels(c *C) { c.Check(setValidationLevel(&bp, "INVALID"), NotNil) } -func (s *MySuite) TestRenderError(c *C) { - { // simple - err := errors.New("arbuz") - got := renderError(err, config.YamlCtx{}) - c.Check(got, Equals, "arbuz") - } - { // has pos, but context doesn't contain it - ctx, _ := config.NewYamlCtx([]byte(``)) - pth := config.Root.Vars.Dot("kale") - err := config.BpError{Path: pth, Err: errors.New("arbuz")} - got := renderError(err, ctx) - c.Check(got, Equals, "arbuz") - } - { // has pos, has context - ctx, _ := config.NewYamlCtx([]byte(` -vars: - kale: dos`)) - pth := config.Root.Vars.Dot("kale") - err := config.BpError{Path: pth, Err: errors.New("arbuz")} - got := renderError(err, ctx) - c.Check(got, Equals, `Error: arbuz -3: kale: dos - ^`) - } -} - func (s *MySuite) TestValidateMaybeDie(c *C) { bp := config.Blueprint{ Validators: []config.Validator{{Validator: "invalid"}}, diff --git a/cmd/render.go b/cmd/render.go new file mode 100644 index 0000000000..6c0013524a --- /dev/null +++ b/cmd/render.go @@ -0,0 +1,87 @@ +// Copyright 2024 "Google LLC" +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cmd + +import ( + "fmt" + "hpc-toolkit/pkg/config" + "hpc-toolkit/pkg/validators" + "strings" +) + +func findPos(path config.Path, ctx config.YamlCtx) (config.Pos, bool) { + pos, ok := ctx.Pos(path) + for !ok && path.Parent() != nil { + path = path.Parent() + pos, ok = ctx.Pos(path) + } + return pos, ok +} + +func renderError(err error, ctx config.YamlCtx) string { + switch te := err.(type) { + case config.Errors: + return renderMultiError(te, ctx) + case validators.ValidatorError: + return renderValidatorError(te, ctx) + case config.HintError: + return renderHintError(te, ctx) + case config.BpError: + return renderBpError(te, ctx) + default: + return fmt.Sprintf("%s: %s", boldRed("Error"), err) + } +} + +func renderMultiError(errs config.Errors, ctx config.YamlCtx) string { + var sb strings.Builder + for _, e := range errs.Errors { + sb.WriteString(renderError(e, ctx)) + sb.WriteString("\n") + } + return sb.String() +} + +func renderValidatorError(err validators.ValidatorError, ctx config.YamlCtx) string { + title := boldRed(fmt.Sprintf("validator %q failed:", err.Validator)) + return fmt.Sprintf("%s\n%v\n", title, renderError(err.Err, ctx)) +} + +func renderHintError(err config.HintError, ctx config.YamlCtx) string { + return fmt.Sprintf("%s\n%s: %s", renderError(err.Err, ctx), boldYellow("Hint"), err.Hint) +} + +func renderBpError(err config.BpError, ctx config.YamlCtx) string { + if pos, ok := findPos(err.Path, ctx); ok { + return renderPosError(err.Err, pos, ctx) + } + return renderError(err.Err, ctx) +} + +func renderPosError(err error, pos config.Pos, ctx config.YamlCtx) string { + line := pos.Line - 1 + if line < 0 || line >= len(ctx.Lines) { + return renderError(err, ctx) + } + + pref := fmt.Sprintf("%d: ", pos.Line) + arrow := " " + if pos.Column > 0 { + spaces := strings.Repeat(" ", len(pref)+pos.Column-1) + arrow = spaces + "^" + } + + return fmt.Sprintf("%s\n%s%s\n%s", renderError(err, ctx), pref, ctx.Lines[line], arrow) +} diff --git a/cmd/render_test.go b/cmd/render_test.go new file mode 100644 index 0000000000..a1ad7bded4 --- /dev/null +++ b/cmd/render_test.go @@ -0,0 +1,80 @@ +// Copyright 2024 "Google LLC" +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cmd + +import ( + "errors" + "hpc-toolkit/pkg/config" + "testing" + + "github.com/google/go-cmp/cmp" +) + +func makeCtx(yml string, t *testing.T) config.YamlCtx { + ctx, err := config.NewYamlCtx([]byte(yml)) + if err != nil { + t.Fatal(err, yml) + } + return ctx +} + +func TestRenderError(t *testing.T) { + type test struct { + err error + ctx config.YamlCtx + want string + } + tests := []test{ + {errors.New("arbuz"), makeCtx("", t), "Error: arbuz"}, + { // has pos, but context doesn't contain it + err: config.BpError{Path: config.Root.Vars.Dot("kale"), Err: errors.New("arbuz")}, + ctx: makeCtx("", t), + want: "Error: arbuz"}, + { // has pos, has context + err: config.BpError{Path: config.Root.Vars.Dot("kale"), Err: errors.New("arbuz")}, + ctx: makeCtx(` +vars: + kale: dos`, t), + want: `Error: arbuz +3: kale: dos + ^`}, + { + err: config.HintError{Hint: "did you mean 'kale'?", Err: errors.New("arbuz")}, + ctx: makeCtx("", t), + want: `Error: arbuz +Hint: did you mean 'kale'?`}, + { // has pos, has context + err: config.BpError{ + Path: config.Root.Vars.Dot("kale"), + Err: config.HintError{ + Hint: "did you mean 'kale'?", + Err: errors.New("arbuz")}}, + ctx: makeCtx(` +vars: + kale: dos`, t), + want: `Error: arbuz +Hint: did you mean 'kale'? +3: kale: dos + ^`}, + } + for _, tc := range tests { + t.Run(tc.want, func(t *testing.T) { + got := renderError(tc.err, tc.ctx) + if diff := cmp.Diff(tc.want, got); diff != "" { + t.Errorf("diff (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/validators/cloud.go b/pkg/validators/cloud.go index 7b5f6576f8..78a0585f2b 100644 --- a/pkg/validators/cloud.go +++ b/pkg/validators/cloud.go @@ -42,9 +42,9 @@ func getErrorReason(err googleapi.Error) (string, map[string]interface{}) { } func newDisabledServiceError(title string, name string, pid string) error { - return hint( - fmt.Errorf("%s service is disabled in project %s", title, pid), - fmt.Sprintf("%s can be enabled at https://console.cloud.google.com/apis/library/%s?project=%s", title, name, pid)) + return config.HintError{ + Hint: fmt.Sprintf("%s can be enabled at https://console.cloud.google.com/apis/library/%s?project=%s", title, name, pid), + Err: fmt.Errorf("%s service is disabled in project %s", title, pid)} } func handleServiceUsageError(err error, pid string) error { diff --git a/pkg/validators/validators.go b/pkg/validators/validators.go index d0076060ab..57c39365cb 100644 --- a/pkg/validators/validators.go +++ b/pkg/validators/validators.go @@ -15,6 +15,7 @@ package validators import ( + "errors" "fmt" "hpc-toolkit/pkg/config" "strings" @@ -27,21 +28,17 @@ const regionError = "region %s is not available in project ID %s or your credent const zoneError = "zone %s is not available in project ID %s or your credentials do not have permission to access it" const zoneInRegionError = "zone %s is not in region %s in project ID %s or your credentials do not have permissions to access it" const unusedModuleMsg = "module %q uses module %q, but matching setting and outputs were not found. 
This may be because the value is set explicitly or set by a prior used module" +const credentialsHint = "load application default credentials following instructions at https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/README.md#supplying-cloud-credentials-to-terraform" + +var ErrNoDefaultCredentials = errors.New("could not find application default credentials") func handleClientError(e error) error { if strings.Contains(e.Error(), "could not find default credentials") { - return hint( - fmt.Errorf("could not find application default credentials"), - "load application default credentials following instructions at https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/README.md#supplying-cloud-credentials-to-terraform") + return config.HintError{Hint: credentialsHint, Err: ErrNoDefaultCredentials} } return e } -// TODO: use HintError trait once its implemented -func hint(err error, h string) error { - return fmt.Errorf("%w\n%s", err, h) -} - const ( testApisEnabledName = "test_apis_enabled" testProjectExistsName = "test_project_exists" From 30b557aeacfce28cee6016eb933dae41f6ebef42 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Wed, 10 Jan 2024 12:30:54 -0800 Subject: [PATCH 036/151] Rename slurm tpu test to be consistent with blueprint name --- tools/cloud-build/babysit_tests.py | 3 +++ tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml | 2 +- tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/cloud-build/babysit_tests.py b/tools/cloud-build/babysit_tests.py index 13edb7405d..25fa7267dc 100755 --- a/tools/cloud-build/babysit_tests.py +++ b/tools/cloud-build/babysit_tests.py @@ -76,6 +76,9 @@ def selector(build: Build) -> bool: "PR-test-hpc-slurm-chromedesktop", "PR-test-lustre-slurm", ]), + "slurm6": selector_by_name([ + "PR-test-slurm-gcp-v6-tpu", + ]), "spack": selector_by_name([ "PR-test-batch-mpi", "PR-test-spack-gromacs", diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml index 30daa86b8f..3139adc87c 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml @@ -36,7 +36,7 @@ steps: - echo "done fetching builder" ## Test Slurm v6 TPU example -- id: slurm-gcp-v6-tpu +- id: slurm6-tpu waitFor: ["fetch_builder", "build_ghpc"] name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder entrypoint: /bin/bash diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml index 8b58588ede..82f8427ff1 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml @@ -14,7 +14,7 @@ --- -test_name: hpc-slurm6-tpu +test_name: slurm6-tpu deployment_name: "v6-tpu-{{ build }}" # Manually adding the slurm_cluster_name for use in node names, which filters # non-alphanumeric chars and is capped at 10 chars. 
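The error-rendering changes in this series compose recursively: `renderError` unwraps a `BpError` to locate a YAML position, then renders the inner error, so a `HintError` nested inside a `BpError` yields the underlying message, the hint, and a caret under the offending line. A minimal sketch of that flow, assuming the `cmd` package internals shown in the patch above (the blueprint text, variable names, and `sketchRender` helper are illustrative, not part of the patch):

```go
package cmd

import (
	"errors"
	"fmt"

	"hpc-toolkit/pkg/config"
)

// sketchRender demonstrates the recursive rendering path: renderBpError
// resolves the YAML position for the path, then renderHintError prints
// the wrapped error followed by the hint.
func sketchRender() {
	// NewYamlCtx records a position for each node in the YAML source.
	ctx, _ := config.NewYamlCtx([]byte("\nvars:\n  kale: dos"))
	err := config.BpError{
		Path: config.Root.Vars.Dot("kale"),
		Err: config.HintError{
			Hint: "did you mean 'kale'?",
			Err:  errors.New("arbuz"),
		},
	}
	fmt.Println(renderError(err, ctx))
	// Expected shape of the output (colors omitted):
	// Error: arbuz
	// Hint: did you mean 'kale'?
	// 3:   kale: dos
	//      ^
}
```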
From 3543ffa89d9e17f7fb77b2e66d1e867a4dc745a7 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Wed, 10 Jan 2024 12:52:29 -0800 Subject: [PATCH 037/151] Add example script to uninstall Ops Agent and install Stackdriver Agent --- modules/scripts/startup-script/README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index c27bba68ed..8b42319d90 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -141,6 +141,26 @@ better performance under some HPC workloads. While official documentation recommends using the _Cloud Ops Agent_, it is recommended to use `install_stackdriver_agent` when performance is important. +If an image or machine already has Cloud Ops Agent installed and you would like +to instead use the Stackdrier Agent, the following script will remove the Cloud +Ops Agent and install the Stackdriver Agent. + +```bash +# Remove Cloud Ops Agent +sudo systemctl stop google-cloud-ops-agent.service +sudo systemctl disable google-cloud-ops-agent.service +curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh +sudo bash add-google-cloud-ops-agent-repo.sh --uninstall +sudo bash add-google-cloud-ops-agent-repo.sh --remove-repo + +# Install Stackdriver Agent +curl -sSO https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh +sudo bash add-monitoring-agent-repo.sh --also-install +curl -sSO https://dl.google.com/cloudagents/add-logging-agent-repo.sh +sudo bash add-logging-agent-repo.sh --also-install +sudo service stackdriver-agent start +``` + ### Example ```yaml From 733d90dae4cb9484d35f2a0189e9f27bb53f9ef8 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 10 Jan 2024 16:25:04 -0600 Subject: [PATCH 038/151] Silence make error message for old versions of git Older versions of git do not have a '--show-current' flag on the git branch command. This command allows fallback to the ancient approach to determining the active branch and also redirects stderr to /dev/null. If neither command succeeds, then ghpc --version reports detached HEAD for the branch. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4efab7a3ac..d452cddf93 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ ifneq (, $(shell which git)) ifneq (,$(wildcard .git)) ## GIT DIRECTORY EXISTS GIT_TAG_VERSION=$(shell git tag --points-at HEAD) -GIT_BRANCH=$(shell git branch --show-current) +GIT_BRANCH=$(shell $(SHELL) -c 'git branch --show-current || git rev-parse --abbrev-ref HEAD' 2>/dev/null) GIT_COMMIT_INFO=$(shell git describe --tags --dirty --long --always) GIT_COMMIT_HASH=$(shell git rev-parse HEAD) GIT_INITIAL_HASH=$(shell git rev-list --max-parents=0 HEAD) From 872727b9fbfabb52e63f5bbeb2e051af24e0230a Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 10 Jan 2024 14:53:29 -0800 Subject: [PATCH 039/151] yamllint. 
Don't show warnings (#2122) Motivation: warnings doesn't cause lint to fail (only errors do), but they will be outputed along the errors (many lines), that makes it hard to see the actual error message --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3501b4095f..006143f19e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -89,7 +89,7 @@ repos: rev: v1.32.0 hooks: - id: yamllint - args: [-c=.yamllint] + args: [-c=.yamllint, --no-warnings] - repo: https://github.com/jackdewinter/pymarkdown rev: v0.9.12 hooks: From a75c5b50153f4c1f8eff606f2e345fda5d12dd9c Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 10 Jan 2024 15:28:03 -0800 Subject: [PATCH 040/151] Move `examples/hpc-slurm` to V6 (#2097) pick f88a30f2 Unify usage and rendering of `HintError` * Move `examples/hpc-slurm` to V6; * Updated `examples/README`; * Remove `slurm-v5-hpc-centos7` test. --- community/examples/hpc-slurm6.yaml | 87 ------------------- .../schedmd-slurm-gcp-v5-login/README.md | 3 - examples/README.md | 36 +------- examples/hpc-slurm.yaml | 66 +++++++------- .../builds/slurm-gcp-v5-hpc-centos7.yaml | 54 ------------ .../tests/slurm-v5-hpc-centos7.yml | 43 --------- .../daily-tests/tests/slurm-v6-rocky8.yml | 3 +- 7 files changed, 37 insertions(+), 255 deletions(-) delete mode 100644 community/examples/hpc-slurm6.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml delete mode 100644 tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml diff --git a/community/examples/hpc-slurm6.yaml b/community/examples/hpc-slurm6.yaml deleted file mode 100644 index cf6a15b072..0000000000 --- a/community/examples/hpc-slurm6.yaml +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- - -blueprint_name: hpc-slurm6 - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: slurm-gcp-v6 - region: us-west4 - zone: us-west4-c - instance_image: - family: slurm-gcp-6-1-hpc-rocky-linux-8 - project: schedmd-slurm-public - -deployment_groups: -- group: primary - modules: - - id: network - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network] - settings: - local_mount: /home - - - id: debug_nodeset - source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset - use: [network] - settings: - node_count_dynamic_max: 4 - machine_type: n2-standard-2 - enable_placement: false # the default is: true - - - id: debug_partition - source: community/modules/compute/schedmd-slurm-gcp-v6-partition - use: [debug_nodeset, homefs] - settings: - partition_name: debug - exclusive: false # allows nodes to stay up after jobs are done - is_default: true - - - id: compute_nodeset - source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset - use: [network] - settings: - name: ns2 - node_count_dynamic_max: 20 - bandwidth_tier: gvnic_enabled - - - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v6-partition - use: [compute_nodeset, homefs] - settings: - partition_name: compute - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v6-login - use: [network] - settings: - name_prefix: login - machine_type: n2-standard-4 - disable_login_public_ips: false - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller - use: - - network - - debug_partition - - compute_partition - - slurm_login - - homefs - settings: - disable_controller_public_ips: false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 4833a999e5..09979f2320 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -28,9 +28,6 @@ This creates a Slurm login node which is: `use` * of VM machine type `n2-standard-4` -For a complete example using this module, see -[hpc-slurm.yaml](../../../../examples/hpc-slurm.yaml). - ## Custom Images For more information on creating valid custom images for the login node VM diff --git a/examples/README.md b/examples/README.md index 9b866bc960..b5f192ec09 100644 --- a/examples/README.md +++ b/examples/README.md @@ -13,7 +13,6 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [Blueprint Descriptions](#blueprint-descriptions) * [hpc-slurm.yaml](#hpc-slurmyaml-) ![core-badge] * [hpc-enterprise-slurm.yaml](#hpc-enterprise-slurmyaml-) ![core-badge] - * [hpc-slurm6.yaml](#hpc-slurm6yaml--) ![community-badge] ![experimental-badge] * [hpc-slurm6-tpu.yaml](#hpc-slurm6-tpuyaml--) ![community-badge] ![experimental-badge] * [ml-slurm.yaml](#ml-slurmyaml-) ![core-badge] * [image-builder.yaml](#image-builderyaml-) ![core-badge] @@ -118,13 +117,11 @@ the experimental badge (![experimental-badge]). ### [hpc-slurm.yaml] ![core-badge] -> **Warning**: The variables `enable_reconfigure`, -> `enable_cleanup_compute`, and `enable_cleanup_subscriptions`, if set to -> `true`, require additional dependencies **to be installed on the system deploying the infrastructure**. +> **Warning**: Requires additional dependencies **to be installed on the system deploying the infrastructure**. 
> > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.9.1/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/6.2.1/scripts/requirements.txt > ``` Creates a basic auto-scaling Slurm cluster with mostly default settings. The @@ -265,35 +262,6 @@ to 256 [hpc-enterprise-slurm.yaml]: ./hpc-enterprise-slurm.yaml -### [hpc-slurm6.yaml] ![community-badge] ![experimental-badge] - -> **Warning**: Requires additional dependencies **to be installed on the system deploying the infrastructure**. -> -> ```shell -> # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/6.2.1/scripts/requirements.txt -> ``` - -Creates a basic auto-scaling Slurm cluster with mostly default settings. The -blueprint also creates a new VPC network, and a filestore instance mounted to -`/home`. - -There are 2 partitions in this example: `debug`, and `compute`. The `debug` -partition uses `n2-standard-2` VMs, which should work out of the box without -needing to request additional quota. The purpose of the `debug` partition is to -make sure that first time users are not immediately blocked by quota -limitations. - -[hpc-slurm6.yaml]: ../community/examples/hpc-slurm6.yaml - -#### Compute Partition - -There is a `compute` partition that achieves higher performance. Any -performance analysis should be done on the `compute` partition. By default it -uses `c2-standard-60` VMs with placement groups enabled. You may need to request -additional quota for `C2 CPUs` in the region you are deploying in. You can -select the compute partition using the `-p compute` argument when running `srun`. - ### [hpc-slurm6-tpu.yaml] ![community-badge] ![experimental-badge] > **Warning**: Requires additional dependencies **to be installed on the system deploying the infrastructure**. diff --git a/examples/hpc-slurm.yaml b/examples/hpc-slurm.yaml index 439870b8fe..59edc0586b 100644 --- a/examples/hpc-slurm.yaml +++ b/examples/hpc-slurm.yaml @@ -18,7 +18,7 @@ blueprint_name: hpc-slurm vars: project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-small + deployment_name: hpc-slurm region: us-central1 zone: us-central1-a @@ -28,53 +28,54 @@ vars: deployment_groups: - group: primary modules: - # Source is an embedded resource, denoted by "resources/*" without ./, ../, / - # as a prefix. To refer to a local resource, prefix with ./, ../ or / - # Example - ./resources/network/vpc - - id: network1 + # Source is an embedded module, denoted by "modules/*" without ./, ../, / + # as a prefix. 
To refer to a local module, prefix with ./, ../ or / + # Example - ./modules/network/vpc + - id: network source: modules/network/vpc - id: homefs source: modules/file-system/filestore - use: [network1] + use: [network] settings: local_mount: /home - - id: debug_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + - id: debug_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] settings: node_count_dynamic_max: 4 machine_type: n2-standard-2 + enable_placement: false # the default is: true - id: debug_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - network1 - homefs - - debug_node_group + - debug_nodeset settings: partition_name: debug exclusive: false # allows nodes to stay up after jobs are done - enable_placement: false # the default is: true is_default: true - - id: compute_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] settings: node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled - id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - network1 - homefs - - compute_node_group + - compute_nodeset settings: partition_name: compute - - id: h3_node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + - id: h3_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] settings: node_count_dynamic_max: 20 machine_type: h3-standard-88 @@ -84,30 +85,29 @@ deployment_groups: bandwidth_tier: gvnic_enabled - id: h3_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - - network1 - homefs - - h3_node_group + - h3_nodeset settings: partition_name: h3 + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network] + settings: + name_prefix: login + machine_type: n2-standard-4 + disable_login_public_ips: false + - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - - network1 + - network - debug_partition - compute_partition - h3_partition - homefs + - slurm_login settings: disable_controller_public_ips: false - - - id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - machine_type: n2-standard-4 - disable_login_public_ips: false diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml deleted file mode 100644 index 3896883092..0000000000 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-hpc-centos7.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - ---- -timeout: 14400s # 4hr -steps: -## Test simple golang build -- id: build_ghpc - waitFor: ["-"] - name: "golang:bullseye" - entrypoint: /bin/bash - args: - - -c - - | - cd /workspace - make -- id: fetch_builder - waitFor: ["-"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - -c - - echo "done fetching builder" -## Test Slurm v5 HPC Centos7 Example -- id: slurm-gcp-v5-hpc-centos7 - waitFor: ["fetch_builder", "build_ghpc"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml" diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml deleted file mode 100644 index fe646b74da..0000000000 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -test_name: hpc-slurm -deployment_name: "cent-v5-{{ build }}" -# Manually adding the slurm_cluster_name for use in node names, which filters -# non-alphanumeric chars and is capped at 10 chars. -slurm_cluster_name: "centv5{{ build[0:4] }}" -zone: us-west4-c -cli_deployment_vars: - enable_cleanup_compute: true - region: us-west4 - zone: "{{ zone }}" - zones: "[us-west4-a,us-west4-b,us-west4-c]" -workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/hpc-slurm.yaml" -network: "{{ deployment_name }}-net" -max_nodes: 5 -# Note: Pattern matching in gcloud only supports 1 wildcard, centv5*-login-* won't work. 
-login_node: "{{ slurm_cluster_name }}-login-*" -controller_node: "{{ slurm_cluster_name }}-controller" -post_deploy_tests: -- test-validation/test-mounts.yml -- test-validation/test-partitions.yml -custom_vars: - partitions: - - compute - - debug - mounts: - - /home diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml index b5f0a8655b..b77baf5382 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml @@ -26,13 +26,14 @@ cli_deployment_vars: zone: us-west4-c workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm6.yaml" +blueprint_yaml: "{{ workspace }}/examples/hpc-slurm.yaml" network: "{{ deployment_name }}-net" max_nodes: 5 # Note: Pattern matching in gcloud only supports 1 wildcard, a*-login-* won't work. login_node: "{{ slurm_cluster_name }}-login-*" controller_node: "{{ slurm_cluster_name }}-controller" post_deploy_tests: +- test-validation/test-mounts.yml - test-validation/test-partitions.yml custom_vars: partitions: From 565b29f1947dcacc4aedb5652d02081df03c6ea9 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 10 Jan 2024 15:32:21 -0800 Subject: [PATCH 041/151] Add `has_to_be_used` behaviour to some of modules (#2092) --- .../metadata.yaml | 2 ++ .../metadata.yaml | 1 + .../metadata.yaml | 2 ++ .../schedmd-slurm-gcp-v6-login/metadata.yaml | 2 ++ docs/module-guidelines.md | 5 +++- pkg/config/expand.go | 23 +++++++++++++++++++ pkg/modulereader/metadata.go | 2 ++ 7 files changed, 36 insertions(+), 1 deletion(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/metadata.yaml b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/metadata.yaml index 641832182d..13ea127b3c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/metadata.yaml +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/metadata.yaml @@ -16,3 +16,5 @@ spec: requirements: services: [] +ghpc: + has_to_be_used: true diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/metadata.yaml b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/metadata.yaml index 929eeecaf0..95b6d1c730 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/metadata.yaml +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/metadata.yaml @@ -18,3 +18,4 @@ spec: services: [] ghpc: inject_module_id: name + has_to_be_used: true diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/metadata.yaml b/community/modules/compute/schedmd-slurm-gcp-v6-partition/metadata.yaml index 641832182d..13ea127b3c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/metadata.yaml +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/metadata.yaml @@ -16,3 +16,5 @@ spec: requirements: services: [] +ghpc: + has_to_be_used: true diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/metadata.yaml b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/metadata.yaml index 641832182d..13ea127b3c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/metadata.yaml +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/metadata.yaml @@ -16,3 +16,5 @@ spec: requirements: services: [] +ghpc: + has_to_be_used: true diff --git a/docs/module-guidelines.md b/docs/module-guidelines.md index 0c95cba054..e8339663d1 100644 --- a/docs/module-guidelines.md +++ b/docs/module-guidelines.md @@ -201,5 +201,8 @@ spec: ghpc: # [optional] # 
[optional] `inject_module_id`, if set, will inject blueprint # module id as a value for the module variable `var_name`. - inject_module_id: var_name + inject_module_id: var_name + # [optional] `has_to_be_used` is a boolean flag, if set to true, + # the creation will fail if the module is not used. + has_to_be_used: true ``` diff --git a/pkg/config/expand.go b/pkg/config/expand.go index deccfe8e93..f9f5a0c87e 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -56,6 +56,10 @@ func (dc *DeploymentConfig) expand() error { return err } + if err := validateModulesAreUsed(dc.Config); err != nil { + return err + } + dc.Config.populateOutputs() return nil } @@ -114,6 +118,25 @@ func checkInputValueMatchesType(val cty.Value, input modulereader.VarInfo, bp Bl return nil } +func validateModulesAreUsed(bp Blueprint) error { + used := map[ModuleID]bool{} + bp.WalkModulesSafe(func(_ ModulePath, m *Module) { + for ref := range valueReferences(m.Settings.AsObject()) { + used[ref.Module] = true + } + }) + + errs := Errors{} + bp.WalkModulesSafe(func(p ModulePath, m *Module) { + if m.InfoOrDie().Metadata.Ghpc.HasToBeUsed && !used[m.ID] { + errs.At(p.ID, HintError{ + "you need to add it to the `use`-block of downstream modules", + fmt.Errorf("module %q was not used", m.ID)}) + } + }) + return errs.OrNil() +} + func (dc *DeploymentConfig) expandBackends() { // 1. DEFAULT: use TerraformBackend configuration (if supplied) in each // resource group diff --git a/pkg/modulereader/metadata.go b/pkg/modulereader/metadata.go index 15069f6c11..310ae74120 100644 --- a/pkg/modulereader/metadata.go +++ b/pkg/modulereader/metadata.go @@ -47,6 +47,8 @@ type MetadataGhpc struct { // Optional, set to the string-typed module variable name. // If set, the blueprint module id will be set as a value of this variable. InjectModuleId string `yaml:"inject_module_id"` + // If set to true, the creation will fail if the module is not used. + HasToBeUsed bool `yaml:"has_to_be_used"` } // GetMetadata reads and parses `metadata.yaml` from module root. From 0e79ce688f51ef105e9b087f63464ec4b9ceaa09 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 10 Jan 2024 19:01:41 -0800 Subject: [PATCH 042/151] Update README.md --- community/modules/file-system/DDN-EXAScaler/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/file-system/DDN-EXAScaler/README.md b/community/modules/file-system/DDN-EXAScaler/README.md index 3206752cb8..f825d0e6d2 100644 --- a/community/modules/file-system/DDN-EXAScaler/README.md +++ b/community/modules/file-system/DDN-EXAScaler/README.md @@ -27,7 +27,7 @@ Toolkit, see the extended [Network Storage documentation](../../../../docs/netwo ## Mounting To mount the DDN EXAScaler Lustre file system you must first install the DDN -Luster client and then call the proper `mount` command. +Lustre client and then call the proper `mount` command. Both of these steps are automatically handled with the use of the `use` command in a selection of HPC Toolkit modules. See the [compatibility matrix][matrix] in From a637d162cf4aa72835329bd0fffa5607a662a124 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 11 Jan 2024 10:29:47 -0600 Subject: [PATCH 043/151] Reduce default maximum number of HTCondor execute points Especialy for initial deployments, a maximum of 100 could result in significant spend beyond what was anticipated. Reducing to 5 addresses this while still allowing the user to deliberately scale up. 
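The new ceiling is only a module default; a deployment that genuinely needs a larger pool can still opt in by setting `max_size` explicitly. A rough blueprint sketch — the module id and surrounding structure are illustrative, while `max_size` and `min_idle` come from this module's variables:

```yaml
# Illustrative blueprint fragment: deliberately raise the pool ceiling
# above the new default of 5.
- id: execute_point   # hypothetical module id
  source: community/modules/compute/htcondor-execute-point
  settings:
    max_size: 100     # restore the previous maximum
    min_idle: 2       # optional: keep warm VMs so jobs start quickly
```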
--- community/modules/compute/htcondor-execute-point/README.md | 2 +- community/modules/compute/htcondor-execute-point/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 10a566cbb9..c381921b1d 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -204,7 +204,7 @@ limitations under the License. | [instance\_image](#input\_instance\_image) | HTCondor execute point VM image

Expected Fields:<br>name: The name of the image. Mutually exclusive with family.<br>family: The image family to use. Mutually exclusive with name.<br>project: The project where the image is hosted. | `map(string)` | <pre>{<br>  "family": "hpc-rocky-linux-8",<br>  "project": "cloud-hpc-image-public"<br>}</pre>
| no | | [labels](#input\_labels) | Labels to add to HTConodr execute points | `map(string)` | n/a | yes | | [machine\_type](#input\_machine\_type) | Machine type to use for HTCondor execute points | `string` | `"n2-standard-4"` | no | -| [max\_size](#input\_max\_size) | Maximum size of the HTCondor execute point pool. | `number` | `100` | no | +| [max\_size](#input\_max\_size) | Maximum size of the HTCondor execute point pool. | `number` | `5` | no | | [metadata](#input\_metadata) | Metadata to add to HTCondor execute points | `map(string)` | `{}` | no | | [min\_idle](#input\_min\_idle) | Minimum number of idle VMs in the HTCondor pool (if pool reaches var.max\_size, this minimum is not guaranteed); set to ensure jobs beginning run more quickly. | `number` | `0` | no | | [name\_prefix](#input\_name\_prefix) | Name prefix given to hostnames in this group of execute points; must be unique across all instances of this module | `string` | n/a | yes | diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf index 2991b25dc2..178c786c26 100644 --- a/community/modules/compute/htcondor-execute-point/variables.tf +++ b/community/modules/compute/htcondor-execute-point/variables.tf @@ -133,7 +133,7 @@ variable "target_size" { variable "max_size" { description = "Maximum size of the HTCondor execute point pool." type = number - default = 100 + default = 5 } variable "min_idle" { From 4a2b7fad09c9df0df7b063fc463280f4162a5d58 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 11 Jan 2024 11:04:15 -0800 Subject: [PATCH 044/151] Hint spelling for inputs (#2124) --- pkg/config/validate.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/config/validate.go b/pkg/config/validate.go index 16929056a4..d3a8d06009 100644 --- a/pkg/config/validate.go +++ b/pkg/config/validate.go @@ -25,6 +25,7 @@ import ( "github.com/pkg/errors" "github.com/zclconf/go-cty/cty" + "golang.org/x/exp/maps" ) const maxLabels = 64 @@ -163,7 +164,8 @@ func validateSettings( } // Setting not found if _, ok := cVars.Inputs[k]; !ok { - errs.At(sp, UnknownModuleSetting) + err := hintSpelling(k, maps.Keys(cVars.Inputs), UnknownModuleSetting) + errs.At(sp, err) continue // do not perform other validations } From fc021dde3a8d83e02c31777a87763eed2efca147 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 11 Jan 2024 12:40:11 -0800 Subject: [PATCH 045/151] Simplify rendering of errors with Position but without Path (#2096) * Remove `internalPath`; * Add `PosError` wrapper, render it specifically. 
--- cmd/render.go | 10 +++-- pkg/config/errors.go | 14 +++++++ pkg/config/path.go | 3 -- pkg/config/path_test.go | 5 --- pkg/config/yaml.go | 69 ++++++++++++++--------------------- pkg/config/yaml_test.go | 2 +- pkg/modulereader/resreader.go | 1 + 7 files changed, 50 insertions(+), 54 deletions(-) diff --git a/cmd/render.go b/cmd/render.go index 6c0013524a..769f975dd5 100644 --- a/cmd/render.go +++ b/cmd/render.go @@ -40,6 +40,8 @@ func renderError(err error, ctx config.YamlCtx) string { return renderHintError(te, ctx) case config.BpError: return renderBpError(te, ctx) + case config.PosError: + return renderPosError(te, ctx) default: return fmt.Sprintf("%s: %s", boldRed("Error"), err) } @@ -65,12 +67,14 @@ func renderHintError(err config.HintError, ctx config.YamlCtx) string { func renderBpError(err config.BpError, ctx config.YamlCtx) string { if pos, ok := findPos(err.Path, ctx); ok { - return renderPosError(err.Err, pos, ctx) + posErr := config.PosError{Pos: pos, Err: err.Err} + return renderPosError(posErr, ctx) } return renderError(err.Err, ctx) } -func renderPosError(err error, pos config.Pos, ctx config.YamlCtx) string { +func renderPosError(err config.PosError, ctx config.YamlCtx) string { + pos := err.Pos line := pos.Line - 1 if line < 0 || line >= len(ctx.Lines) { return renderError(err, ctx) @@ -83,5 +87,5 @@ func renderPosError(err error, pos config.Pos, ctx config.YamlCtx) string { arrow = spaces + "^" } - return fmt.Sprintf("%s\n%s%s\n%s", renderError(err, ctx), pref, ctx.Lines[line], arrow) + return fmt.Sprintf("%s\n%s%s\n%s", renderError(err.Err, ctx), pref, ctx.Lines[line], arrow) } diff --git a/pkg/config/errors.go b/pkg/config/errors.go index 18d7bb9be6..3694e831d1 100644 --- a/pkg/config/errors.go +++ b/pkg/config/errors.go @@ -35,6 +35,20 @@ func (e BpError) Unwrap() error { return e.Err } +// PosError is an error wrapper to augment Position +type PosError struct { + Pos Pos + Err error +} + +func (e PosError) Error() string { + return fmt.Sprintf("line %d column %d: %s", e.Pos.Line, e.Pos.Column, e.Err) +} + +func (e PosError) Unwrap() error { + return e.Err +} + // HintError wraps another error to suggest other values type HintError struct { Hint string diff --git a/pkg/config/path.go b/pkg/config/path.go index cf1a1df58b..c22de0aeb7 100644 --- a/pkg/config/path.go +++ b/pkg/config/path.go @@ -179,9 +179,6 @@ type outputPath struct { // Root is a starting point for creating a Blueprint Path var Root rootPath -// internalPath is to be used to report problems outside of Blueprint schema (e.g. 
YAML parsing error position) -var internalPath = mapPath[basePath]{basePath{nil, "__internal_path__"}} - func init() { initPath(&Root, nil, "") } diff --git a/pkg/config/path_test.go b/pkg/config/path_test.go index adc702f666..b17a0dc219 100644 --- a/pkg/config/path_test.go +++ b/pkg/config/path_test.go @@ -72,9 +72,6 @@ func TestPath(t *testing.T) { {r.Backend.Type, "terraform_backend_defaults.type"}, {r.Backend.Configuration, "terraform_backend_defaults.configuration"}, {r.Backend.Configuration.Dot("goo"), "terraform_backend_defaults.configuration.goo"}, - - {internalPath, "__internal_path__"}, - {internalPath.Dot("a"), "__internal_path__.a"}, } for _, tc := range tests { t.Run(tc.want, func(t *testing.T) { @@ -103,8 +100,6 @@ func TestPathParent(t *testing.T) { {r.Vars.Dot("red").Cty(cp.IndexInt(6)), r.Vars.Dot("red")}, {r.Vars.Dot("red").Cty(cp.IndexInt(6).IndexString("gg")), r.Vars.Dot("red").Cty(cp.IndexInt(6))}, {r.Vars.Dot("red").Cty(cp.IndexInt(6).IndexString("gg").Index(cty.True)), r.Vars.Dot("red").Cty(cp.IndexInt(6))}, - {internalPath, nil}, - {internalPath.Dot("gold"), internalPath}, } for _, tc := range tests { t.Run(tc.p.String(), func(t *testing.T) { diff --git a/pkg/config/yaml.go b/pkg/config/yaml.go index 4d0fde2d7b..77f67c2d1a 100644 --- a/pkg/config/yaml.go +++ b/pkg/config/yaml.go @@ -68,15 +68,7 @@ func importBlueprint(f string) (Blueprint, YamlCtx, error) { var bp Blueprint if err = decoder.Decode(&bp); err != nil { - errs := Errors{} - for i, yep := range parseYamlV3Error(err) { - path := internalPath.Dot(fmt.Sprintf("bp_schema_error_%d", i)) - if yep.pos.Line != 0 { - yamlCtx.pathToPos[yPath(path.String())] = yep.pos - } - errs.At(path, errors.New(yep.errMsg)) - } - return Blueprint{}, yamlCtx, errs + return Blueprint{}, yamlCtx, parseYamlV3Error(err) } return bp, yamlCtx, nil } @@ -148,15 +140,7 @@ func NewYamlCtx(data []byte) (YamlCtx, error) { // error may happen if YAML is not valid, regardless of Blueprint schema if err := yaml.Unmarshal(data, &c); err != nil { - errs := Errors{} - for i, yep := range parseYamlV3Error(err) { - path := internalPath.Dot(fmt.Sprintf("yaml_error_%d", i)) - if yep.pos.Line != 0 { - m[yPath(path.String())] = yep.pos - } - errs.At(path, errors.New(yep.errMsg)) - } - return YamlCtx{m, lines}, errs + return YamlCtx{m, lines}, parseYamlV3Error(err) } var walk func(n *yaml.Node, p yPath, posOf *yaml.Node) @@ -186,6 +170,10 @@ func NewYamlCtx(data []byte) (YamlCtx, error) { type nodeCapturer struct{ n *yaml.Node } +func nodeToPosErr(n *yaml.Node, err error) PosError { + return PosError{Pos{Line: n.Line, Column: n.Column}, err} +} + func (c *nodeCapturer) UnmarshalYAML(n *yaml.Node) error { c.n = n return nil @@ -199,7 +187,7 @@ func (mk *ModuleKind) UnmarshalYAML(n *yaml.Node) error { mk.kind = kind return nil } - return fmt.Errorf("line %d: kind must be \"packer\" or \"terraform\" or removed from YAML", n.Line) + return nodeToPosErr(n, errors.New(`kind must be "packer" or "terraform" or removed from YAML`)) } // MarshalYAML implements a custom marshaler from ModuleKind to YAML string @@ -211,7 +199,7 @@ func (mk ModuleKind) MarshalYAML() (interface{}, error) { func (ms *ModuleIDs) UnmarshalYAML(n *yaml.Node) error { var ids []ModuleID if err := n.Decode(&ids); err != nil { - return fmt.Errorf("line %d: `use` must be a list of module ids", n.Line) + return nodeToPosErr(n, errors.New("`use` must be a list of module ids")) } *ms = ids return nil @@ -247,7 +235,7 @@ func (y *YamlValue) UnmarshalYAML(n *yaml.Node) error { case 
yaml.SequenceNode: err = y.unmarshalTuple(n) default: - err = fmt.Errorf("line %d: cannot decode node with unknown kind %d", n.Line, n.Kind) + err = nodeToPosErr(n, fmt.Errorf("cannot decode node with unknown kind %d", n.Kind)) } return err } @@ -259,7 +247,7 @@ func (y *YamlValue) unmarshalScalar(n *yaml.Node) error { } ty, err := gocty.ImpliedType(s) if err != nil { - return fmt.Errorf("line %d: %w", n.Line, err) + return nodeToPosErr(n, err) } v, err := gocty.ToCtyValue(s, ty) if err != nil { @@ -271,14 +259,14 @@ func (y *YamlValue) unmarshalScalar(n *yaml.Node) error { var e Expression if e, err = ParseExpression(l); err != nil { // TODO: point to exact location within expression, see Diagnostic.Subject - return fmt.Errorf("line %d: %w", n.Line, err) + return nodeToPosErr(n, err) } y.Wrap(e.AsValue()) } else if y.Unwrap().Type() == cty.String && hasVariable(y.Unwrap().AsString()) { // "simple" variable e, err := SimpleVarToExpression(y.Unwrap().AsString()) if err != nil { // TODO: point to exact location within expression, see Diagnostic.Subject - return fmt.Errorf("line %d: %w", n.Line, err) + return nodeToPosErr(n, err) } y.Wrap(e.AsValue()) } @@ -319,7 +307,7 @@ func (d *Dict) UnmarshalYAML(n *yaml.Node) error { } ty := v.Unwrap().Type() if !ty.IsObjectType() { - return fmt.Errorf("line %d: must be a mapping, got %s", n.Line, ty.FriendlyName()) + return nodeToPosErr(n, fmt.Errorf("must be a mapping, got %s", ty.FriendlyName())) } for k, w := range v.Unwrap().AsValueMap() { d.Set(k, w) @@ -350,40 +338,37 @@ func (d Dict) MarshalYAML() (interface{}, error) { return g, nil } -type yamlErrWithPos struct { - pos Pos - errMsg string -} - // yaml.v3 errors are either TypeError - collection of error message or single error message. // Parse error messages to extract short error message and position. -func parseYamlV3Error(err error) []yamlErrWithPos { - res := []yamlErrWithPos{} +func parseYamlV3Error(err error) error { + errs := Errors{} switch err := err.(type) { case *yaml.TypeError: for _, s := range err.Errors { - res = append(res, parseYamlV3ErrorString(s)) + errs.Add(parseYamlV3ErrorString(s)) } + case PosError: + errs.Add(err) default: - res = append(res, parseYamlV3ErrorString(err.Error())) + errs.Add(parseYamlV3ErrorString(err.Error())) } - if len(res) == 0 { // should never happen - res = append(res, parseYamlV3ErrorString(err.Error())) + if !errs.Any() { // should never happen + errs.Add(parseYamlV3ErrorString(err.Error())) } - return res + return errs } // parseYamlV3Error attempts to extract position and nice error message from yaml.v3 error message. // yaml.v3 errors are unstructured, use string parsing to extract information. -// If no position can be extracted, returns (Pos{}, error.Error()). -// Else returns (Pos{Line: line_number}, error_message). -func parseYamlV3ErrorString(s string) yamlErrWithPos { +// If no position can be extracted, returns error without position. +// Else returns PosError{Pos{Line: line_number}, error_message}. 
+func parseYamlV3ErrorString(s string) error { match := regexp.MustCompile(`^(yaml: )?(line (\d+): )?(.*)$`).FindStringSubmatch(s) if match == nil { - return yamlErrWithPos{Pos{}, s} + return errors.New(s) } lns, errMsg := match[3], match[4] ln, _ := strconv.Atoi(lns) // Atoi returns 0 on error, which is fine here - return yamlErrWithPos{Pos{Line: ln}, errMsg} + return PosError{Pos{Line: ln}, errors.New(errMsg)} } diff --git a/pkg/config/yaml_test.go b/pkg/config/yaml_test.go index 75aa8242c7..46b704fcb8 100644 --- a/pkg/config/yaml_test.go +++ b/pkg/config/yaml_test.go @@ -260,7 +260,7 @@ func TestDictWrongTypeUnmarshalYAML(t *testing.T) { if err == nil { t.Errorf("expected error, got nil") } - if diff := cmp.Diff(err.Error(), "line 2: must be a mapping, got number"); diff != "" { + if diff := cmp.Diff(err.Error(), "line 2 column 1: must be a mapping, got number"); diff != "" { t.Errorf("diff (-want +got):\n%s", diff) } } diff --git a/pkg/modulereader/resreader.go b/pkg/modulereader/resreader.go index be054563a6..141157d16f 100644 --- a/pkg/modulereader/resreader.go +++ b/pkg/modulereader/resreader.go @@ -48,6 +48,7 @@ type OutputInfo struct { // UnmarshalYAML supports parsing YAML OutputInfo fields as a simple list of // strings or as a list of maps directly into OutputInfo struct +// TODO: unmarshal logic shouldn't be defined in this package, move to pkg/config func (mo *OutputInfo) UnmarshalYAML(value *yaml.Node) error { var name string const yamlErrorMsg string = "block beginning at line %d: %s" From 1b6df57a07b2bd926f87bb9d100857489c23245c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 11 Jan 2024 22:02:59 +0000 Subject: [PATCH 046/151] Bump jinja2 from 3.1.2 to 3.1.3 in /community/front-end/ofe Bumps [jinja2](https://github.com/pallets/jinja) from 3.1.2 to 3.1.3. - [Release notes](https://github.com/pallets/jinja/releases) - [Changelog](https://github.com/pallets/jinja/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/jinja/compare/3.1.2...3.1.3) --- updated-dependencies: - dependency-name: jinja2 dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 9624308827..1510010985 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -46,7 +46,7 @@ identify==2.5.24 idna==3.4 importlib-resources==6.1.1 isort==5.12.0 -Jinja2==3.1.2 +Jinja2==3.1.3 jsonschema==4.20.0 jsonschema-specifications==2023.11.1 lazy-object-proxy==1.9.0 From d2f52ecfa97ed885bbc5d18e4718fbaa5949f3fd Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 11 Jan 2024 18:18:58 -0800 Subject: [PATCH 047/151] Address #2120: fail on bad state and succeed on reinstall --- .../files/install_monitoring_agent.sh | 20 +++++++++++++++---- modules/scripts/startup-script/main.tf | 4 ++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/modules/scripts/startup-script/files/install_monitoring_agent.sh b/modules/scripts/startup-script/files/install_monitoring_agent.sh index 2a48f310f5..569ed478c6 100644 --- a/modules/scripts/startup-script/files/install_monitoring_agent.sh +++ b/modules/scripts/startup-script/files/install_monitoring_agent.sh @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +set -e -o pipefail LEGACY_MONITORING_PACKAGE='stackdriver-agent' LEGACY_MONITORING_SCRIPT_URL='https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh' @@ -21,7 +22,7 @@ LEGACY_LOGGING_SCRIPT_URL='https://dl.google.com/cloudagents/add-logging-agent-r OPSAGENT_PACKAGE='google-cloud-ops-agent' OPSAGENT_SCRIPT_URL='https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh' -install_legacy="${1:-true}" +ops_or_legacy="${1:-legacy}" fail() { echo >&2 "[$(date +'%Y-%m-%dT%H:%M:%S%z')] $*" @@ -113,11 +114,22 @@ main() { fail "Unsupported platform." fi - if is_legacy_installed || is_opsagent_installed; then - fail "Legacy (stackdriver) or Ops Agent is already installed." + # Handle cases that agent is already installed + if [[ -z "$(is_legacy_monitoring_installed)" && -n $(is_legacy_logging_installed) ]] || + [[ -n "$(is_legacy_monitoring_installed)" && -z $(is_legacy_logging_installed) ]]; then + fail "Bad state: legacy agent is partially installed" + elif [[ "${ops_or_legacy}" == "legacy" ]] && is_legacy_installed; then + echo "Legacy agent is already installed" + exit 0 + elif [[ "${ops_or_legacy}" != "legacy" ]] && is_opsagent_installed; then + echo "Ops agent is already installed" + exit 0 + elif is_legacy_installed || is_opsagent_installed; then + fail "Agent is already installed but does not match requested agent of ${ops_or_legacy}" fi - if [[ "${install_legacy}" == true ]]; then + # install agent + if [[ "${ops_or_legacy}" == "legacy" ]]; then echo "Installing legacy monitoring agent (stackdriver)" install_stackdriver_agent else diff --git a/modules/scripts/startup-script/main.tf b/modules/scripts/startup-script/main.tf index dc6b07af2a..42cc8791ac 100644 --- a/modules/scripts/startup-script/main.tf +++ b/modules/scripts/startup-script/main.tf @@ -26,7 +26,7 @@ locals { type = "shell" source = "${path.module}/files/install_monitoring_agent.sh" destination = "install_monitoring_agent_automatic.sh" - args = var.install_cloud_ops_agent ? "false" : "true" # install legacy (stackdriver) + args = var.install_cloud_ops_agent ? "ops" : "legacy" # install legacy (stackdriver) }] : [] ) @@ -175,7 +175,7 @@ resource "google_storage_bucket_object" "scripts" { lifecycle { precondition { - condition = !var.install_cloud_ops_agent || !var.install_stackdriver_agent + condition = !(var.install_cloud_ops_agent && var.install_stackdriver_agent) error_message = "Only one of var.install_stackdriver_agent or var.install_cloud_ops_agent can be set. Stackdriver is recommended for best performance." } } From 19c45949ed244057319562511b465009e61fd882 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 11 Jan 2024 21:05:59 -0800 Subject: [PATCH 048/151] Fix rendering of "cobra" errors (#2130) --- cmd/root.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index 9583af298f..224776184a 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -250,7 +250,6 @@ func execPath() string { // NOTE: this function uses empty YamlCtx, so if you have one, use renderError directly. 
func checkErr(err error) { if err != nil { - msg := fmt.Sprintf("%s: %s", boldRed("Error"), renderError(err, config.YamlCtx{})) - logging.Fatal(msg) + logging.Fatal(renderError(err, config.YamlCtx{})) } } From c3746404ce4f78a3913a50c76ad80ef133ee6874 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 12 Jan 2024 10:14:05 -0800 Subject: [PATCH 049/151] Move `OFE venv` PR validation into separate trigger. (#2128) **Motivation**: * Reduce time it takes to run `PR-validation`; * Reduce noise in output of `PR-validation`. --- tools/cloud-build/README.md | 3 +- .../hpc-toolkit-pr-validation.yaml | 14 -------- tools/cloud-build/pr-ofe.yaml | 35 +++++++++++++++++++ tools/cloud-build/provision/README.md | 1 + tools/cloud-build/provision/pr-ofe.tf | 30 ++++++++++++++++ 5 files changed, 68 insertions(+), 15 deletions(-) create mode 100644 tools/cloud-build/pr-ofe.yaml create mode 100644 tools/cloud-build/provision/pr-ofe.tf diff --git a/tools/cloud-build/README.md b/tools/cloud-build/README.md index e55f5d77e7..468dcb96a2 100644 --- a/tools/cloud-build/README.md +++ b/tools/cloud-build/README.md @@ -11,9 +11,10 @@ * `Dockerfile`: Defines the HPC Toolkit docker image used in testing. * `hpc-toolkit-builder.yaml`: Cloud build config for running regular builds of the HPC Toolkit docker image. -* `hpc-toolkit-pr-validation.yaml`: Cloud build config for the PR validition +* `hpc-toolkit-pr-validation.yaml`: Cloud build config for the PR validation tests. The PR validation run `make tests` and validates against all pre-commits on all files. +* `pr-ofe.yaml`: Cloud build config for sanity test installing the OFE virtual environment. * `project-cleanup.yaml`: Cloud build config that performs a regular cleanup of resources in the test project. * `provision`: Terraform module that sets up CloudBuild triggers and schedule. diff --git a/tools/cloud-build/hpc-toolkit-pr-validation.yaml b/tools/cloud-build/hpc-toolkit-pr-validation.yaml index 2b03d031b0..ed49ae4b9c 100644 --- a/tools/cloud-build/hpc-toolkit-pr-validation.yaml +++ b/tools/cloud-build/hpc-toolkit-pr-validation.yaml @@ -50,20 +50,6 @@ steps: set -e export PROJECT=build-project time make tests -- id: ofe-virtual-env - waitFor: - - git-fetch-unshallow - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - '-c' - - | - set -e - python3 -m venv /opt/ofe - source /opt/ofe/bin/activate - pip install --upgrade pip - pip install --dry-run --no-cache-dir -r community/front-end/ofe/requirements.txt timeout: "1200s" options: machineType: N1_HIGHCPU_8 diff --git a/tools/cloud-build/pr-ofe.yaml b/tools/cloud-build/pr-ofe.yaml new file mode 100644 index 0000000000..eea5d375a3 --- /dev/null +++ b/tools/cloud-build/pr-ofe.yaml @@ -0,0 +1,35 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- + +steps: +- id: git-fetch-unshallow + name: gcr.io/cloud-builders/git + args: ['fetch', '--unshallow'] +- id: ofe-virtual-env + waitFor: [git-fetch-unshallow] + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + args: + - '-c' + - | + set -e + python3 -m venv /opt/ofe + source /opt/ofe/bin/activate + pip install --upgrade pip + pip install --dry-run --no-cache-dir -r community/front-end/ofe/requirements.txt +timeout: "1200s" +options: + machineType: N1_HIGHCPU_8 diff --git a/tools/cloud-build/provision/README.md b/tools/cloud-build/provision/README.md index 634c933f22..275d4939b4 100644 --- a/tools/cloud-build/provision/README.md +++ b/tools/cloud-build/provision/README.md @@ -48,6 +48,7 @@ When prompted for project, use integration test project. | [google_cloudbuild_trigger.daily_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.pr_go_build_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.pr_ofe_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | +| [google_cloudbuild_trigger.pr_ofe_venv](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.pr_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.pr_validation](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.weekly_build_dependency_check](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | diff --git a/tools/cloud-build/provision/pr-ofe.tf b/tools/cloud-build/provision/pr-ofe.tf new file mode 100644 index 0000000000..6663f265a6 --- /dev/null +++ b/tools/cloud-build/provision/pr-ofe.tf @@ -0,0 +1,30 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +resource "google_cloudbuild_trigger" "pr_ofe_venv" { + name = "PR-ofe-venv" + description = "Sanity test installing the OFE virtual environment" + + filename = "tools/cloud-build/pr-ofe.yaml" + + github { + owner = "GoogleCloudPlatform" + name = "hpc-toolkit" + pull_request { + branch = ".*" + comment_control = "COMMENTS_ENABLED_FOR_EXTERNAL_CONTRIBUTORS_ONLY" + } + } + include_build_logs = "INCLUDE_BUILD_LOGS_WITH_STATUS" +} From 1d1168267686113417469d4a1404b81ca5611b8a Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 12 Jan 2024 10:14:44 -0800 Subject: [PATCH 050/151] Make `cleanup_compute_nodes` `depends_on` on network (#2126) --- .../schedmd-slurm-gcp-v6-controller/controller.tf | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index a31c765135..9ef0b3f790 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -106,8 +106,7 @@ module "slurm_controller_instance" { static_ips = var.static_ips subnetwork = var.subnetwork_self_link zone = var.zone - - metadata = var.metadata + metadata = var.metadata depends_on = [ module.slurm_files, @@ -155,6 +154,11 @@ module "cleanup_compute_nodes" { slurm_cluster_name = local.slurm_cluster_name project_id = var.project_id when_destroy = true + + # Depend on controller network, as a best effort to avoid + # subnetwork resourceInUseByAnotherResource error + # NOTE: Can not use nodeset subnetworks as "A static list expression is required" + depends_on = [var.subnetwork_self_link] } From 7a33133e8ee581bdd1cc813abaaad547d849b01b Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Fri, 12 Jan 2024 17:28:16 +0000 Subject: [PATCH 051/151] Update spack wrf example and references to use Slurm V6 --- docs/tutorials/wrfv3/spack-wrfv3.md | 65 ++++++++++++--------------- docs/tutorials/wrfv3/spack-wrfv3.yaml | 52 +++++++++++---------- 2 files changed, 57 insertions(+), 60 deletions(-) diff --git a/docs/tutorials/wrfv3/spack-wrfv3.md b/docs/tutorials/wrfv3/spack-wrfv3.md index 275af22cff..db39790670 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.md +++ b/docs/tutorials/wrfv3/spack-wrfv3.md @@ -5,7 +5,7 @@ easy for customers to deploy HPC environments on Google Cloud. In this tutorial you will use the HPC Toolkit to: -* Deploy a [Slurm](https://github.com/SchedMD/slurm-gcp#readme) HPC cluster on +* Deploy a [Slurm](https://github.com/GoogleCloudPlatform/slurm-gcp#readme) HPC cluster on Google Cloud * Use [Spack](https://spack.io/) to install the Weather Research and Forecasting (WRF) Model application and all of its dependencies @@ -13,10 +13,10 @@ In this tutorial you will use the HPC Toolkit to: cluster * Tear down the cluster -Estimated time to complete: -The tutorial takes 2 hr. to complete, -of which 1.5 hr is for installing software -(without cache). +Estimated time to complete: +The tutorial takes 2 hr. to complete, +of which 1.5 hr is for installing software +(without cache). > **_NOTE:_** With a complete Spack cache, the tutorial takes 30 min. @@ -75,7 +75,7 @@ which should be open in the Cloud Shell Editor (on the left). This file describes the cluster you will deploy. 
It defines: -* the existing default network from your project +* a vpc network * a monitoring dashboard with metrics on your cluster * a definition of a custom Spack installation * a startup script that @@ -84,7 +84,6 @@ This file describes the cluster you will deploy. It defines: * sets up a Spack environment including downloading an example input deck * places a submission script on a shared drive * a Slurm cluster - * a Slurm login node * a Slurm controller * An auto-scaling Slurm partition @@ -106,24 +105,18 @@ contains the terraform needed to deploy your cluster. ## Deploy the Cluster -Use the following commands to run terraform and deploy your cluster. +Use below command to deploy your cluster. ```bash -terraform -chdir=spack-wrfv3/primary init -terraform -chdir=spack-wrfv3/primary apply +./ghpc deploy spack-wrfv3 ``` -The `terraform apply` command will generate a _plan_ that describes the Google -Cloud resources that will be deployed. - -You can review the plan and then start the deployment by typing -**`yes [enter]`**. - -The deployment will take about 30 seconds. There should be regular status updates -in the terminal. +You can also use below command to generate a plan that describes the Google Cloud resources that will be deployed. -If the `apply` is successful, a message similar to the following will be -displayed: +```bash +terraform -chdir=spack-wrfv3/primary init +terraform -chdir=spack-wrfv3/primary apply +``` @@ -144,30 +137,30 @@ controller. This command can be used to view progress and check for completion of the startup script: ```bash -gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project slurm-spack-wrfv3-controller | grep google_metadata_script_runner +gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project spackwrfv3-controller | grep google_metadata_script_runner ``` When the startup script has finished running you will see the following line as the final output from the above command: -> _`slurm-spack-wrfv3-controller google_metadata_script_runner: Finished running startup scripts.`_ +> _`spackwrfv3-controller google_metadata_script_runner: Finished running startup scripts.`_ Optionally while you wait, you can see your deployed VMs on Google Cloud Console. Open the link below in a new window. Look for -`slurm-spack-wrfv3-controller` and `slurm-spack-wrfv3-login0`. If you don't +`spackwrfv3-controller`. If you don't see your VMs make sure you have the correct project selected (top left). ```text https://console.cloud.google.com/compute?project= ``` -## Connecting to the login node +## Connecting to the controller node -Once the startup script has completed, connect to the login node. +Once the startup script has completed, connect to the controller node. -Use the following command to ssh into the login node from cloud shell: +Use the following command to ssh into the controller node from cloud shell: ```bash -gcloud compute ssh slurm-spack-wrfv3-login0 --zone us-central1-c --project +gcloud compute ssh spackwrfv3-controller --zone us-central1-c --project ``` You may be prompted to set up SSH. If so follow the prompts and if asked for a @@ -191,15 +184,15 @@ following instructions: https://console.cloud.google.com/compute?project= ``` -1. Click on the `SSH` button associated with the `slurm-spack-wrfv3-login0` +1. Click on the `SSH` button associated with the `spackwrfv3-controller` instance. This will open a separate pop up window with a terminal into our newly - created Slurm login VM. 
+ created Slurm controller VM. ## Run a Job on the Cluster - **The commands below should be run on the Slurm login node.** + **The commands below should be run on the Slurm controller node.** We will use the submission script (see line 122 of the blueprint) to submit a Weather Research and Forecasting (WRF) Model job. @@ -213,7 +206,7 @@ Weather Research and Forecasting (WRF) Model job. 2. Submit the job to Slurm to be scheduled: ```bash - sbatch /apps/wrfv3/submit_wrfv3.sh + sbatch /opt/apps/wrfv3/submit_wrfv3.sh ``` 3. Once submitted, you can watch the job progress by repeatedly calling the @@ -227,7 +220,7 @@ The `sbatch` command trigger Slurm to auto-scale up several nodes to run the job You can refresh the `Compute Engine` > `VM instances` page and see that additional VMs are being/have been created. These will be named something like -`slurm-spack-wrfv3-compute-0-0`. +`spackwrfv3-compute-0`. When running `squeue`, observe the job status start as `CF` (configuring), change to `R` (running) once the compute VMs have been created, and finally `CG` @@ -247,7 +240,7 @@ about 5 minutes to run. Several files will have been generated in the `test_run/` folder you created. The `rsl.out.0000` file has information on the run. You can view this file by -running the following command on the login node: +running the following command on the controller node: ```bash cat rsl.out.0000 @@ -268,9 +261,9 @@ https://console.cloud.google.com/monitoring/dashboards?project= **_NOTE:_** If you are accessing the login node terminal via a separate pop-up +> **_NOTE:_** If you are accessing the controller node terminal via a separate pop-up > then make sure to call `exit` in the pop-up window. ```bash @@ -280,7 +273,7 @@ exit Run the following command in the cloud shell terminal to destroy the cluster: ```bash -terraform -chdir=spack-wrfv3/primary destroy -auto-approve +./ghpc destroy spack-wrfv3 ``` When complete you should see something like: diff --git a/docs/tutorials/wrfv3/spack-wrfv3.yaml b/docs/tutorials/wrfv3/spack-wrfv3.yaml index 921669ce33..d9af2e8695 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.yaml +++ b/docs/tutorials/wrfv3/spack-wrfv3.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: modules/network/pre-existing-vpc + source: modules/network/vpc - id: hpc_dash source: modules/monitoring/dashboard @@ -35,8 +35,8 @@ deployment_groups: - id: spack-setup source: community/modules/scripts/spack-setup settings: - install_dir: /apps/spack - spack_ref: v0.19.0 + install_dir: /opt/apps/spack + spack_ref: v0.20.0 - id: spack-execute source: community/modules/scripts/spack-execute @@ -88,7 +88,7 @@ deployment_groups: # fi # spack buildcache keys --install --trust - spack config --scope defaults add config:build_stage:/apps/spack/spack-stage + spack config --scope defaults add config:build_stage:/opt/apps/spack/spack-stage spack config --scope defaults add -f /tmp/projections-config.yaml spack config --scope site add -f /tmp/slurm-external-config.yaml @@ -107,58 +107,62 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: + - type: shell + destination: remove_lustre_client.sh + content: | + #!/bin/bash + rm /etc/yum.repos.d/lustre-client.repo - $(spack-execute.spack_runner) - type: shell destination: wrfv3_setup.sh content: | #!/bin/bash - source /apps/spack/share/spack/setup-env.sh + source /opt/apps/spack/share/spack/setup-env.sh spack env activate wrfv3 - chmod -R a+rwX /apps/spack/var/spack/environments/wrfv3 - mkdir -p /apps/wrfv3 - 
chmod a+rwx /apps/wrfv3 - cd /apps/wrfv3 + chmod -R a+rwX /opt/apps/spack/var/spack/environments/wrfv3 + mkdir -p /opt/apps/wrfv3 + chmod a+rwx /opt/apps/wrfv3 + cd /opt/apps/wrfv3 wget --no-verbose https://www2.mmm.ucar.edu/wrf/bench/conus12km_v3911/bench_12km.tar.bz2 tar xjf bench_12km.tar.bz2 - type: data - destination: /apps/wrfv3/submit_wrfv3.sh + destination: /opt/apps/wrfv3/submit_wrfv3.sh content: | #!/bin/bash #SBATCH -N 2 #SBATCH --ntasks-per-node 30 - source /apps/spack/share/spack/setup-env.sh + source /opt/apps/spack/share/spack/setup-env.sh spack env activate wrfv3 # Check that wrf.exe exists which wrf.exe cd $SLURM_SUBMIT_DIR - cp /apps/wrfv3/bench_12km/* . + cp /opt/apps/wrfv3/bench_12km/* . WRF=`spack location -i wrf` ln -s $WRF/run/* . scontrol show hostnames ${SLURM_JOB_NODELIST} > hostfile mpirun -n 60 -hostfile hostfile -ppn ${SLURM_NTASKS_PER_NODE} wrf.exe + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] + settings: + node_count_dynamic_max: 20 + - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [compute_nodeset] settings: partition_name: compute - max_node_count: 20 - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network1 - compute_partition settings: + disable_controller_public_ips: false + controller_startup_scripts_timeout: 21600 controller_startup_script: $(controller-setup.startup_script) - login_node_count: 1 - - - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - slurm_controller From 9b91f43bbfad7f946a3f93747543981669590d04 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Fri, 12 Jan 2024 04:43:09 +0000 Subject: [PATCH 052/151] Update spack openfoam example to use /opt/apps directory --- docs/tutorials/openfoam/spack-openfoam.md | 14 +++++++------- docs/tutorials/openfoam/spack-openfoam.yaml | 11 +++++------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/docs/tutorials/openfoam/spack-openfoam.md b/docs/tutorials/openfoam/spack-openfoam.md index 4342ab8a5e..2fc4d51387 100644 --- a/docs/tutorials/openfoam/spack-openfoam.md +++ b/docs/tutorials/openfoam/spack-openfoam.md @@ -75,7 +75,7 @@ which should be open in the Cloud Shell Editor (on the left). This file describes the cluster you will deploy. It defines: -* the existing default network from your project +* a vpc network * a monitoring dashboard with metrics on your cluster * a definition of a custom Spack installation * a startup script that @@ -135,16 +135,16 @@ controller. 
This command can be used to view progress and check for completion of the startup script: ```bash -gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project slurm-spack-openfoam-controller | grep google_metadata_script_runner +gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project spackopenf-controller | grep google_metadata_script_runner ``` When the startup script has finished running you will see the following line as the final output from the above command: -> _`slurm-spack-openfoam-controller google_metadata_script_runner: Finished running startup scripts.`_ +> _`spackopenf-controller google_metadata_script_runner: Finished running startup scripts.`_ Optionally while you wait, you can see your deployed VMs on Google Cloud Console. Open the link below in a new window. Look for -`slurm-spack-openfoam-controller`. If you don't +`spackopenf-controller`. If you don't see your VMs make sure you have the correct project selected (top left). ```text @@ -204,7 +204,7 @@ OpenFOAM job. 2. Submit the job to Slurm to be scheduled: ```bash - sbatch /apps/openfoam/submit_openfoam.sh + sbatch /opt/apps/openfoam/submit_openfoam.sh ``` 3. Once submitted, you can watch the job progress by repeatedly calling the @@ -218,7 +218,7 @@ The `sbatch` command trigger Slurm to auto-scale up several nodes to run the job You can refresh the `Compute Engine` > `VM instances` page and see that additional VMs are being/have been created. These will be named something like -`slurm-spack-openfoam-compute-0-0`. +`spackopenf-comput-0`. When running `squeue`, observe the job status start as `CF` (configuring), change to `R` (running) once the compute VMs have been created, and finally `CG` @@ -271,7 +271,7 @@ exit Run the following command in the cloud shell terminal to destroy the cluster: ```bash -./ghpc deploy spack-openfoam +./ghpc destroy spack-openfoam ``` When complete you should see something like: diff --git a/docs/tutorials/openfoam/spack-openfoam.yaml b/docs/tutorials/openfoam/spack-openfoam.yaml index c15851fe17..bd2ec7dc70 100644 --- a/docs/tutorials/openfoam/spack-openfoam.yaml +++ b/docs/tutorials/openfoam/spack-openfoam.yaml @@ -35,7 +35,7 @@ deployment_groups: - id: spack-setup source: community/modules/scripts/spack-setup settings: - install_dir: /apps/spack + install_dir: /opt/apps/spack spack_ref: v0.20.0 - id: spack-execute @@ -95,7 +95,7 @@ deployment_groups: # fi # spack buildcache keys --install --trust - spack config --scope defaults add config:build_stage:/apps/spack/spack-stage + spack config --scope defaults add config:build_stage:/opt/apps/spack/spack-stage spack config --scope defaults add -f /tmp/projections-config.yaml spack config --scope site add -f /tmp/slurm-external-config.yaml @@ -124,17 +124,16 @@ deployment_groups: destination: setup_openfoam.sh content: | #!/bin/bash - source /apps/spack/share/spack/setup-env.sh + source /opt/apps/spack/share/spack/setup-env.sh spack env activate openfoam - chmod -R a+rwX /apps/spack/var/spack/environments/openfoam - type: data - destination: /apps/openfoam/submit_openfoam.sh + destination: /opt/apps/openfoam/submit_openfoam.sh content: | #!/bin/bash #SBATCH -N 2 #SBATCH --ntasks-per-node 30 - source /apps/spack/share/spack/setup-env.sh + source /opt/apps/spack/share/spack/setup-env.sh spack env activate openfoam cd $SLURM_SUBMIT_DIR From aa06799800629991b1c4b4651765f3adbf7fb788 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 13 Jan 2024 22:35:14 -0800 Subject: [PATCH 053/151] 
Improve readability of "required setting is missing" error (#2133) --- pkg/config/errors.go | 1 - pkg/config/expand.go | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pkg/config/errors.go b/pkg/config/errors.go index 3694e831d1..3af20a874d 100644 --- a/pkg/config/errors.go +++ b/pkg/config/errors.go @@ -163,7 +163,6 @@ const ( errMsgFileLoadError = string("failed to read the input yaml") errMsgYamlMarshalError = string("failed to export the configuration to a blueprint yaml file") errMsgYamlSaveError = string("failed to write the expanded yaml") - errMsgMissingSetting = string("a required setting is missing from a module") errMsgInvalidVar = string("invalid variable definition in") errMsgVarNotFound = string("could not find source of variable") errMsgIntergroupOrder = string("references to outputs from other groups must be to earlier groups") diff --git a/pkg/config/expand.go b/pkg/config/expand.go index f9f5a0c87e..3431b7917d 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -80,8 +80,7 @@ func validateModuleInputs(mp ModulePath, m Module, bp Blueprint) error { if !m.Settings.Has(input.Name) { if input.Required { - errs.At(ip, fmt.Errorf("%s: Module ID: %s Setting: %s", - errMsgMissingSetting, m.ID, input.Name)) + errs.At(ip, fmt.Errorf("a required setting %q is missing from a module %q", input.Name, m.ID)) } continue } From 99ed6bb667fca9aaf9a1a3558acd8cce5d7d7435 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Mon, 15 Jan 2024 13:58:14 +0000 Subject: [PATCH 054/151] GKE controller node pool extra features --- .../modules/scheduler/gke-cluster/README.md | 3 +++ .../modules/scheduler/gke-cluster/main.tf | 7 ++++--- .../scheduler/gke-cluster/variables.tf | 19 +++++++++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/community/modules/scheduler/gke-cluster/README.md b/community/modules/scheduler/gke-cluster/README.md index a5a3b6dff0..e90db0d7e8 100644 --- a/community/modules/scheduler/gke-cluster/README.md +++ b/community/modules/scheduler/gke-cluster/README.md @@ -140,7 +140,10 @@ limitations under the License. | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the system node pool. | `set(string)` |
<pre>[<br>  "https://www.googleapis.com/auth/cloud-platform"<br>]</pre>
| no | | [services\_ip\_range\_name](#input\_services\_ip\_range\_name) | The name of the secondary subnet range to use for services. | `string` | `"services"` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork to host the cluster in. | `string` | n/a | yes | +| [system\_node\_pool\_enable\_secure\_boot](#input\_system\_node\_pool\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [system\_node\_pool\_enabled](#input\_system\_node\_pool\_enabled) | Create a system node pool. | `bool` | `true` | no | +| [system\_node\_pool\_image\_type](#input\_system\_node\_pool\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | +| [system\_node\_pool\_kubernetes\_labels](#input\_system\_node\_pool\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [system\_node\_pool\_machine\_type](#input\_system\_node\_pool\_machine\_type) | Machine type for the system node pool. | `string` | `"e2-standard-4"` | no | | [system\_node\_pool\_name](#input\_system\_node\_pool\_name) | Name of the system node pool. | `string` | `"system"` | no | | [system\_node\_pool\_node\_count](#input\_system\_node\_pool\_node\_count) | The total min and max nodes to be maintained in the system node pool. |
<pre>object({<br>    total_min_nodes = number<br>    total_max_nodes = number<br>  })</pre> | <pre>{<br>  "total_max_nodes": 10,<br>  "total_min_nodes": 2<br>}</pre>
| no | diff --git a/community/modules/scheduler/gke-cluster/main.tf b/community/modules/scheduler/gke-cluster/main.tf index 5237f94b71..54d3f271c1 100644 --- a/community/modules/scheduler/gke-cluster/main.tf +++ b/community/modules/scheduler/gke-cluster/main.tf @@ -195,6 +195,7 @@ resource "google_container_node_pool" "system_node_pools" { } node_config { + labels = var.system_node_pool_kubernetes_labels resource_labels = local.labels service_account = var.service_account_email oauth_scopes = var.service_account_scopes @@ -209,15 +210,15 @@ resource "google_container_node_pool" "system_node_pools" { # # We use COS_CONTAINERD to be compatible with (optional) gVisor. # https://cloud.google.com/kubernetes-engine/docs/how-to/sandbox-pods - image_type = "COS_CONTAINERD" + image_type = var.system_node_pool_image_type shielded_instance_config { - enable_secure_boot = true + enable_secure_boot = var.system_node_pool_enable_secure_boot enable_integrity_monitoring = true } gvnic { - enabled = true + enabled = var.system_node_pool_image_type == "COS_CONTAINERD" } # Implied by Workload Identity diff --git a/community/modules/scheduler/gke-cluster/variables.tf b/community/modules/scheduler/gke-cluster/variables.tf index 4b12f14852..5ace7cae91 100644 --- a/community/modules/scheduler/gke-cluster/variables.tf +++ b/community/modules/scheduler/gke-cluster/variables.tf @@ -171,6 +171,25 @@ variable "system_node_pool_taints" { }] } +variable "system_node_pool_kubernetes_labels" { + description = <<-EOT + Kubernetes labels to be applied to each node in the node group. Key-value pairs. + (The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) + EOT + type = map(string) + default = null +} +variable "system_node_pool_image_type" { + description = "The default image type used by NAP once a new node pool is being created. Use either COS_CONTAINERD or UBUNTU_CONTAINERD." + type = string + default = "COS_CONTAINERD" +} +variable "system_node_pool_enable_secure_boot" { + description = "Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info." + type = bool + default = true +} + variable "enable_private_nodes" { description = "(Beta) Whether nodes have internal IP addresses only." type = bool From 061391c31394031d671a66265e853f57961e92c0 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 16 Jan 2024 09:02:14 -0600 Subject: [PATCH 055/151] Improve MIG replacement policies for HTCondor Central Managers Set the MIG replacement policy to PROACTIVE by default for Central Managers. This ensures that configuration changes are propagated by a terraform apply which updates the HTCondor configuration. This is safe for Central Managers because they recover state dynamically through periodic API calls to the rest of the cluster. Document the alternative of OPPORTUNISTIC updates and how to manually trigger a MIG replacement. 
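
For operators who want to keep the previous behavior, the new variable can
pin the policy back to opportunistic replacement. A minimal illustrative
blueprint stanza (the module id and the entries under `use` below are
placeholders, not part of this change):

    - id: htcondor_cm
      source: community/modules/scheduler/htcondor-central-manager
      use: [network1, htcondor_secrets, htcondor_setup]
      settings:
        update_policy: OPPORTUNISTIC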
--- .../scheduler/htcondor-central-manager/README.md | 14 ++++++++++---- .../scheduler/htcondor-central-manager/main.tf | 6 +++--- .../htcondor-central-manager/variables.tf | 10 ++++++++++ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index 57186e9ea8..60d50a43df 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -31,9 +31,14 @@ A regional [MIG][mig] is used to provision the central manager, although only in any of the zones available in that region, however, it can be constrained to run in fewer zones (or a single zone) using [var.zones](#input_zones). -The VM replacement policy is set to [opportunistic]. In practice, this means -that an active VM will not be replaced by Terraform actions, but may be -replaced when either: +By default, the VM replacement policy is set to [proactive]. In practice, this +means that the Central Manager will be replaced by Terraform when changes to +the instance template / HTCondor configuration are made. The Central Manager is +safe to replace automatically as it gathers its state information from periodic +messages exchanged with the rest of the HTCondor pool. + +This mode can be switched to "OPPORTUNISTIC" by setting [var.update_policy][#input_update_policy]. +In this case, the Central Manager will be replaced only when: - intentionally by issuing an update via Cloud Console or using gcloud (below) - the VM becomes unhealthy or is otherwise automatically replaced (e.g. regular @@ -47,7 +52,7 @@ gcloud compute instance-groups managed update-instances \ --project <> --minimal-action replace ``` -[opportunistic]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type +[proactive]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type ## Limiting inter-zone egress @@ -135,6 +140,7 @@ limitations under the License. | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes by which to limit service account attached to central manager. | `set(string)` |
<pre>[<br>  "https://www.googleapis.com/auth/cloud-platform"<br>]</pre>
| no | | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance (must set var.enabled\_shielded\_vm) |
<pre>object({<br>    enable_secure_boot = bool<br>    enable_vtpm = bool<br>    enable_integrity_monitoring = bool<br>  })</pre> | <pre>{<br>  "enable_integrity_monitoring": true,<br>  "enable_secure_boot": true,<br>  "enable_vtpm": true<br>}</pre>
| no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork in which the HTCondor central manager will be created. | `string` | `null` | no | +| [update\_policy](#input\_update\_policy) | Replacement policy for Central Manager ("PROACTIVE" to replace immediately or "OPPORTUNISTIC" to replace upon instance power cycle). | `string` | `"PROACTIVE"` | no | | [zones](#input\_zones) | Zone(s) in which central manager may be created. If not supplied, will default to all zones in var.region. | `list(string)` | `[]` | no | ## Outputs diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index 66d78c4059..b4c03acf77 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -177,14 +177,14 @@ module "htcondor_cm" { update_policy = [{ instance_redistribution_type = "NONE" - replacement_method = "SUBSTITUTE" - max_surge_fixed = length(local.zones) + replacement_method = "RECREATE" # preserves hostnames (necessary for PROACTIVE replacement) + max_surge_fixed = 0 # must be 0 to preserve hostnames max_unavailable_fixed = length(local.zones) max_surge_percent = null max_unavailable_percent = null min_ready_sec = 300 minimal_action = "REPLACE" - type = "OPPORTUNISTIC" + type = var.update_policy }] stateful_ips = [{ diff --git a/community/modules/scheduler/htcondor-central-manager/variables.tf b/community/modules/scheduler/htcondor-central-manager/variables.tf index 885df7567b..b99d29c779 100644 --- a/community/modules/scheduler/htcondor-central-manager/variables.tf +++ b/community/modules/scheduler/htcondor-central-manager/variables.tf @@ -169,3 +169,13 @@ variable "shielded_instance_config" { enable_integrity_monitoring = true } } + +variable "update_policy" { + description = "Replacement policy for Central Manager (\"PROACTIVE\" to replace immediately or \"OPPORTUNISTIC\" to replace upon instance power cycle)." + type = string + default = "PROACTIVE" + validation { + condition = contains(["PROACTIVE", "OPPORTUNISTIC"], var.update_policy) + error_message = "Allowed string values for var.update_policy are \"PROACTIVE\" or \"OPPORTUNISTIC\"." + } +} From aa101112f71963172aad30a1097296c6c64b2a18 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 16 Jan 2024 09:02:15 -0600 Subject: [PATCH 056/151] Improve MIG replacement policies for HTCondor Access Points Allow configuration of the MIG replacement policy for Access Points. Document the behavior of OPPORTUNISTIC updates and how to manually trigger a MIG replacement or set to the alternative of PROACTIVE replacements. --- .../scheduler/htcondor-access-point/README.md | 58 +++++++++++++++++++ .../scheduler/htcondor-access-point/main.tf | 6 +- .../htcondor-access-point/variables.tf | 10 ++++ 3 files changed, 71 insertions(+), 3 deletions(-) diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index c3711154ab..3a2acac2cc 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -27,6 +27,63 @@ the functionality in these references. 
Their usage is demonstrated in the [htcondor-pool-secrets]: ../htcondor-pool-secrets/README.md [IDTOKEN]: https://htcondor.readthedocs.io/en/latest/admin-manual/security.html#introducing-idtokens +## Behavior of Managed Instance Group (MIG) + +A regional [MIG][mig] is used to provision the Access Point, although only +1 node will ever be active at a time. By default, the node will be provisioned +in any of the zones available in that region, however, it can be constrained to +run in fewer zones (or a single zone) using [var.zones](#input_zones). + +By default, the VM replacement policy is set to [opportunistic]. In practice, +this means that the Access Point will _NOT_ be automatically replaced by +Terraform when changes to the instance template / HTCondor configuration are +made. The Access Point is _NOT_ safe to replace automatically as its local storage +contains the state of the job queue. By default, the Access Point will be +replaced only when: + +- intentionally by issuing an update via Cloud Console or using gcloud (below) +- the VM becomes unhealthy or is otherwise automatically replaced (e.g. regular + Google Cloud maintenance) + +For example, to manually update all instances in a MIG: + +```text +gcloud compute instance-groups managed update-instances \ + <> --all-instances --region <> \ + --project <> --minimal-action replace +``` + +This mode can be switched to "PROACTIVE" (automatic) replacement by setting +[var.update_policy][#input_update_policy]. In this case we recommend the use of +Filestore to store the job queue state ("spool") and setting +[var.spool_parent_dir][#input_spool_parent_dir] to its mount point: + +```yaml + - id: spoolfs + source: modules/file-system/filestore + use: + - network1 + settings: + filestore_tier: ENTERPRISE + local_mount: /shared + +... + + - id: htcondor_access + source: community/modules/scheduler/htcondor-access-point + use: + - network1 + - spoolfs + - htcondor_secrets + - htcondor_setup + - htcondor_cm + - htcondor_execute_point_group + settings: + spool_parent_dir: /shared +``` + +[opportunistic]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type + Copyright 2023 Google LLC @@ -106,6 +163,7 @@ limitations under the License. | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance (must set var.enabled\_shielded\_vm) |
<pre>object({<br>    enable_secure_boot = bool<br>    enable_vtpm = bool<br>    enable_integrity_monitoring = bool<br>  })</pre> | <pre>{<br>  "enable_integrity_monitoring": true,<br>  "enable_secure_boot": true,<br>  "enable_vtpm": true<br>}</pre>
| no | | [spool\_parent\_dir](#input\_spool\_parent\_dir) | HTCondor access point configuration SPOOL will be set to subdirectory named "spool" | `string` | `"/var/lib/condor"` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork in which the HTCondor central manager will be created. | `string` | `null` | no | +| [update\_policy](#input\_update\_policy) | Replacement policy for Access Point Managed Instance Group ("PROACTIVE" to replace immediately or "OPPORTUNISTIC" to replace upon instance power cycle) | `string` | `"OPPORTUNISTIC"` | no | | [zones](#input\_zones) | Zone(s) in which access point may be created. If not supplied, will default to all zones in var.region. | `list(string)` | `[]` | no | ## Outputs diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 888fa44cd6..94da6e7399 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -210,14 +210,14 @@ module "htcondor_ap" { update_policy = [{ instance_redistribution_type = "NONE" - replacement_method = "SUBSTITUTE" - max_surge_fixed = length(local.zones) + replacement_method = "RECREATE" # preserves hostnames (necessary for PROACTIVE replacement) + max_surge_fixed = 0 # must be 0 to preserve hostnames max_unavailable_fixed = length(local.zones) max_surge_percent = null max_unavailable_percent = null min_ready_sec = 300 minimal_action = "REPLACE" - type = "OPPORTUNISTIC" + type = var.update_policy }] stateful_ips = [{ diff --git a/community/modules/scheduler/htcondor-access-point/variables.tf b/community/modules/scheduler/htcondor-access-point/variables.tf index 3f80cb6afd..292596f672 100644 --- a/community/modules/scheduler/htcondor-access-point/variables.tf +++ b/community/modules/scheduler/htcondor-access-point/variables.tf @@ -216,3 +216,13 @@ variable "shielded_instance_config" { enable_integrity_monitoring = true } } + +variable "update_policy" { + description = "Replacement policy for Access Point Managed Instance Group (\"PROACTIVE\" to replace immediately or \"OPPORTUNISTIC\" to replace upon instance power cycle)" + type = string + default = "OPPORTUNISTIC" + validation { + condition = contains(["PROACTIVE", "OPPORTUNISTIC"], var.update_policy) + error_message = "Allowed string values for var.update_policy are \"PROACTIVE\" or \"OPPORTUNISTIC\"." + } +} From f7d8708629358aafd0ee9fd8c89f9c2d7dc87b54 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 16 Jan 2024 09:02:15 -0600 Subject: [PATCH 057/151] Improve MIG replacement policies for HTCondor Execute Points Continue using the default of OPPORTUNISTIC replacement of Execute Point VMs so that they are (typically) replaced when a job becomes idle. Strongly recommend this setting in the documentation but discuss the alternative of PROACTIVE or manually issuing updates via gcloud. --- .../compute/htcondor-execute-point/README.md | 32 +++++++++++++++++++ .../compute/htcondor-execute-point/main.tf | 2 +- .../htcondor-execute-point/variables.tf | 10 ++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index c381921b1d..44394e4380 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -127,6 +127,37 @@ the University of Wisconsin-Madison. 
Support for HTCondor is available via: [chtc]: https://chtc.cs.wisc.edu/ +## Behavior of Managed Instance Group (MIG) + +Regional [MIGs][mig] are used to provision Execute Points. By default, VMs +will be provisioned in any of the zones available in that region, however, it +can be constrained to run in fewer zones (or a single zone) using +[var.zones](#input_zones). + +By default, the VM replacement policy is set to [opportunistic]. In practice, +this means that the Execute Points will _NOT_ be automatically replaced by +Terraform when changes to the instance template / HTCondor configuration are +made. We recommend leaving this at the default value as it will allow the +HTCondor autoscaler to replace VMs when they become idle without disrupting +running jobs. + +However, if it is desired [var.update_policy][#input_update_policy] can be set +to "PROACTIVE" to enable automatic replacement. This will disrupt running jobs +and send them back to the queue. Alternatively, one can leave the setting at +"OPPORTUNISTIC" and update: + +- intentionally by issuing an update via Cloud Console or using gcloud (below) +- VMs becomes unhealthy or are otherwise automatically replaced (e.g. regular + Google Cloud maintenance) + +For example, to manually update all instances in a MIG: + +```text +gcloud compute instance-groups managed update-instances \ + <> --all-instances --region <> \ + --project <> --minimal-action replace +``` + ## Known Issues When using OS Login with "external users" (outside of the Google Cloud @@ -217,6 +248,7 @@ limitations under the License. | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork HTCondor execute points will join | `string` | `null` | no | | [target\_size](#input\_target\_size) | Initial size of the HTCondor execute point pool; set to null (default) to avoid Terraform management of size. | `number` | `null` | no | +| [update\_policy](#input\_update\_policy) | Replacement policy for Access Point Managed Instance Group ("PROACTIVE" to replace immediately or "OPPORTUNISTIC" to replace upon instance power cycle) | `string` | `"OPPORTUNISTIC"` | no | | [windows\_startup\_ps1](#input\_windows\_startup\_ps1) | Startup script to run at boot-time for Windows-based HTCondor execute points | `list(string)` | `[]` | no | | [zones](#input\_zones) | Zone(s) in which execute points may be created. If not supplied, will default to all zones in var.region. 
| `list(string)` | `[]` | no |

diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf
index b7bc66442f..c6abbbf8b8 100644
--- a/community/modules/compute/htcondor-execute-point/main.tf
+++ b/community/modules/compute/htcondor-execute-point/main.tf
@@ -199,7 +199,7 @@ module "mig" {
     max_unavailable_percent = null
     min_ready_sec           = 300
     minimal_action          = "REPLACE"
-    type                    = "OPPORTUNISTIC"
+    type                    = var.update_policy
   }]
 }

diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf
index 178c786c26..75fde2b84c 100644
--- a/community/modules/compute/htcondor-execute-point/variables.tf
+++ b/community/modules/compute/htcondor-execute-point/variables.tf
@@ -236,3 +236,13 @@ variable "shielded_instance_config" {
     enable_integrity_monitoring = true
   }
 }
+
+variable "update_policy" {
+  description = "Replacement policy for Access Point Managed Instance Group (\"PROACTIVE\" to replace immediately or \"OPPORTUNISTIC\" to replace upon instance power cycle)"
+  type        = string
+  default     = "OPPORTUNISTIC"
+  validation {
+    condition     = contains(["PROACTIVE", "OPPORTUNISTIC"], var.update_policy)
+    error_message = "Allowed string values for var.update_policy are \"PROACTIVE\" or \"OPPORTUNISTIC\"."
+  }
+}

From f7d8708629358aafd0ee9fd8c89f9c2d7dc87b54 Mon Sep 17 00:00:00 2001
From: Tom Downes
Date: Tue, 16 Jan 2024 09:43:46 -0600
Subject: [PATCH 058/151] Fix HTCondor Windows URI for latest 23.0 LTS release

---
 .../htcondor-install/templates/install-htcondor.ps1.tftpl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl b/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl
index 8ec2701c3f..54b6d35bba 100644
--- a/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl
+++ b/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl
@@ -15,7 +15,7 @@ Remove-Item "$runtime_installer"

 # download HTCondor installer
 $htcondor_installer = 'C:\htcondor.msi'
-%{ if condor_version == "10.*" }
+%{ if condor_version == "23.*" }
 Invoke-WebRequest https://research.cs.wisc.edu/htcondor/tarball/23.0/current/condor-Windows-x64.msi -OutFile "$htcondor_installer"
 %{ else ~}
 Invoke-WebRequest https://research.cs.wisc.edu/htcondor/tarball/23.0/${condor_version}/release/condor-${condor_version}-Windows-x64.msi -OutFile "$htcondor_installer"

From dbdca702efae96a432ae779611f1d0cfd2103874 Mon Sep 17 00:00:00 2001
From: Tom Downes
Date: Tue, 16 Jan 2024 10:20:29 -0600
Subject: [PATCH 059/151] Address feedback from #2140 for README formatting

---
 .../compute/htcondor-execute-point/README.md  | 21 +++++++++-------
 .../scheduler/htcondor-access-point/README.md | 24 ++++++++++---------
 .../htcondor-central-manager/README.md        | 22 ++++++++++-------
 3 files changed, 38 insertions(+), 29 deletions(-)

diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md
index 44394e4380..fc73008151 100644
--- a/community/modules/compute/htcondor-execute-point/README.md
+++ b/community/modules/compute/htcondor-execute-point/README.md
@@ -134,17 +134,18 @@ will be provisioned in any of the zones available in that region, however, it
 can be constrained to run in fewer zones (or a single zone) using
 [var.zones](#input_zones). 
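+
+As a sketch of how zone pinning might look in a blueprint (the module id,
+the entries under `use`, and the zone value are illustrative placeholders):
+
+```yaml
+  - id: htcondor_execute_point
+    source: community/modules/compute/htcondor-execute-point
+    use: [network1, htcondor_setup, htcondor_cm]
+    settings:
+      zones: [us-central1-c]
+```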
-By default, the VM replacement policy is set to [opportunistic]. In practice, -this means that the Execute Points will _NOT_ be automatically replaced by -Terraform when changes to the instance template / HTCondor configuration are -made. We recommend leaving this at the default value as it will allow the -HTCondor autoscaler to replace VMs when they become idle without disrupting -running jobs. - -However, if it is desired [var.update_policy][#input_update_policy] can be set +When the configuration of an Execute Point is changed, the MIG can be configured +to [replace the VM][replacement] using a "proactive" or "opportunistic" policy. +By default, the policy is set to opportunistic. In practice, this means that +Execute Points will _NOT_ be automatically replaced by Terraform when changes to +the instance template / HTCondor configuration are made. We recommend leaving +this at the default value as it will allow the HTCondor autoscaler to replace +VMs when they become idle without disrupting running jobs. + +However, if it is desired [var.update_policy](#input_update_policy) can be set to "PROACTIVE" to enable automatic replacement. This will disrupt running jobs and send them back to the queue. Alternatively, one can leave the setting at -"OPPORTUNISTIC" and update: +the default value of "OPPORTUNISTIC" and update: - intentionally by issuing an update via Cloud Console or using gcloud (below) - VMs becomes unhealthy or are otherwise automatically replaced (e.g. regular @@ -158,6 +159,8 @@ gcloud compute instance-groups managed update-instances \ --project <> --minimal-action replace ``` +[replacement]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type + ## Known Issues When using OS Login with "external users" (outside of the Google Cloud diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index 3a2acac2cc..05bec0c2ba 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -34,12 +34,14 @@ A regional [MIG][mig] is used to provision the Access Point, although only in any of the zones available in that region, however, it can be constrained to run in fewer zones (or a single zone) using [var.zones](#input_zones). -By default, the VM replacement policy is set to [opportunistic]. In practice, -this means that the Access Point will _NOT_ be automatically replaced by -Terraform when changes to the instance template / HTCondor configuration are -made. The Access Point is _NOT_ safe to replace automatically as its local storage -contains the state of the job queue. By default, the Access Point will be -replaced only when: +When the configuration of the Central Manager is changed, the MIG can be +configured to [replace the VM][replacement] using a "proactive" or +"opportunistic" policy. By default, the Access Point replacement policy is +opportunistic. In practice, this means that the Access Point will _NOT_ be +automatically replaced by Terraform when changes to the instance template / +HTCondor configuration are made. The Access Point is _NOT_ safe to replace +automatically as its local storage contains the state of the job queue. By +default, the Access Point will be replaced only when: - intentionally by issuing an update via Cloud Console or using gcloud (below) - the VM becomes unhealthy or is otherwise automatically replaced (e.g. 
regular @@ -53,10 +55,10 @@ gcloud compute instance-groups managed update-instances \ --project <> --minimal-action replace ``` -This mode can be switched to "PROACTIVE" (automatic) replacement by setting -[var.update_policy][#input_update_policy]. In this case we recommend the use of -Filestore to store the job queue state ("spool") and setting -[var.spool_parent_dir][#input_spool_parent_dir] to its mount point: +This mode can be switched to proactive (automatic) replacement by setting +[var.update_policy](#input_update_policy) to "PROACTIVE". In this case we +recommend the use of Filestore to store the job queue state ("spool") and +setting [var.spool_parent_dir][#input_spool_parent_dir] to its mount point: ```yaml - id: spoolfs @@ -82,7 +84,7 @@ Filestore to store the job queue state ("spool") and setting spool_parent_dir: /shared ``` -[opportunistic]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type +[replacement]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type Copyright 2023 Google LLC diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index 60d50a43df..6fe5256f15 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -31,14 +31,18 @@ A regional [MIG][mig] is used to provision the central manager, although only in any of the zones available in that region, however, it can be constrained to run in fewer zones (or a single zone) using [var.zones](#input_zones). -By default, the VM replacement policy is set to [proactive]. In practice, this -means that the Central Manager will be replaced by Terraform when changes to -the instance template / HTCondor configuration are made. The Central Manager is -safe to replace automatically as it gathers its state information from periodic -messages exchanged with the rest of the HTCondor pool. - -This mode can be switched to "OPPORTUNISTIC" by setting [var.update_policy][#input_update_policy]. -In this case, the Central Manager will be replaced only when: +When the configuration of the Central Manager is changed, the MIG can be +configured to [replace the VM][replacement] using a "proactive" or +"opportunistic" policy. By default, the Central Manager replacement policy is +set to proactive. In practice, this means that the Central Manager will be +replaced by Terraform when changes to the instance template / HTCondor +configuration are made. The Central Manager is safe to replace automatically as +it gathers its state information from periodic messages exchanged with the rest +of the HTCondor pool. + +This mode can be configured by setting [var.update_policy](#input_update_policy) +to either "PROACTIVE" (default) or "OPPORTUNISTIC". If set to opportunistic +replacement, the Central Manager will be replaced only when: - intentionally by issuing an update via Cloud Console or using gcloud (below) - the VM becomes unhealthy or is otherwise automatically replaced (e.g. 
regular @@ -52,7 +56,7 @@ gcloud compute instance-groups managed update-instances \ --project <> --minimal-action replace ``` -[proactive]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type +[replacement]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type ## Limiting inter-zone egress From 68d937cab629533e272b1d0c74aa82d17a23268b Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 16 Jan 2024 10:41:16 -0600 Subject: [PATCH 060/151] Fix broken link in HTCondor MIG documentation --- community/modules/compute/htcondor-execute-point/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index fc73008151..098abc2c4f 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -3,7 +3,7 @@ This module performs the following tasks: - create an instance template from which execute points will be created -- create a managed instance group (MIG) for execute points +- create a managed instance group ([MIG][mig]) for execute points - create a Toolkit runner to configure the autoscaler to scale the MIG It is expected to be used with the [htcondor-install] and [htcondor-setup] @@ -11,6 +11,7 @@ modules. [htcondor-install]: ../../scripts/htcondor-install/README.md [htcondor-setup]: ../../scheduler/htcondor-setup/README.md +[mig]: https://cloud.google.com/compute/docs/instance-groups/ ### Known limitations From a68d8f63bac44c004259f56716914a4e31d60fdf Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 17 Jan 2024 05:38:46 +0000 Subject: [PATCH 061/151] Remove intel-select blueprints and references --- community/examples/intel/README.md | 149 ----------- .../intel/hpc-intel-select-slurm.yaml | 162 ------------ docs/tutorials/README.md | 19 +- .../hpc-cluster-intel-select.yaml | 102 -------- docs/tutorials/intel-select/intel-select.md | 242 ------------------ examples/README.md | 12 - modules/README.md | 2 +- modules/packer/custom-image/README.md | 13 +- 8 files changed, 8 insertions(+), 693 deletions(-) delete mode 100644 community/examples/intel/hpc-intel-select-slurm.yaml delete mode 100644 docs/tutorials/intel-select/hpc-cluster-intel-select.yaml delete mode 100644 docs/tutorials/intel-select/intel-select.md diff --git a/community/examples/intel/README.md b/community/examples/intel/README.md index 7172673363..439921e31a 100644 --- a/community/examples/intel/README.md +++ b/community/examples/intel/README.md @@ -33,155 +33,6 @@ - [Unmount the Container](#unmount-the-container) - [Delete the DAOS/Slurm Cluster infrastructure when not in use](#delete-the-daosslurm-cluster-infrastructure-when-not-in-use) -## Intel-Optimized Slurm Cluster - -This document is adapted from a [Cloud Shell tutorial][tutorial] developed to -demonstrate Intel Select Solutions within the Toolkit. It expands upon that -tutorial by building custom images that save provisioning time and improve -reliability when scaling up compute nodes. - -The Google Cloud [HPC VM Image][hpcvmimage] has a built-in feature enabling it -to install a Google Cloud-tested release of Intel compilers and libraries that -are known to achieve optimal performance on Google Cloud. 
- -[tutorial]: ../../../docs/tutorials/intel-select/intel-select.md -[hpcvmimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm - -Identify a project to work in and substitute its unique id wherever you see -`<>` in the instructions below. - -### Initial Setup for the Intel-Optimized Slurm Cluster - -Before provisioning any infrastructure in this project you should follow the -Toolkit guidance to enable [APIs][apis] and establish minimum resource -[quotas][quotas]. In particular, the following APIs should be enabled - -- [file.googleapis.com](https://cloud.google.com/filestore/docs/reference/rest) (Cloud Filestore) -- [compute.googleapis.com](https://cloud.google.com/compute/docs/reference/rest/v1#service:-compute.googleapis.com) (Google Compute Engine) - -[apis]: ../../../README.md#enable-gcp-apis -[quotas]: ../../../README.md#gcp-quotas - -And the following available quota is required in the region used by the cluster: - -- Filestore: 2560GB -- C2 CPUs: 4 (login node) -- C2 CPUs: up to 6000 (fully-scaled "compute" partition) - - This quota is not necessary at initial deployment, but will be required to - successfully scale the partition to its maximum size - -### Deploy the Slurm Cluster - -Use `ghpc` to provision the blueprint, supplying your project ID - -```text -ghpc create --vars project_id=<> community/examples/intel/hpc-intel-select-slurm.yaml -``` - -This will create a set of directories containing Terraform modules and Packer -templates. **Please ignore the printed instructions** in favor of the following: - -1. Provision the network and startup scripts that install Intel software - - ```shell - terraform -chdir=hpc-intel-select/primary init - terraform -chdir=hpc-intel-select/primary validate - terraform -chdir=hpc-intel-select/primary apply - ``` - -2. Capture the startup scripts to files that will be used by Packer to build the - images - - ```shell - terraform -chdir=hpc-intel-select/primary output \ - -raw startup_script_startup_controller > \ - hpc-intel-select/build1/controller-image/startup_script.sh - - terraform -chdir=hpc-intel-select/primary output \ - -raw startup_script_startup_compute > \ - hpc-intel-select/build2/compute-image/startup_script.sh - ``` - -3. Build the custom Slurm controller image. While this step is executing, you - may begin the next step in parallel. - - ```shell - cd hpc-intel-select/build1/controller-image - packer init . - packer validate . - packer build -var startup_script_file=startup_script.sh . - ``` - -4. Build the custom Slurm image for login and compute nodes - - ```shell - cd - - cd hpc-intel-select/build2/compute-image - packer init . - packer validate . - packer build -var startup_script_file=startup_script.sh . - ``` - -5. Provision the Slurm cluster - - ```shell - cd - - terraform -chdir=hpc-intel-select/cluster init - terraform -chdir=hpc-intel-select/cluster validate - terraform -chdir=hpc-intel-select/cluster apply - ``` - -### Connect to the login node - -Once the startup script has completed and Slurm reports readiness, connect to the login node. - -1. Open the following URL in a new tab. - - https://console.cloud.google.com/compute - - This will take you to **Compute Engine > VM instances** in the Google Cloud Console - - Ensure that you select the project in which you are provisioning the cluster. - -2. Click on the **SSH** button associated with the `slurm-hpc-intel-select-login0` - instance. - - This will open a separate pop up window with a terminal into our newly created - Slurm login VM. 
- -### Access the cluster and provision an example job - - **The commands below should be run on the login node.** - -1. Create a default ssh key to be able to ssh between nodes - - ```shell - ssh-keygen -q -N '' -f ~/.ssh/id_rsa - cp ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys - chmod 0600 ~/.ssh/authorized_keys - ``` - -1. Submit an example job - - ```shell - cp /var/tmp/dgemm_job.sh . - sbatch dgemm_job.sh - ``` - -### Delete the infrastructure when not in use - -> **_NOTE:_** If the Slurm controller is shut down before the auto-scale nodes -> are destroyed then they will be left running. - -Open your browser to the VM instances page and ensure that nodes named "compute" -have been shutdown and deleted by the Slurm autoscaler. Delete the remaining -infrastructure in reverse order of creation: - -```shell -terraform -chdir=hpc-intel-select/cluster destroy -terraform -chdir=hpc-intel-select/primary destroy -``` - ## DAOS Cluster The [pfs-daos.yaml](pfs-daos.yaml) blueprint describes an environment with diff --git a/community/examples/intel/hpc-intel-select-slurm.yaml b/community/examples/intel/hpc-intel-select-slurm.yaml deleted file mode 100644 index bb197ac533..0000000000 --- a/community/examples/intel/hpc-intel-select-slurm.yaml +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-intel-select-slurm - -vars: - deployment_name: hpc-intel-select - region: us-central1 - zone: us-central1-c - controller_image: - family: slurm-intel-hpc-controller - project: $(vars.project_id) - compute_image: - family: slurm-intel-hpc-compute - project: $(vars.project_id) - network_name: intel-select-net - subnetwork_name: intel-select-primary-subnet - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: startup_controller - source: modules/scripts/startup-script - settings: - runners: - - type: shell - destination: /var/tmp/install_intel_controller.sh - content: | - #!/bin/bash - yum -y update google-hpc-compute - google_install_mpi --prefix /apps --intel_compliance - outputs: - - startup_script - - - id: startup_compute - source: modules/scripts/startup-script - settings: - runners: - - type: shell - destination: /var/tmp/install_intel_compute.sh - content: | - #!/bin/bash - yum -y update google-hpc-compute - google_install_mpi --intel_comp_meta - - type: data - destination: /var/tmp/dgemm_job.sh - content: | - #!/bin/bash - #SBATCH --nodes=4 - #SBATCH --ntasks-per-node=30 - #SBATCH --time=01:00:00 - #SBATCH --job-name=clckjob - #SBATCH --output=job_%j.log - #SBATCH --partition=compute - . 
/apps/clck/2019.10/env/vars.sh - export CLCK_SHARED_TEMP_DIR=$HOME - cd $SLURM_SUBMIT_DIR - # select_solutions_sim_mod_user_base_2018.0 | select_solutions_sim_mod_user_plus_2018.0 - FWD=select_solutions_sim_mod_user_base_2018.0 - clck -D ${FWD}.db -F ${FWD} -l debug - outputs: - - startup_script - -- group: build1 - modules: - - id: controller-image - source: modules/packer/custom-image - kind: packer - settings: - disk_size: 20 - source_image_project_id: [schedmd-slurm-public] - source_image_family: schedmd-slurm-21-08-8-hpc-centos-7 - image_family: $(vars.controller_image.family) - -- group: build2 - modules: - - id: compute-image - source: modules/packer/custom-image - kind: packer - settings: - disk_size: 20 - source_image_project_id: [schedmd-slurm-public] - source_image_family: schedmd-slurm-21-08-8-hpc-centos-7 - image_family: $(vars.compute_image.family) - -- group: cluster - modules: - - id: cluster-network - source: modules/network/pre-existing-vpc - - - id: homefs - source: modules/file-system/filestore - use: - - cluster-network - settings: - local_mount: /home - - # This debug_partition will work out of the box without requesting additional GCP quota. - - id: debug_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - cluster-network - - homefs - settings: - partition_name: debug - max_node_count: 4 - enable_placement: false - exclusive: false - machine_type: n2-standard-4 - instance_image: $(vars.compute_image) - - - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - cluster-network - - homefs - settings: - partition_name: compute - instance_image: $(vars.compute_image) - max_node_count: 100 - machine_type: c2-standard-60 - bandwidth_tier: gvnic_enabled - - - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - cluster-network - - compute_partition - - homefs - settings: - login_node_count: 1 - instance_image: $(vars.controller_image) - controller_machine_type: c2-standard-4 - - - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - cluster-network - - slurm_controller - - homefs - settings: - instance_image: $(vars.compute_image) - login_machine_type: c2-standard-4 diff --git a/docs/tutorials/README.md b/docs/tutorials/README.md index f56e99074c..66904bd0b7 100644 --- a/docs/tutorials/README.md +++ b/docs/tutorials/README.md @@ -5,19 +5,6 @@ Find the quickstart tutorial on [Google Cloud docs](https://cloud.google.com/hpc-toolkit/docs/quickstarts/slurm-cluster). -## Intel Select Tutorial - -Walks through deploying an HPC cluster that is based on the -[HPC virtual machine (VM) image][hpc-vm-image] and complies to the -[Intel Select Solution for Simulation and Modeling criteria][intel-select]. - -Click the button below to launch the Intel Select tutorial. 
- -[![Open in Cloud Shell](https://gstatic.com/cloudssh/images/open-btn.svg)](https://shell.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https%3A%2F%2Fgithub.com%2FGoogleCloudPlatform%2Fhpc-toolkit&cloudshell_open_in_editor=docs%2Ftutorials%2Fintel-select%2Fhpc-cluster-intel-select.yaml&cloudshell_tutorial=docs%2Ftutorials%2Fintel-select%2Fintel-select.md) - -[hpc-vm-image]: https://cloud.google.com/compute/docs/instances/create-hpc-vm -[intel-select]: https://www.intel.com/content/www/us/en/products/solutions/select-solutions/hpc/simulation-modeling.html - ## HTCondor Tutorial Walk through deploying an HTCondor pool that supports jobs running inside Docker @@ -27,6 +14,8 @@ Click the button below to launch the HTCondor tutorial. [![Open in Cloud Shell](https://gstatic.com/cloudssh/images/open-btn.svg)](https://shell.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https%3A%2F%2Fgithub.com%2FGoogleCloudPlatform%2Fhpc-toolkit&cloudshell_open_in_editor=community%2Fexamples%2Fhtc-htcondor.yaml&cloudshell_tutorial=docs%2Ftutorials%2Fhtcondor.md) +[hpc-vm-image]: https://cloud.google.com/compute/docs/instances/create-hpc-vm + ## SC-23 Tutorial [Blueprint](./sc23-tutorial/hcls-blueprint.yaml) used in the Supercomputing 2023 tutorial “Unlocking the potential of HPC in the Google Cloud with Open-Source Tools” @@ -61,11 +50,11 @@ modules relate to each other. ```mermaid graph TB - A(Virtual Private Cloud) + A(Virtual Private Cloud) C(Spack Install Script) D(Startup Scripts) E(Compute Partition) - F(Slurm Controller) + F(Slurm Controller) G(Slurm Login Node) B(Monitoring Dashboard) C --> D diff --git a/docs/tutorials/intel-select/hpc-cluster-intel-select.yaml b/docs/tutorials/intel-select/hpc-cluster-intel-select.yaml deleted file mode 100644 index dfe2a9f276..0000000000 --- a/docs/tutorials/intel-select/hpc-cluster-intel-select.yaml +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-cluster-intel-select - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-intel-select - region: us-central1 - zone: us-central1-c - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: homefs - source: modules/file-system/filestore - use: [network1] - settings: - local_mount: /home - - - id: startup-controller - source: modules/scripts/startup-script - settings: - runners: - - type: shell - content: | - #!/bin/bash - yum -y update google-hpc-compute - google_install_mpi --prefix /apps --intel_compliance - destination: /var/tmp/install_intel_controller.sh - - - id: startup-compute - source: modules/scripts/startup-script - settings: - runners: - - type: shell - content: | - #!/bin/bash - yum -y update google-hpc-compute - google_install_mpi --intel_comp_meta - destination: /var/tmp/install_intel_compute.sh - - # This debug_partition will work out of the box without requesting additional GCP quota. 
- - id: debug_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - settings: - partition_name: debug - max_node_count: 4 - enable_placement: false - exclusive: false - machine_type: n2-standard-2 - - # This compute_partition is far more performant than debug_partition but may require requesting GCP quotas first. - - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - settings: - partition_name: compute - max_node_count: 20 - - - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - debug_partition # debug partition will be default as it is listed first - - compute_partition - settings: - login_node_count: 1 - controller_startup_script: $(startup-controller.startup_script) - compute_startup_script: $(startup-compute.startup_script) - - - - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - homefs - - slurm_controller - settings: - login_startup_script: $(startup-compute.startup_script) diff --git a/docs/tutorials/intel-select/intel-select.md b/docs/tutorials/intel-select/intel-select.md deleted file mode 100644 index 13956ea3de..0000000000 --- a/docs/tutorials/intel-select/intel-select.md +++ /dev/null @@ -1,242 +0,0 @@ -# HPC Toolkit Intel Select Solution Cluster Deployment - -HPC Toolkit is an open-source software offered by Google Cloud which makes it -easy for customers to deploy HPC environments on Google Cloud. - -This tutorial will walk you through deploying an HPC cluster that is based on -the [HPC virtual machine (VM) image](https://cloud.google.com/compute/docs/instances/create-hpc-vm) -and comply to the [Intel Select Solution for Simulation and Modeling criteria](https://www.intel.com/content/www/us/en/products/solutions/select-solutions/hpc/simulation-modeling.html). - -[Click here for more information](https://cloud.google.com/compute/docs/instances/create-intel-select-solution-hpc-clusters). - -## Select a Project - -Select a project in which to deploy an HPC cluster on Google . - - - -Once you have selected a project, click START. - -## Enable APIs & Permissions - -*Skip this step if you already ran this as part of a previous tutorial.* - -In a new Google Cloud project there are several apis that must be enabled to -deploy your HPC cluster. These will be caught when you perform `terraform apply` -but you can save time by enabling them now by running: - - - -We also need to grant the default compute service account project edit access so -the slurm controller can perform actions such as auto-scaling. - - - -```bash -PROJECT_NUMBER=$(gcloud projects describe --format='value(projectNumber)') - -echo "granting roles/editor to $PROJECT_NUMBER-compute@developer.gserviceaccount.com" - -gcloud iam service-accounts enable --project $PROJECT_NUMBER-compute@developer.gserviceaccount.com - -gcloud projects add-iam-policy-binding --member=serviceAccount:$PROJECT_NUMBER-compute@developer.gserviceaccount.com --role=roles/editor -``` - -## Build the Toolkit Binary - -*Skip this step if you already ran this as part of a previous tutorial.* - -To build HPC Toolkit binary from source run: - -```bash -make -``` - -You should now have a binary named ghpc in the current directory. To verify the -build run: - -```bash -./ghpc --version -``` - -This should show you the version of the HPC Toolkit you are using. 
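
Outside of Cloud Shell (where this repository is already checked out for you), a minimal sketch of the full build-and-verify sequence is shown below; the repository URL is taken from the Cloud Shell links used elsewhere in these tutorials, and a working `git`, `make`, and Go toolchain are assumed:

```bash
# Clone the repository and build the ghpc binary from source.
git clone https://github.com/GoogleCloudPlatform/hpc-toolkit.git
cd hpc-toolkit
make

# Confirm the build produced a working binary.
./ghpc --version
```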
- -## Generate a Deployment - -This tutorial will use the blueprint docs/tutorials/intel-select/hpc-cluster-intel-select.yaml, which should be open in the Cloud Shell Editor (on the left). - -This file describes the cluster you will deploy. It contains: - -* a new network -* a filestore instance -* a custom startup script for the slurm controller -* a custom startup script for the slurm login and compute nodes -* a Slurm cluster with Intel software components pre-installed throughout - * a Slurm login node - * a Slurm controller - * several auto-scaling Slurm partitions - -Do you notice the difference between this blueprint and the hpc-slurm example? - -After you have inspected the file, use the ghpc binary to create a deployment folder by running: - -```bash -./ghpc create --vars project_id= docs/tutorials/intel-select/hpc-cluster-intel-select.yaml -``` - -> **_NOTE:_** The `--vars` argument is used to override `project_id` in the -> deployment variables. - -This will create a deployment directory named `hpc-intel-select/`, which -contains the terraform needed to deploy your cluster. - -## Deploy the Cluster - -Use the following commands to run terraform and deploy your cluster. - -```bash -terraform -chdir=hpc-intel-select/primary init -terraform -chdir=hpc-intel-select/primary apply -``` - -The `terraform apply` command will generate a _plan_ that describes the Google -Cloud resources that will be deployed. - -You can review the plan and then start the deployment by typing -**`yes [enter]`**. - -The deployment will take about 5 minutes. There should be regular status updates -in the terminal. - -If the `apply` is successful, a message similar to the following will be -displayed: - - - - -```shell -Apply complete! Resources: xx added, 0 changed, 0 destroyed. -``` - -## Waiting for the cluster to be configured - -Although the cluster has been successfully deployed, the startup scripts that -install the additional required software take time to complete. Typically, this -can be around 8 minutes on the controller and 2-3 minutes on the login and -compute nodes. - -If you see the following message when you SSH into the login node following the -instructions in the next step, you should logout and give more time for the -startup script to complete. - -> _`Slurm is currently being configured in the background`_ - -Running the following command will allow monitoring the startup scripts on the controller: - -```bash -gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project slurm-hpc-intel-select-controller | grep startup-script -``` - -And the login node: - -```bash -gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project slurm-hpc-intel-select-login0 | grep startup-script -``` - -The following line would indicate that the startup script completed on the controller: ->_`slurm-hpc-intel-select-controller google_metadata_script_runner: startup-script exit status 0`_ - -## Connecting to the login node - -Once the startup script has completed and Slurm reports readiness, connect to the login node. - -1. Open the following URL in a new tab. This will take you to `Compute Engine` > - `VM instances` in the Google Cloud Console: - - - - ```text - https://console.cloud.google.com/compute?project= - ``` - - - - -1. Click on the `SSH` button associated with the `slurm-hpc-small-login0` - instance. - - This will open a separate pop up window with a terminal into our newly created - Slurm login VM. 
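
If you prefer the command line to the console's `SSH` button, an equivalent `gcloud` invocation is sketched below; the instance name is an assumption based on this tutorial's naming and should be verified on the `VM instances` page first:

```bash
# SSH to the Slurm login node with the gcloud CLI.
# Replace <project_id> and confirm the login node's actual name before running.
gcloud compute ssh slurm-hpc-intel-select-login0 \
  --zone us-central1-c \
  --project <project_id>
```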
- -## Run a Job on the Cluster - - **The commands below should be run on the login node.** - -1. Create a default ssh key to be able to ssh between nodes: - - ```shell - ssh-keygen -N '' -f ~/.ssh/id_rsa - cp ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys - chmod 0600 ~/.ssh/authorized_keys - ``` - -1. Execute the following commands to activate Intel software components and - allocate machines to run the Intel Cluster Checker: - -```shell -export PATH=/apps/intelpython3/bin/:/sbin:/bin:/usr/sbin:/usr/bin:$PATH -source /apps/clck/2019.10/bin/clckvars.sh -source /apps/psxe_runtime/linux/bin/psxevars.sh -salloc -N4 -p compute -``` - -This may take a minute while Slurm auto-scales to create the nodes. If you are -curious you can refresh the `Compute Engine` > `VM instances` page and see that -additional VMs have been created. - -If the allocation fails, try submitting the job to the debug partition, -by removing the `-p compute` parameter to `salloc`. The message `salloc: -PrologSlurmctld failed, job killed` most likely indicates that your project does -not have sufficient quota for C2 instances in your region. - -1. Once the allocation is complete, you will be presented with a shell. Run: - -```shell -clck -F intel_hpc_platform_compat-hpc-2018.0 -``` - -Notice this job took ~2-3 minutes to start, since all compute nodes have to install the packages at boot time. In a real production system, this would be part of the slurm image (which is also possible with the HPC Toolkit). - -Since we used the compute partition, the job ran on [Compute Optimized -instances](https://cloud.google.com/compute/docs/compute-optimized-machines), -using Intel 3.9 GHz Cascade Lake processors and with placement groups enabled. -Nodes will not be re-used across jobs and will be immediately destroyed after -the job is completed. - -The outputs of `clck` will be stored in `clck_execution_warnings.log` and `clck_results.log`. - -> **_NOTE:_** If the Slurm controller is shut down before the auto-scale nodes -> are destroyed then they will be left running. - -## Destroy the Cluster - -To avoid incurring ongoing charges we will want to destroy our cluster. Run the -following command in the cloud shell terminal (not in the pop-up): - -```bash -terraform -chdir=hpc-intel-select/primary destroy -auto-approve -``` - -When complete you should see something like: - -```shell -Destroy complete! Resources: xx destroyed. -``` - -> **_NOTE:_** If destroy is run before Slurm shut down the auto-scale nodes then -> they will be left behind and destroy may fail. In this case you can delete the -> VMs manually and rerun the destroy command above. 
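
A sketch for finding and removing such leftover nodes before rerunning destroy is shown below; the name filter is an assumption based on this tutorial's naming convention and should be checked against the `VM instances` page:

```bash
# List auto-scaled compute nodes that were left running.
gcloud compute instances list --project <project_id> \
  --filter="name ~ slurm-hpc-intel-select-compute"

# Delete a leftover node, then rerun the terraform destroy command above.
gcloud compute instances delete <instance_name> \
  --zone us-central1-c --project <project_id>
```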
- -## Tutorial Complete - - diff --git a/examples/README.md b/examples/README.md index b5f192ec09..202d9ce328 100644 --- a/examples/README.md +++ b/examples/README.md @@ -21,7 +21,6 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [pfs-lustre.yaml](#pfs-lustreyaml-) ![core-badge] * [cae-slurm.yaml](#cae-slurmyaml-) ![core-badge] * [hpc-slurm-ubuntu2004.yaml](#hpc-slurm-ubuntu2004yaml-) ![community-badge] - * [hpc-intel-select-slurm.yaml](#hpc-intel-select-slurmyaml-) ![community-badge] * [pfs-daos.yaml](#pfs-daosyaml-) ![community-badge] * [hpc-slurm-daos.yaml](#hpc-slurm-daosyaml-) ![community-badge] * [hpc-amd-slurm.yaml](#hpc-amd-slurmyaml-) ![community-badge] @@ -620,17 +619,6 @@ For this example the following is needed in the selected region: * Compute Engine API: Resource policies: **one for each job in parallel** - _only needed for `compute` partition_ -### [hpc-intel-select-slurm.yaml] ![community-badge] - -This example provisions a Slurm cluster automating the [steps to comply to the -Intel Select Solutions for Simulation & Modeling Criteria][intelselect]. It is -more extensively discussed in a dedicated [README for Intel -examples][intel-examples-readme]. - -[hpc-intel-select-slurm.yaml]: ../community/examples/intel/hpc-intel-select-slurm.yaml -[intel-examples-readme]: ../community/examples/intel/README.md -[intelselect]: https://cloud.google.com/compute/docs/instances/create-intel-select-solution-hpc-clusters - ### [pfs-daos.yaml] ![community-badge] This example provisions a DAOS cluster with [managed instance groups][migs] for the servers and for clients. It is more extensively discussed in a dedicated [README for Intel diff --git a/modules/README.md b/modules/README.md index a12d92fcb6..8ffa15a206 100644 --- a/modules/README.md +++ b/modules/README.md @@ -301,7 +301,7 @@ repository: * Hosted on [GitHub](https://developer.hashicorp.com/terraform/language/modules/sources#github) * Google Cloud Storage [Buckets](https://developer.hashicorp.com/terraform/language/modules/sources#gcs-bucket) * Generic [git repositories](https://developer.hashicorp.com/terraform/language/modules/sources#generic-git-repository) - + when modules are in a subdirectory of the git repository, a special double-slash `//` notation can be required as described below diff --git a/modules/packer/custom-image/README.md b/modules/packer/custom-image/README.md index 86f1251e8b..b9b33ccdea 100644 --- a/modules/packer/custom-image/README.md +++ b/modules/packer/custom-image/README.md @@ -31,6 +31,9 @@ images to internal projects. [shell]: #input_shell_scripts [ansible]: #input_ansible_playbooks [hpcimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm +[Image Builder]: ../../../examples/image-builder.yaml +[startup-script]: ../../../modules/scripts/startup-script +[examples README]: ../../../examples/README.md#image-builderyaml- [startup-metadata]: https://cloud.google.com/compute/docs/instances/startup-scripts/linux ## Example blueprints @@ -48,16 +51,6 @@ The [Image Builder] blueprint demonstrates a solution that builds an image using Please review the [examples README] for usage instructions. -### Intel-Optimized Slurm Cluster - -The [Intel-Optimized] Slurm Cluster [blueprint](../../../community/examples/intel/hpc-intel-select-slurm.yaml) -adds the Intel compliance software on top of a Slurm on GCP image. 
- -[Image Builder]: ../../../examples/image-builder.yaml -[startup-script]: ../../../modules/scripts/startup-script -[examples README]: ../../../examples/README.md#image-builderyaml- -[Intel-Optimized]: ../../../community/examples/intel/README.md#intel-optimized-slurm-cluster - ## Order of execution The startup script specified in metadata executes in parallel with the other From 6cd572a70ef26dee2f7e3297a4dcdf3388b12b6b Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 17 Jan 2024 17:54:48 -0800 Subject: [PATCH 062/151] Add support for string interpolation (#2076) * Add support for string interpolation * Support proper escaping * Adress comments --- examples/README.md | 14 +- pkg/config/config.go | 14 +- pkg/config/config_test.go | 14 +- pkg/config/expand.go | 19 --- pkg/config/expand_test.go | 54 ------- pkg/config/expression.go | 214 ++++++++++++++++---------- pkg/config/expression_test.go | 90 +++++------ pkg/config/yaml.go | 52 +++++-- pkg/modulewriter/modulewriter_test.go | 21 --- 9 files changed, 236 insertions(+), 256 deletions(-) diff --git a/examples/README.md b/examples/README.md index 202d9ce328..4d7d9ad79a 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1225,8 +1225,6 @@ The variable is referred to by the source, either vars for deploment variables or the module ID for module variables, followed by the name of the value being referenced. The entire variable is then wrapped in “$()”. -Currently, string interpolation with variables is not supported. - ### Literal Variables Literal variables should only be used by those familiar @@ -1250,6 +1248,18 @@ Whenever possible, blueprint variables are preferred over literal variables. `ghpc` will perform basic validation making sure all blueprint variables are defined before creating a deployment, making debugging quicker and easier. +### String Interpolation + +The `$(...)` expressions can be used within strings, see: + +```yaml +settings: + title: Magnificent $(vars.name) + script: | + #!/bin/bash + echo "Hello $(vars.project_id) from $(vars.region)" +``` + ### Escape Variables Under circumstances where the variable notation conflicts with the content of a setting or string, for instance when defining a startup-script runner that uses a subshell like in the example below, a non-quoted backslash (`\`) can be used as an escape character. It preserves the literal value of the next character that follows: diff --git a/pkg/config/config.go b/pkg/config/config.go index a6a2578a18..39087a31d2 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -440,17 +440,15 @@ func validateModuleUseReferences(p ModulePath, mod Module, bp Blueprint) error { } func checkBackend(b TerraformBackend) error { - const errMsg = "can not use variables in terraform_backend block, got '%s=%s'" - // TerraformBackend.Type is typed as string, "simple" variables and HCL literals stay "as is". 
- if hasVariable(b.Type) { - return fmt.Errorf(errMsg, "type", b.Type) - } - if _, is := IsYamlExpressionLiteral(cty.StringVal(b.Type)); is { - return fmt.Errorf(errMsg, "type", b.Type) + err := errors.New("can not use expressions in terraform_backend block") + val, perr := parseYamlString(b.Type) + + if _, is := IsExpressionValue(val); is || perr != nil { + return err } return cty.Walk(b.Configuration.AsObject(), func(p cty.Path, v cty.Value) (bool, error) { if _, is := IsExpressionValue(v); is { - return false, fmt.Errorf("can not use variables in terraform_backend block") + return false, err } return true, nil }) diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 03a374ef4d..99ec466f3c 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -672,22 +672,22 @@ func (s *zeroSuite) TestCheckBackends(c *C) { { // FAIL. Variable in defaults type b := TerraformBackend{Type: "$(vartype)"} - c.Check(check(b), ErrorMatches, ".*type.*vartype.*") + c.Check(check(b), NotNil) } { // FAIL. Variable in group backend type b := TerraformBackend{Type: "$(vartype)"} - c.Check(check(dummy, b), ErrorMatches, ".*type.*vartype.*") + c.Check(check(dummy, b), NotNil) } { // FAIL. Deployment variable in defaults type b := TerraformBackend{Type: "$(vars.type)"} - c.Check(check(b), ErrorMatches, ".*type.*vars\\.type.*") + c.Check(check(b), NotNil) } { // FAIL. HCL literal b := TerraformBackend{Type: "((var.zen))"} - c.Check(check(b), ErrorMatches, ".*type.*zen.*") + c.Check(check(b), NotNil) } { // OK. Not a variable @@ -697,13 +697,13 @@ func (s *zeroSuite) TestCheckBackends(c *C) { { // FAIL. Mid-string variable in defaults type b := TerraformBackend{Type: "hugs_$(vartype)_hugs"} - c.Check(check(b), ErrorMatches, ".*type.*vartype.*") + c.Check(check(b), NotNil) } { // FAIL. Variable in defaults configuration b := TerraformBackend{Type: "gcs"} b.Configuration.Set("bucket", GlobalRef("trenta").AsExpression().AsValue()) - c.Check(check(b), ErrorMatches, ".*can not use variables.*") + c.Check(check(b), NotNil) } { // OK. handles nested configuration @@ -714,7 +714,7 @@ func (s *zeroSuite) TestCheckBackends(c *C) { "alpha": cty.StringVal("a"), "beta": GlobalRef("boba").AsExpression().AsValue(), })) - c.Check(check(b), ErrorMatches, ".*can not use variables.*") + c.Check(check(b), NotNil) } } diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 3431b7917d..c69578fad4 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -17,7 +17,6 @@ package config import ( "errors" "fmt" - "regexp" "hpc-toolkit/pkg/modulereader" @@ -32,14 +31,6 @@ const ( deploymentLabel string = "ghpc_deployment" ) -var ( - // Checks if a variable exists only as a substring, ex: - // Matches: "a$(vars.example)", "word $(vars.example)", "word$(vars.example)", "$(vars.example)" - // Doesn't match: "\$(vars.example)", "no variable in this string" - anyVariableExp *regexp.Regexp = regexp.MustCompile(`(^|[^\\])\$\((.*?)\)`) - simpleVariableExp *regexp.Regexp = regexp.MustCompile(`^\$\((.*)\)$`) -) - // expand expands variables and strings in the yaml config. Used directly by // ExpandConfig for the create and expand commands. 
func (dc *DeploymentConfig) expand() error { @@ -419,16 +410,6 @@ func validateModuleSettingReference(bp Blueprint, mod Module, r Reference) error return nil } -// isSimpleVariable checks if the entire string is just a single variable -func isSimpleVariable(str string) bool { - return simpleVariableExp.MatchString(str) -} - -// hasVariable checks to see if any variable exists in a string -func hasVariable(str string) bool { - return anyVariableExp.MatchString(str) -} - // FindAllIntergroupReferences finds all intergroup references within the group func (dg DeploymentGroup) FindAllIntergroupReferences(bp Blueprint) []Reference { igcRefs := map[Reference]bool{} diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 8becc02c5e..2805fe7750 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -338,60 +338,6 @@ func (s *MySuite) TestApplyGlobalVariables(c *C) { GlobalRef("gold").AsExpression().AsValue()) } -func (s *zeroSuite) TestIsSimpleVariable(c *C) { - // True: Correct simple variable - got := isSimpleVariable("$(some_text)") - c.Assert(got, Equals, true) - // False: Missing $ - got = isSimpleVariable("(some_text)") - c.Assert(got, Equals, false) - // False: Missing ( - got = isSimpleVariable("$some_text)") - c.Assert(got, Equals, false) - // False: Missing ) - got = isSimpleVariable("$(some_text") - c.Assert(got, Equals, false) - // False: Contains Prefix - got = isSimpleVariable("prefix-$(some_text)") - c.Assert(got, Equals, false) - // False: Contains Suffix - got = isSimpleVariable("$(some_text)-suffix") - c.Assert(got, Equals, false) - // False: Contains prefix and suffix - got = isSimpleVariable("prefix-$(some_text)-suffix") - c.Assert(got, Equals, false) - // False: empty string - got = isSimpleVariable("") - c.Assert(got, Equals, false) -} - -func (s *zeroSuite) TestHasVariable(c *C) { - // True: simple variable - got := hasVariable("$(some_text)") - c.Assert(got, Equals, true) - // True: has prefix - got = hasVariable("prefix-$(some_text)") - c.Assert(got, Equals, true) - // True: has suffix - got = hasVariable("$(some_text)-suffix") - c.Assert(got, Equals, true) - // True: Two variables - got = hasVariable("$(some_text)$(some_more)") - c.Assert(got, Equals, true) - // True: two variable with other text - got = hasVariable("prefix-$(some_text)-$(some_more)-suffix") - c.Assert(got, Equals, true) - // False: missing $ - got = hasVariable("(some_text)") - c.Assert(got, Equals, false) - // False: missing ( - got = hasVariable("$some_text)") - c.Assert(got, Equals, false) - // False: missing ) - got = hasVariable("$(some_text") - c.Assert(got, Equals, false) -} - func (s *zeroSuite) TestValidateModuleReference(c *C) { a := Module{ID: "moduleA"} b := Module{ID: "moduleB"} diff --git a/pkg/config/expression.go b/pkg/config/expression.go index 2bf4be5ddb..d3a5d4dc99 100644 --- a/pkg/config/expression.go +++ b/pkg/config/expression.go @@ -22,6 +22,7 @@ import ( "github.com/hashicorp/hcl/v2" "github.com/hashicorp/hcl/v2/hclsyntax" "github.com/hashicorp/hcl/v2/hclwrite" + "github.com/pkg/errors" "github.com/zclconf/go-cty/cty" "github.com/zclconf/go-cty/cty/function" "github.com/zclconf/go-cty/cty/function/stdlib" @@ -54,41 +55,9 @@ func (r Reference) AsExpression() Expression { return MustParseExpression(fmt.Sprintf("module.%s.%s", r.Module, r.Name)) } -// MakeStringInterpolationError generates an error message guiding the user to proper escape syntax -func MakeStringInterpolationError(s string) error { - matchall := anyVariableExp.FindAllString(s, -1) 
- hint := "" - for _, element := range matchall { - // the regex match will include the first matching character - // this might be (1) "^" or (2) any character EXCEPT "\" - // if (2), we have to remove the first character from the match - if element[0:2] != "$(" { - element = strings.Replace(element, element[0:1], "", 1) - } - hint += "\\" + element + " will be rendered as " + element + "\n" - } - return fmt.Errorf( - "variables \"$(...)\" within strings are not yet implemented. remove them or add a backslash to render literally. \n%s", hint) -} - -// Takes `$(expression)` and returns `expression` -func extractSimpleVarExpression(s string) (string, error) { - if !hasVariable(s) { - return "", fmt.Errorf("%#v is not a variable", s) - } - if !isSimpleVariable(s) { - return "", MakeStringInterpolationError(s) - } - contents := simpleVariableExp.FindStringSubmatch(s) - if len(contents) != 2 { // Should always be (match, contents) here - return "", fmt.Errorf("%s %s, failed to extract contents: %v", errMsgInvalidVar, s, contents) - } - return contents[1], nil -} - // Takes traversal in "blueprint namespace" (e.g. `vars.zone` or `homefs.mount`) // and transforms it to `Expression`. -func simpleTraversalToExpression(t hcl.Traversal) (Expression, error) { +func bpTraversalToExpression(t hcl.Traversal) (Expression, error) { if len(t) < 2 { return nil, fmt.Errorf(expectedVarFormat) } @@ -116,12 +85,8 @@ func simpleTraversalToExpression(t hcl.Traversal) (Expression, error) { }, nil } -// SimpleVarToExpression takes a string `$(...)` and transforms it to `Expression` -func SimpleVarToExpression(s string) (Expression, error) { - s, err := extractSimpleVarExpression(s) - if err != nil { - return nil, err - } +// bpLitToExpression takes a content of `$(...)`-literal and transforms it to `Expression` +func bpLitToExpression(s string) (Expression, error) { hexp, diag := hclsyntax.ParseExpression([]byte(s), "", hcl.Pos{}) if diag.HasErrors() { return nil, diag @@ -129,7 +94,7 @@ func SimpleVarToExpression(s string) (Expression, error) { switch texp := hexp.(type) { case *hclsyntax.ScopeTraversalExpr: - exp, err := simpleTraversalToExpression(texp.Traversal) + exp, err := bpTraversalToExpression(texp.Traversal) if err != nil { return nil, fmt.Errorf("failed to parse variable %q: %w", s, err) } @@ -175,20 +140,6 @@ func TraversalToReference(t hcl.Traversal) (Reference, error) { } } -// IsYamlExpressionLiteral checks if passed value of type cty.String -// and its content starts with "((" and ends with "))". -// Returns trimmed string and result of test. -func IsYamlExpressionLiteral(v cty.Value) (string, bool) { - if v.Type() != cty.String { - return "", false - } - s := v.AsString() - if len(s) < 4 || s[:2] != "((" || s[len(s)-2:] != "))" { - return "", false - } - return s[2 : len(s)-2], true -} - // Expression is a representation of expressions in Blueprint type Expression interface { // Eval evaluates the expression in the context of Blueprint @@ -207,6 +158,7 @@ type Expression interface { } // ParseExpression returns Expression +// Expects expression in "terraform namespace" (e.g. 
`var.zone` or `module.homefs.mount`) func ParseExpression(s string) (Expression, error) { e, diag := hclsyntax.ParseExpression([]byte(s), "", hcl.Pos{}) if diag.HasErrors() { @@ -339,18 +291,6 @@ func HasMark[T any](val cty.Value) (T, bool) { return tgt, found } -func escapeBlueprintVariables(s string) string { - // Convert \$(not.variable) to $(not.variable) - re := regexp.MustCompile(`\\\$\(`) - return re.ReplaceAllString(s, `$(`) -} - -func escapeLiteralVariables(s string) string { - // Convert \((not.variable)) to ((not.variable)) - re := regexp.MustCompile(`\\\(\(`) - return re.ReplaceAllString(s, `((`) -} - // TokensForValue is a modification of hclwrite.TokensForValue. // The only difference in behavior is handling "HCL literal" strings. func TokensForValue(val cty.Value) hclwrite.Tokens { @@ -362,20 +302,8 @@ func TokensForValue(val cty.Value) hclwrite.Tokens { if e, is := IsExpressionValue(val); is { return e.Tokenize() } - val, _ = val.Unmark() // remove marks, as we don't need them anymore - if s, is := IsYamlExpressionLiteral(val); is { // return it "as is" - return hclwrite.TokensForIdentifier(s) - } - + val, _ = val.Unmark() // remove marks, as we don't need them anymore ty := val.Type() - if ty == cty.String { - s := val.AsString() - // The order of application matters, for an edge cases like: `\$\((` -> `$((` - s = escapeLiteralVariables(s) - s = escapeBlueprintVariables(s) - return hclwrite.TokensForValue(cty.StringVal(s)) - } - if ty.IsListType() || ty.IsSetType() || ty.IsTupleType() { tl := []hclwrite.Tokens{} for it := val.ElementIterator(); it.Next(); { @@ -442,3 +370,131 @@ func evalValue(v cty.Value, bp Blueprint) (cty.Value, error) { return v, nil }) } + +type pToken struct { + s string + e Expression +} + +func tokenizeBpString(s string) ([]pToken, error) { + toks := []pToken{} + var exp Expression + var err error + bsRe := regexp.MustCompile(`\\*$`) // to count number of backslashes at the end + + for len(s) > 0 { + i := strings.Index(s, "$(") + if i == -1 { // plain string until the end + toks, s = append(toks, pToken{s: s}), "" // add everything + break // and terminate + } + p := s[:i] + s = s[i+2:] // split as `p$(s` + bs := len(bsRe.FindString(p)) // get number of trailing backslashes + p = p[:len(p)-bs+bs/2] // keep (smaller) half of backslashes + toks = append(toks, pToken{s: p}) // add tokens up to "$(" + + if bs%2 == 1 { // escaped $( + toks = append(toks, pToken{s: "$("}) // add "$(" + } else { // found beginning of expression + exp, s, err = greedyParseHcl(s) // parse after "$(" + if err != nil { + return nil, err + } + toks = append(toks, pToken{e: exp}) // add expression + } + } + return toks, nil +} + +func compactTokens(toks []pToken) []pToken { + res := []pToken{} + for _, t := range toks { + if t.e != nil { + res = append(res, t) // add as is + } else { + if t.s == "" { + continue // skip + } + if len(res) > 0 && res[len(res)-1].e == nil { + res[len(res)-1].s += t.s // merge with previous + } else { + res = append(res, t) // add as is + } + } + } + return res +} + +func parseBpLit(s string) (cty.Value, error) { + toks, err := tokenizeBpString(s) + if err != nil { + return cty.NilVal, err + } + toks = compactTokens(toks) + if len(toks) == 0 { + return cty.StringVal(""), nil + } + if len(toks) == 1 { + if toks[0].e != nil { + return toks[0].e.AsValue(), nil + } else { + return cty.StringVal(toks[0].s), nil + } + } + + exp, err := buildStringInterpolation(toks) + if err != nil { + return cty.NilVal, err + } + return exp.AsValue(), nil +} + +// 
greedyParseHcl tries to parse prefix of `s` as a valid HCL expression. +// It iterates over all closing brackets and tries to parse expression up to them. +// The shortest expression is returned. E.g: +// "var.hi) $(var.there)" -> "var.hi" +// "try(var.this) + one(var.time)) tail" -> "try(var.this) + one(var.time)" +func greedyParseHcl(s string) (Expression, string, error) { + err := errors.New("no closing parenthesis") + for i := 0; i < len(s); i++ { + if s[i] != ')' { + continue + } + _, diag := hclsyntax.ParseExpression([]byte(s[:i]), "", hcl.Pos{}) + if !diag.HasErrors() { // found an expression + exp, err := bpLitToExpression(s[:i]) + return exp, s[i+1:], err + } + err = diag // save error, try to find another closing bracket + } + return nil, s, err +} + +func buildStringInterpolation(pts []pToken) (Expression, error) { + toks := hclwrite.Tokens{&hclwrite.Token{ + Type: hclsyntax.TokenOQuote, + Bytes: []byte(`"`)}, + } + + for _, pt := range pts { + if pt.e != nil { + toks = append(toks, &hclwrite.Token{ + Type: hclsyntax.TokenTemplateInterp, + Bytes: []byte(`${`)}) + toks = append(toks, pt.e.Tokenize()...) + toks = append(toks, &hclwrite.Token{ + Type: hclsyntax.TokenTemplateSeqEnd, + Bytes: []byte(`}`)}) + } else { + stoks := hclwrite.TokensForValue(cty.StringVal(pt.s)) + stoks = stoks[1 : len(stoks)-1] // remove quotes + toks = append(toks, stoks...) + } + } + + toks = append(toks, &hclwrite.Token{ + Type: hclsyntax.TokenCQuote, + Bytes: []byte(`"`)}) + return ParseExpression(string(toks.Bytes())) +} diff --git a/pkg/config/expression_test.go b/pkg/config/expression_test.go index 74703a67ff..e3cd7dc1d8 100644 --- a/pkg/config/expression_test.go +++ b/pkg/config/expression_test.go @@ -71,41 +71,14 @@ func TestTraversalToReference(t *testing.T) { } } -func TestIsYamlHclLiteral(t *testing.T) { - type test struct { - input string - want string - check bool - } - tests := []test{ - {"((var.green))", "var.green", true}, - {"((${var.green}))", "${var.green}", true}, - {"(( 7 + a }))", " 7 + a }", true}, - {"(var.green)", "", false}, - {"((var.green)", "", false}, - {"$(var.green)", "", false}, - {"${var.green}", "", false}, - } - for _, tc := range tests { - t.Run(tc.input, func(t *testing.T) { - got, check := IsYamlExpressionLiteral(cty.StringVal(tc.input)) - if diff := cmp.Diff(tc.want, got); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } - if diff := cmp.Diff(tc.check, check); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } - }) - } -} - -func TestSimpleVarToExpression(t *testing.T) { +func TestParseBpLit(t *testing.T) { type test struct { input string want string err bool } tests := []test{ + // Single expression, without string interpolation {"$(vars.green)", "var.green", false}, {"$(vars.green[3])", "var.green[3]", false}, {"$(vars.green.sleeve)", "var.green.sleeve", false}, @@ -119,9 +92,34 @@ func TestSimpleVarToExpression(t *testing.T) { {"$(box.green.sleeve[3])", "module.box.green.sleeve[3]", false}, {`$(box.green["sleeve"])`, `module.box.green["sleeve"]`, false}, + // String interpolation + {`1gold was here`, `"1gold was here"`, false}, + {`2gold $(vars.here)`, `"2gold ${var.here}"`, false}, + {`3gold $(vars.here) but $(vars.gone)`, `"3gold ${var.here} but ${var.gone}"`, false}, + {`4gold +$(vars.here)`, `"4gold\n${var.here}"`, false}, // quoted strings may not be split over multiple lines + + {`5gold +was here`, `"5gold\nwas here"`, false}, + {"6gold $(vars.here", ``, true}, // missing close parenthesis + {"7gold $(vars.here + 2)", ``, true}, // 
unsupported expression + + {`#!/bin/bash +echo "Hello $(vars.project_id) from $(vars.region)"`, `"#!/bin/bash\necho \"Hello ${var.project_id} from ${var.region}\""`, false}, + {"", `""`, false}, + {`$(try(vars.this) + one(vars.time))`, "", true}, // fails because of unsupported expression, but it should be parsed + + // Escaping + {`q $(vars.t)`, `"q ${var.t}"`, false}, // no escaping + {`q \$(vars.t)`, `"q $(vars.t)"`, false}, // escaped `$(` + {`q \\$(vars.t)`, `"q \\${var.t}"`, false}, // escaped `\` + {`q \\\$(vars.t)`, `"q \\$(vars.t)"`, false}, // escaped both `\` and `$(` + {`q \\\\$(vars.t)`, `"q \\\\${var.t}"`, false}, // escaped `\\` + {`q \\\\\$(vars.t)`, `"q \\\\$(vars.t)"`, false}, // escaped both `\\` and `$(` + + // Untranslatable expressions {"$(vars)", "", true}, {"$(sleeve)", "", true}, - {"gold $(var.here)", "", true}, {"$(box[3])", "", true}, // can't index module {`$(box["green"])`, "", true}, // can't index module {"$(vars[3]])", "", true}, // can't index vars @@ -129,14 +127,22 @@ func TestSimpleVarToExpression(t *testing.T) { } for _, tc := range tests { t.Run(tc.input, func(t *testing.T) { - exp, err := SimpleVarToExpression(tc.input) + v, err := parseBpLit(tc.input) if tc.err != (err != nil) { t.Errorf("got unexpected error: %s", err) } if err != nil { return } - got := string(exp.Tokenize().Bytes()) + var got string + if v.Type() == cty.String { + got = string(hclwrite.TokensForValue(v).Bytes()) + } else if exp, is := IsExpressionValue(v); is { + got = string(exp.Tokenize().Bytes()) + } else { + t.Fatalf("got value of unexpected type: %#v", v) + } + if diff := cmp.Diff(tc.want, got); diff != "" { t.Errorf("diff (-want +got):\n%s", diff) } @@ -167,26 +173,6 @@ func TestTokensForValueNoLiteral(t *testing.T) { } } -func TestTokensForValueWithLiteral(t *testing.T) { - val := cty.ObjectVal(map[string]cty.Value{ - "tan": cty.TupleVal([]cty.Value{ - cty.StringVal("((var.kilo + 8))"), // HCL literal - MustParseExpression("var.tina + 4").AsValue(), // HclExpression value - })}) - want := ` -{ - tan = [var.kilo + 8, var.tina + 4] -}`[1:] - - gotF := hclwrite.NewEmptyFile() - gotF.Body().AppendUnstructuredTokens(TokensForValue(val)) - got := hclwrite.Format(gotF.Bytes()) // format to normalize whitespace - - if diff := cmp.Diff(want, string(got)); diff != "" { - t.Errorf("diff (-want +got):\n%s", diff) - } -} - func TestFlattenFunctionCallExpression(t *testing.T) { bp := Blueprint{Vars: NewDict(map[string]cty.Value{ "three": cty.NumberIntVal(3), diff --git a/pkg/config/yaml.go b/pkg/config/yaml.go index 77f67c2d1a..264cc6daaf 100644 --- a/pkg/config/yaml.go +++ b/pkg/config/yaml.go @@ -22,6 +22,7 @@ import ( "os" "regexp" "strconv" + "strings" "github.com/hashicorp/hcl/v2/hclwrite" "github.com/pkg/errors" @@ -253,26 +254,34 @@ func (y *YamlValue) unmarshalScalar(n *yaml.Node) error { if err != nil { return err } - y.Wrap(v) - if l, is := IsYamlExpressionLiteral(y.Unwrap()); is { // HCL literal - var e Expression - if e, err = ParseExpression(l); err != nil { - // TODO: point to exact location within expression, see Diagnostic.Subject - return nodeToPosErr(n, err) - } - y.Wrap(e.AsValue()) - } else if y.Unwrap().Type() == cty.String && hasVariable(y.Unwrap().AsString()) { // "simple" variable - e, err := SimpleVarToExpression(y.Unwrap().AsString()) - if err != nil { - // TODO: point to exact location within expression, see Diagnostic.Subject - return nodeToPosErr(n, err) + if v.Type() == cty.String { + if v, err = parseYamlString(v.AsString()); err != nil { + return 
fmt.Errorf("line %d: %w", n.Line, err) } - y.Wrap(e.AsValue()) } + y.Wrap(v) return nil } +func isHCLLiteral(s string) bool { + return strings.HasPrefix(s, "((") && strings.HasSuffix(s, "))") +} + +func parseYamlString(s string) (cty.Value, error) { + if isHCLLiteral(s) { + if e, err := ParseExpression(s[2 : len(s)-2]); err != nil { + return cty.NilVal, err + } else { + return e.AsValue(), nil + } + } + if strings.HasPrefix(s, `\((`) && strings.HasSuffix(s, `))`) { + return cty.StringVal(s[1:]), nil // escaped HCL literal + } + return parseBpLit(s) +} + func (y *YamlValue) unmarshalObject(n *yaml.Node) error { var my map[string]YamlValue if err := n.Decode(&my); err != nil { @@ -318,10 +327,25 @@ func (d *Dict) UnmarshalYAML(n *yaml.Node) error { // MarshalYAML implements custom YAML marshaling. func (d Dict) MarshalYAML() (interface{}, error) { o, _ := cty.Transform(d.AsObject(), func(p cty.Path, v cty.Value) (cty.Value, error) { + if v.IsNull() { + return v, nil + } if e, is := IsExpressionValue(v); is { s := string(hclwrite.Format(e.Tokenize().Bytes())) return cty.StringVal("((" + s + "))"), nil } + if v.Type() == cty.String { + // Need to escape back the non-expressions (both HCL and blueprint ones) + s := v.AsString() + if isHCLLiteral(s) { + // yaml: "\((foo))" -unmarshal-> cty: "((foo))" -marshall-> yaml: "\((foo))" + // NOTE: don't attempt to escape both HCL and blueprint expressions + // they don't get unmarshalled together, terminate here + return cty.StringVal(`\` + s), nil + } + // yaml: "\$(var.foo)" -unmarshal-> cty: "$(var.foo)" -marshall-> yaml: "\$(var.foo)" + return cty.StringVal(strings.ReplaceAll(s, `$(`, `\$(`)), nil + } return v, nil }) diff --git a/pkg/modulewriter/modulewriter_test.go b/pkg/modulewriter/modulewriter_test.go index 39e599ccc4..cd38aa07f5 100644 --- a/pkg/modulewriter/modulewriter_test.go +++ b/pkg/modulewriter/modulewriter_test.go @@ -529,27 +529,6 @@ func (s *MySuite) TestWritePackerAutoVars(c *C) { } -func (s *zeroSuite) TestStringEscape(c *C) { - f := func(s string) string { - toks := config.TokensForValue(cty.StringVal(s)) - return string(toks.Bytes()) - } - // LiteralVariables - c.Check(f(`\((not.var))`), Equals, `"((not.var))"`) - c.Check(f(`abc\((not.var))abc`), Equals, `"abc((not.var))abc"`) - c.Check(f(`abc \((not.var)) abc`), Equals, `"abc ((not.var)) abc"`) - c.Check(f(`abc \((not.var1)) abc \((not.var2)) abc`), Equals, `"abc ((not.var1)) abc ((not.var2)) abc"`) - c.Check(f(`abc \\((escape.backslash))`), Equals, `"abc \\((escape.backslash))"`) - - // BlueprintVariables - c.Check(f(`\$(not.var)`), Equals, `"$(not.var)"`) - c.Check(f(`abc\$(not.var)abc`), Equals, `"abc$(not.var)abc"`) - c.Check(f(`abc \$(not.var) abc`), Equals, `"abc $(not.var) abc"`) - c.Check(f(`abc \$(not.var1) abc \$(not.var2) abc`), Equals, `"abc $(not.var1) abc $(not.var2) abc"`) - c.Check(f(`abc \\$(escape.backslash)`), Equals, `"abc \\$(escape.backslash)"`) - -} - func (s *zeroSuite) TestDeploymentSource(c *C) { { // git m := config.Module{Kind: config.TerraformKind, Source: "github.com/x/y.git"} From 3e61febc3fd0b93ef60f3d33431cfe9b029fbc73 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 18 Jan 2024 12:30:52 -0800 Subject: [PATCH 063/151] UX. 
Enable output colorization by default (#2145) --- cmd/color.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/color.go b/cmd/color.go index 9eab8ef6f2..171c5767c4 100644 --- a/cmd/color.go +++ b/cmd/color.go @@ -30,7 +30,7 @@ func init() { } func addColorFlag(flagset *pflag.FlagSet) { - flagset.BoolVar(&noColorFlag, "no-color", true, "Disable colorized output.") + flagset.BoolVar(&noColorFlag, "no-color", false, "Disable colorized output.") } func initColor() { From 3d5958f12b866385972d5c48ffca3ca5b8c2a8fd Mon Sep 17 00:00:00 2001 From: Mark Olson <115657904+mark-olson@users.noreply.github.com> Date: Fri, 19 Jan 2024 08:07:54 -0800 Subject: [PATCH 064/151] Update DAOS blueprints to use google-cloud-daos v0.5.0, slurm v6 [DAOSGCP-182](https://daosio.atlassian.net/browse/DAOSGCP-182) - Bump version of DAOS modules to v0.5.0 which install DAOS v2.4 - Modify community/examples/intel/hpc-slurm-daos.yaml to use Slurm v6 modules - Add temporary fix to community/examples/intel/hpc-slurm-daos.yaml to work around issue with missing lustre-client 8.8 repo - Update community/examples/intel/README.md to account for changes in DAOS v2.4 Signed-off-by: Mark Olson <115657904+mark-olson@users.noreply.github.com> --- community/examples/intel/README.md | 145 +++++++++-------- community/examples/intel/hpc-slurm-daos.yaml | 158 ++++++++++++------- community/examples/intel/pfs-daos.yaml | 44 +++--- 3 files changed, 197 insertions(+), 150 deletions(-) diff --git a/community/examples/intel/README.md b/community/examples/intel/README.md index 439921e31a..8b0f4072d8 100644 --- a/community/examples/intel/README.md +++ b/community/examples/intel/README.md @@ -4,12 +4,6 @@ - [Intel Solutions for the HPC Toolkit](#intel-solutions-for-the-hpc-toolkit) - - [Intel-Optimized Slurm Cluster](#intel-optimized-slurm-cluster) - - [Initial Setup for the Intel-Optimized Slurm Cluster](#initial-setup-for-the-intel-optimized-slurm-cluster) - - [Deploy the Slurm Cluster](#deploy-the-slurm-cluster) - - [Connect to the login node](#connect-to-the-login-node) - - [Access the cluster and provision an example job](#access-the-cluster-and-provision-an-example-job) - - [Delete the infrastructure when not in use](#delete-the-infrastructure-when-not-in-use) - [DAOS Cluster](#daos-cluster) - [Initial Setup for DAOS Cluster](#initial-setup-for-daos-cluster) - [Deploy the DAOS Cluster](#deploy-the-daos-cluster) @@ -17,7 +11,7 @@ - [Verify the DAOS storage system](#verify-the-daos-storage-system) - [Create a DAOS Pool and Container](#create-a-daos-pool-and-container) - [About the DAOS Command Line Tools](#about-the-daos-command-line-tools) - - [Determine Free Space](#determine-free-space) + - [View Free Space](#view-free-space) - [Create a Pool](#create-a-pool) - [Create a Container](#create-a-container) - [Mount the DAOS Container](#mount-the-daos-container) @@ -47,16 +41,22 @@ for general information on building custom images using the Toolkit. Identify a project to work in and substitute its unique id wherever you see `<>` in the instructions below. +[google-cloud-daos]: https://github.com/daos-stack/google-cloud-daos +[pre-deployment_guide]: https://github.com/daos-stack/google-cloud-daos/blob/main/docs/pre-deployment_guide.md +[DAOS Yum Repository]: https://packages.daos.io + ### Initial Setup for DAOS Cluster Before provisioning the DAOS cluster you must follow the steps listed in the [Google Cloud DAOS Pre-deployment Guide][pre-deployment_guide]. 
Skip the "Build DAOS Images" step at the end of the [Pre-deployment Guide][pre-deployment_guide]. The [pfs-daos.yaml](pfs-daos.yaml) blueprint will build the images as part of the deployment. -The Pre-deployment Guide provides instructions for enabling service accounts, APIs, establishing minimum resource quotas and other necessary steps to prepare your project. - -[google-cloud-daos]: https://github.com/daos-stack/google-cloud-daos -[pre-deployment_guide]: https://github.com/daos-stack/google-cloud-daos/blob/main/docs/pre-deployment_guide.md +The Pre-deployment Guide provides instructions for: +- installing the Google Cloud CLI +- enabling service accounts +- enabling APIs +- establishing minimum resource quotas +- creating a Cloud NAT to allow instances without public IPs to access the [DAOS Yum Repository] repository. ### Deploy the DAOS Cluster @@ -98,7 +98,7 @@ ghpc deploy pfs-daos --auto-approve The `community/examples/intel/pfs-daos.yaml` blueprint does not contain configuration for DAOS pools and containers. Therefore, pools and containers will need to be created manually. -Before pools and containers can be created the storage system must be formatted. Formatting the storage is done automatically by the startup script that runs on the *daos-server-0001* instance. The startup script will run the [dmg storage format](https://docs.daos.io/v2.2/admin/deployment/?h=dmg+storage#storage-formatting) command. It may take a few minutes for all daos server instances to join. +Before pools and containers can be created the storage system must be formatted. Formatting the storage is done automatically by the startup script that runs on the *daos-server-0001* instance. The startup script will run the [dmg storage format](https://docs.daos.io/v2.4/admin/deployment/?h=dmg+storage#storage-formatting) command. It may take a few minutes for all daos server instances to join. Verify that the storage system has been formatted and that the daos-server instances have joined. @@ -123,35 +123,24 @@ Both daos-server instances should show a state of *Joined*. #### About the DAOS Command Line Tools -The DAOS Management tool `dmg` is used by System Administrators to manage the DAOS storage [system](https://docs.daos.io/v2.2/overview/architecture/#daos-system) and DAOS [pools](https://docs.daos.io/v2.2/overview/storage/#daos-pool). Therefore, `sudo` must be used when running `dmg`. +The DAOS Management tool `dmg` is used by System Administrators to manage the DAOS storage [system](https://docs.daos.io/v2.4/overview/architecture/#daos-system) and DAOS [pools](https://docs.daos.io/v2.4/overview/storage/#daos-pool). Therefore, `sudo` must be used when running `dmg`. -The DAOS CLI `daos` is used by both users and System Administrators to create and manage [containers](https://docs.daos.io/v2.2/overview/storage/#daos-container). It is not necessary to use `sudo` with the `daos` command. +The DAOS CLI `daos` is used by both users and System Administrators to create and manage [containers](https://docs.daos.io/v2.4/overview/storage/#daos-container). It is not necessary to use `sudo` with the `daos` command. -#### Determine Free Space +#### View Free Space -Determine how much free space is available. +View how much free space is available. 
```bash
sudo dmg storage query usage
```

#### Create a Pool

Create a single pool owned by root which uses 100% of the available free space.

```bash
sudo dmg pool create --size=100% --user=root pool1
```

Set ACLs to allow any user to create a container in *pool1*.

```bash
sudo dmg pool update-acl -e A::EVERYONE@:rcta pool1
```

See the [Pool Operations](https://docs.daos.io/v2.4/admin/pool_operations) section of the DAOS Administration Guide for more information about creating pools.

#### Create a Container

and how it will be used. The ACLs will need to be set properly to allow users and groups to access the container.

For the purpose of this demo create the container without specifying ACLs. The container will be owned by your user account and you will have full access to the container.

```bash
daos container create --type=POSIX --properties=rf:0 pool1 cont1
```

See the [Container Management](https://docs.daos.io/v2.4/user/container) section of the DAOS User Guide for more information about creating containers.

#### Mount the DAOS Container

Mount the container with dfuse (DAOS Fuse)

```bash
mkdir -p "${HOME}/daos/cont1"
dfuse --singlethread --pool=pool1 --container=cont1 --mountpoint="${HOME}/daos/cont1"
```

Verify that the container is mounted

```bash
df -h -t fuse.daos
```

Create a 20GiB file which will be stored in the DAOS filesystem.

```bash
time LD_PRELOAD=/usr/lib64/libioil.so \
  dd if=/dev/zero of="${HOME}/daos/cont1/test20GiB.img" iflag=fullblock bs=1G count=20
```

**Known Issue:**

When you run `ls -lh "${HOME}/daos/cont1"` you may see that the `test20GiB.img` file shows a size of 0 bytes.

If you unmount the container and mount it again, the file size will show as 20G.

```bash
fusermount3 -u "${HOME}/daos/cont1"
dfuse --singlethread --pool=pool1 --container=cont1 --mountpoint="${HOME}/daos/cont1"
ls -lh "${HOME}/daos/cont1"
```

A work-around for this issue is to disable caching when mounting the container.

```bash
dfuse --singlethread --disable-caching --pool=pool1 --container=cont1 --mountpoint="${HOME}/daos/cont1"
```

See the [File System](https://docs.daos.io/v2.4/user/filesystem/) section of the DAOS User Guide for more information about DFuse.

### Unmount the DAOS Container

The container will need to be unmounted with `fusermount3 -u "${HOME}/daos/cont1"` before you log out. If this is not done it can leave open file handles and prevent the container from being mounted when you log in again.

Verify that the container is unmounted

```bash
df -h -t fuse.daos
```

Log out of the DAOS client instance.

```bash
logout
```

See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#dfuse-daos-fuse) section of the DAOS User Guide for more information about mounting POSIX containers.

### Delete the DAOS infrastructure when not in use

> **_NOTE:_** Data stored in the DAOS container will be permanently lost after cluster deletion.

Delete the remaining infrastructure

```bash
ghpc destroy pfs-daos --auto-approve
```

## DAOS Server with Slurm cluster

The [hpc-slurm-daos.yaml](hpc-slurm-daos.yaml) blueprint can be used to deploy a Slurm cluster and four DAOS server instances. The Slurm compute instances are configured as DAOS clients.

The blueprint uses modules from

- [google-cloud-daos][google-cloud-daos]
- [community/modules/compute/schedmd-slurm-gcp-v6-nodeset][schedmd-slurm-gcp-v6-nodeset]
- [community/modules/compute/schedmd-slurm-gcp-v6-partition][schedmd-slurm-gcp-v6-partition]
- [community/modules/scheduler/schedmd-slurm-gcp-v6-login][schedmd-slurm-gcp-v6-login]
- [community/modules/scheduler/schedmd-slurm-gcp-v6-controller][schedmd-slurm-gcp-v6-controller]

The blueprint also uses a Packer template from the [Google Cloud DAOS][google-cloud-daos] repository. Please review the [introduction to image building](../../../docs/image-building.md) for general information on building custom images using the Toolkit.

Substitute your project ID wherever you see `<>` in the instructions below.

### Initial Setup for the DAOS/Slurm cluster

Before provisioning the DAOS cluster you must follow the steps listed in the [Google Cloud DAOS Pre-deployment Guide][pre-deployment_guide].

Skip the "Build DAOS Images" step at the end of the [Pre-deployment Guide][pre-deployment_guide]. The [hpc-slurm-daos.yaml](hpc-slurm-daos.yaml) blueprint will build the DAOS server image as part of the deployment.

-The Pre-deployment Guide provides instructions for enabling service accounts, APIs, establishing minimum resource quotas and other necessary steps to prepare your project for DAOS server deployment.
+The [Pre-deployment Guide][pre-deployment_guide] provides instructions for enabling service accounts, APIs, establishing minimum resource quotas and other necessary steps to prepare your project for DAOS server deployment. [google-cloud-daos]: https://github.com/daos-stack/google-cloud-daos [pre-deployment_guide]: https://github.com/daos-stack/google-cloud-daos/blob/main/docs/pre-deployment_guide.md - [packer-template]: https://github.com/daos-stack/google-cloud-daos/blob/main/images/daos.pkr.hcl [apis]: ../../../README.md#enable-gcp-apis -[SchedMD-slurm-on-gcp-controller]: ../../modules/scheduler/SchedMD-slurm-on-gcp-controller -[SchedMD-slurm-on-gcp-login-node]: ../../modules/scheduler/SchedMD-slurm-on-gcp-login-node -[SchedMD-slurm-on-gcp-partition]: ../../modules/compute/SchedMD-slurm-on-gcp-partition +[schedmd-slurm-gcp-v6-nodeset]: ../../modules/compute/schedmd-slurm-gcp-v6-nodeset +[schedmd-slurm-gcp-v6-partition]: ../../modules/compute/schedmd-slurm-gcp-v6-partition +[schedmd-slurm-gcp-v6-controller]: ../../modules/scheduler/schedmd-slurm-gcp-v6-controller +[schedmd-slurm-gcp-v6-login]: ../../modules/scheduler/schedmd-slurm-gcp-v6-login Follow the Toolkit guidance to enable [APIs][apis] and establish minimum resource [quotas][quotas] for Slurm. @@ -301,7 +304,7 @@ The `--backend-config` option is not required but recommended. It will save the Follow `ghpc` instructions to deploy the environment ```text -ghpc deploy daos-slurm --auto-approve +ghpc deploy hpc-slurm-daos --auto-approve ``` [backend]: ../../../examples/README.md#optional-setting-up-a-remote-terraform-state @@ -319,7 +322,7 @@ Once the startup script has completed and Slurm reports readiness, connect to th Select the project in which the cluster will be provisionsd. -2. Click on the `SSH` button associated with the `slurm-daos-slurm-login0` +2. Click on the `SSH` button associated with the `hpcslurmda-login-login-001` instance. This will open a separate pop up window with a terminal into our newly created @@ -334,10 +337,12 @@ You will need to create your own DAOS container in the pool that can be used by While logged into the login node create a container named `cont1` in the `pool1` pool: ```bash -daos cont create --type=POSIX --properties=rf:0 --label=cont1 pool1 +daos cont create --type=POSIX --properties=rf:0 pool1 cont1 ``` -Since the `cont1` container is owned by your account, your Slurm jobs will need to run as your user account in order to access the container. +NOTE: If you encounter an error `daos: command not found`, it's likely that the startup scripts have not finished running yet. Wait a few minutes and try again. + +Since the `cont1` container is owned by your account, your Slurm jobs will need to run as your user account to access the container. Create a mount point for the container and mount it with dfuse (DAOS Fuse) @@ -389,6 +394,7 @@ echo "Job ${SLURM_JOB_ID} running on ${JOB_HOSTNAME}" | tee "${MOUNT_DIR}/${TIME echo "${JOB_HOSTNAME} : Unmounting dfuse" fusermount3 -u "${MOUNT_DIR}" + ``` Run the `daos_job.sh` script in an interactive Slurm job on 4 nodes @@ -426,21 +432,20 @@ Verify that the container is unmounted df -h -t fuse.daos ``` -See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.2/user/filesystem/?h=dfuse#dfuse-daos-fuse) section of the DAOS User Guide for more information about mounting POSIX containers. +See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#dfuse-daos-fuse) section of the DAOS User Guide for more information about mounting POSIX containers. 
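
Before tearing the cluster down, note that the interactive `srun` invocation above can also be run non-interactively. A minimal sketch, assuming the `daos_job.sh` script and `compute` partition from this walkthrough, and that the script begins with a `#!/bin/bash` line:

```bash
# Submit daos_job.sh as a batch job on four nodes;
# by default Slurm writes output to slurm-<jobid>.out.
sbatch --nodes=4 --partition=compute daos_job.sh

# Watch the job state while the auto-scaled nodes are created.
squeue
```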
### Delete the DAOS/Slurm Cluster infrastructure when not in use -> **_NOTE:_** All the DAOS data will be permanently lost after cluster deletion. +> **_NOTE:_** All data on the DAOS file system will be permanently lost after cluster deletion. - +> **_NOTE:_** If the Slurm controller is shut down before the auto-scale instances +> are destroyed those instances will be left running. -> **_NOTE:_** If the Slurm controller is shut down before the auto-scale nodes -> are destroyed then they will be left running. +Open your browser to the VM instances page and ensure that instances named "compute" +have been shutdown and deleted by the Slurm autoscaler. -Open your browser to the VM instances page and ensure that nodes named "compute" -have been shutdown and deleted by the Slurm autoscaler. Delete the remaining -infrastructure with `terraform`: +Delete the remaining infrastructure: ```shell -ghpc destroy daos-slurm --auto-approve +ghpc destroy hpc-slurm-daos --auto-approve ``` diff --git a/community/examples/intel/hpc-slurm-daos.yaml b/community/examples/intel/hpc-slurm-daos.yaml index cd79bdc203..acc99c9050 100644 --- a/community/examples/intel/hpc-slurm-daos.yaml +++ b/community/examples/intel/hpc-slurm-daos.yaml @@ -1,4 +1,4 @@ -# Copyright 2022 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,38 +18,42 @@ blueprint_name: hpc-slurm-daos vars: project_id: ## Set GCP Project ID Here ## - deployment_name: daos-slurm + deployment_name: hpc-slurm-daos region: us-central1 zone: us-central1-c - server_image_family: daos-server-hpc-rocky-8 - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md + daos_server_image_family: daos-server-hpc-rocky-8 + daos_version: "2.4" + tags: [] # Note: this blueprint assumes the existence of a default global network and # subnetwork in the region chosen above +validators: +- validator: test_module_not_used + inputs: {} + skip: true + deployment_groups: - group: primary modules: - id: network1 - source: modules/network/pre-existing-vpc + source: modules/network/vpc - id: homefs source: modules/file-system/filestore use: [network1] settings: - local_mount: "/home" + local_mount: /home - group: daos-server-image modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/v0.4.1/images + # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/images - id: daos-server-image - source: github.com/daos-stack/google-cloud-daos//images?ref=v0.4.1&depth=1 + source: "github.com/daos-stack/google-cloud-daos//images?ref=v0.5.0&depth=1" kind: packer settings: - daos_version: 2.2.0 - daos_repo_base_url: https://packages.daos.io + daos_version: $(vars.daos_version) + daos_repo_base_url: https://packages.daos.io/ daos_packages_repo_file: EL8/packages/x86_64/daos_packages.repo use_iap: true enable_oslogin: false @@ -63,26 +67,25 @@ deployment_groups: use_internal_ip: true omit_external_ip: false daos_install_type: server - image_family: $(vars.server_image_family) + image_family: $(vars.daos_server_image_family) - group: cluster modules: # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_server - id: daos - source: github.com/daos-stack/google-cloud-daos//terraform/modules/daos_server?ref=v0.4.1&depth=1 + source: 
"github.com/daos-stack/google-cloud-daos//terraform/modules/daos_server?ref=v0.5.0&depth=1" use: [network1] settings: labels: {ghpc_role: file-system} - # The default DAOS settings are optimized for TCO - # The following will tune this system for best perf machine_type: "n2-standard-16" - os_family: $(vars.server_image_family) + os_family: $(vars.daos_server_image_family) daos_disk_count: 4 - daos_scm_size: 45 + tags: $(vars.tags) pools: - name: "pool1" - size: "6.4TB" - tier_ratio: 3 + size: "100%" + # Do not set value for scm_size when size=100% + daos_scm_size: user: "root@" group: "root@" acls: @@ -98,67 +101,102 @@ deployment_groups: settings: runners: - type: shell - content: $(daos.daos_client_install_script) - destination: /tmp/daos_client_install.sh + destination: remove_lustre_client_repo.sh + content: | + #!/bin/bash + rm -f /etc/yum.repos.d/lustre-client.repo + dnf clean all --verbose + rm -rf /var/cache/dnf/* + dnf makecache - type: data content: $(daos.daos_agent_yml) destination: /etc/daos/daos_agent.yml - type: data content: $(daos.daos_control_yml) destination: /etc/daos/daos_control.yml + - type: shell + content: $(daos.daos_client_install_script) + destination: /tmp/daos_client_install.sh - type: shell content: $(daos.daos_client_config_script) - destination: /var/daos/daos_client_config.sh + destination: /tmp/daos_client_config.sh + + - id: debug_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] + settings: + name: ns1 + node_count_dynamic_max: 4 + machine_type: n2-standard-2 + enable_placement: false # the default is: true + service_account: + email: null + scopes: + - "https://www.googleapis.com/auth/monitoring.write" + - "https://www.googleapis.com/auth/logging.write" + - "https://www.googleapis.com/auth/devstorage.read_only" + - "https://www.googleapis.com/auth/cloud-platform" - ## This debug_partition will work out of the box without requesting additional GCP quota. - id: debug_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [debug_nodeset, homefs] settings: partition_name: debug - max_node_count: 4 - enable_placement: false - machine_type: n2-standard-2 + exclusive: false # allows nodes to stay up after jobs are done + is_default: true - # This compute_partition is far more performant than debug_partition but may require requesting GCP quotas first. 
- - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] settings: - partition_name: compute - max_node_count: 20 + name: ns2 + node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled + service_account: + email: null + scopes: + - "https://www.googleapis.com/auth/monitoring.write" + - "https://www.googleapis.com/auth/logging.write" + - "https://www.googleapis.com/auth/devstorage.read_only" + - "https://www.googleapis.com/auth/cloud-platform" - - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - debug_partition # debug partition will be default as it is listed first - - compute_partition - - daos-client-script + - id: compute_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [compute_nodeset, homefs] settings: - login_node_count: 1 - compute_node_scopes: - - "https://www.googleapis.com/auth/monitoring.write" - - "https://www.googleapis.com/auth/logging.write" - - "https://www.googleapis.com/auth/devstorage.read_only" - - "https://www.googleapis.com/auth/cloud-platform" + partition_name: compute - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network1] + settings: + name_prefix: login + machine_type: n2-standard-4 + disable_login_public_ips: false + tags: $(vars.tags) + service_account: + email: null + scopes: + - "https://www.googleapis.com/auth/monitoring.write" + - "https://www.googleapis.com/auth/logging.write" + - "https://www.googleapis.com/auth/devstorage.read_only" + - "https://www.googleapis.com/auth/cloud-platform" + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network1 + - debug_partition + - compute_partition + - slurm_login - homefs - - slurm_controller - daos-client-script settings: - login_scopes: - - "https://www.googleapis.com/auth/monitoring.write" - - "https://www.googleapis.com/auth/logging.write" - - "https://www.googleapis.com/auth/devstorage.read_only" - - "https://www.googleapis.com/auth/cloud-platform" + disable_controller_public_ips: false + compute_startup_script: $(daos-client-script.startup_script) + controller_startup_script: $(daos-client-script.startup_script) + login_startup_script: $(daos-client-script.startup_script) + compute_startup_scripts_timeout: 1000 + controller_startup_scripts_timeout: 1000 + login_startup_scripts_timeout: 1000 + tags: $(vars.tags) diff --git a/community/examples/intel/pfs-daos.yaml b/community/examples/intel/pfs-daos.yaml index 648aba9403..3abf5c9778 100644 --- a/community/examples/intel/pfs-daos.yaml +++ b/community/examples/intel/pfs-daos.yaml @@ -1,4 +1,4 @@ -# Copyright 2022 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -21,11 +21,10 @@ vars: deployment_name: pfs-daos region: us-central1 zone: us-central1-c - server_image_family: daos-server-hpc-rocky-8 - client_image_family: daos-client-hpc-rocky-8 - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md + daos_server_image_family: daos-server-hpc-rocky-8 + daos_client_image_family: daos-client-hpc-rocky-8 + daos_version: "2.4" + tags: [] # Note: this blueprint assumes the existence of a default global network and # subnetwork in the region chosen above @@ -38,12 +37,12 @@ deployment_groups: - group: daos-server-image modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/v0.4.1/images + # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/images - id: daos-server-image - source: github.com/daos-stack/google-cloud-daos//images?ref=v0.4.1&depth=1 + source: "github.com/daos-stack/google-cloud-daos//images?ref=v0.5.0&depth=1" kind: packer settings: - daos_version: 2.2.0 + daos_version: $(vars.daos_version) daos_repo_base_url: https://packages.daos.io daos_packages_repo_file: EL8/packages/x86_64/daos_packages.repo use_iap: true @@ -58,16 +57,16 @@ deployment_groups: use_internal_ip: true omit_external_ip: false daos_install_type: server - image_family: $(vars.server_image_family) + image_family: $(vars.daos_server_image_family) - group: daos-client-image modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/v0.4.1/images + # more info: https://github.com/daos-stack/google-cloud-daos/tree/v0.5.0/images - id: daos-client-image - source: github.com/daos-stack/google-cloud-daos//images?ref=v0.4.1&depth=1 + source: "github.com/daos-stack/google-cloud-daos//images?ref=v0.5.0&depth=1" kind: packer settings: - daos_version: 2.2.0 + daos_version: $(vars.daos_version) daos_repo_base_url: https://packages.daos.io daos_packages_repo_file: EL8/packages/x86_64/daos_packages.repo use_iap: true @@ -82,24 +81,29 @@ deployment_groups: use_internal_ip: true omit_external_ip: false daos_install_type: client - image_family: $(vars.client_image_family) + image_family: $(vars.daos_client_image_family) - group: daos-cluster modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/v0.4.1/terraform/modules/daos_server + # more info: https://github.com/daos-stack/google-cloud-daos/tree/develop/terraform/modules/daos_server - id: daos-server - source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_server?ref=v0.4.1&depth=1 + # source: $(vars.daos_server_module_source_url) + source: "github.com/daos-stack/google-cloud-daos//terraform/modules/daos_server?ref=v0.5.0&depth=1" use: [network1] settings: number_of_instances: 2 labels: {ghpc_role: file-system} - os_family: $(vars.server_image_family) + os_family: $(vars.daos_server_image_family) + daos_scm_size: "172" + tags: $(vars.tags) - # more info: https://github.com/daos-stack/google-cloud-daos/tree/v0.4.1/terraform/modules/daos_client + # more info: https://github.com/daos-stack/google-cloud-daos/tree/develop/terraform/modules/daos_client - id: daos-client - source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_client?ref=v0.4.1&depth=1 + # source: $(vars.daos_client_module_source_url) + source: "github.com/daos-stack/google-cloud-daos//terraform/modules/daos_client?ref=v0.5.0&depth=1" use: [network1, daos-server] settings: number_of_instances: 2 labels: {ghpc_role: compute} - os_family: $(vars.client_image_family) + 
os_family: $(vars.daos_client_image_family) + tags: $(vars.tags) From 91255e69127d04642bc32c448235ce918a812635 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Fri, 19 Jan 2024 17:07:33 +0000 Subject: [PATCH 065/151] Added validation and error message to login_startup_scripts_timeout because it is broken --- .../scheduler/schedmd-slurm-gcp-v5-controller/variables.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index a36449123b..f251c20765 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -128,6 +128,11 @@ variable "login_startup_scripts_timeout" { EOD type = number default = 300 + + validation { + condition = var.login_startup_scripts_timeout == 300 + error_message = "Changes to login_startup_scripts_timeout (default: 300s) are not respected, this will be fixed in a later release" + } } variable "cgroup_conf_tpl" { From 17399f0b40fd17a914ed126a2307412b9ba04ea4 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 17 Jan 2024 05:07:35 +0000 Subject: [PATCH 066/151] Update spack gromac example tutorial and reference to use Slurm V6 --- docs/tutorials/gromacs/spack-gromacs.md | 67 +++++++++-------------- docs/tutorials/gromacs/spack-gromacs.yaml | 52 ++++++++++-------- 2 files changed, 54 insertions(+), 65 deletions(-) diff --git a/docs/tutorials/gromacs/spack-gromacs.md b/docs/tutorials/gromacs/spack-gromacs.md index c8719aaba7..ce8400e1e5 100644 --- a/docs/tutorials/gromacs/spack-gromacs.md +++ b/docs/tutorials/gromacs/spack-gromacs.md @@ -5,7 +5,7 @@ easy for customers to deploy HPC environments on Google Cloud. In this tutorial you will use the HPC Toolkit to: -* Deploy a [Slurm](https://github.com/SchedMD/slurm-gcp#readme) HPC cluster on +* Deploy a [Slurm](https://github.com/GoogleCloudPlatform/slurm-gcp#readme) HPC cluster on Google Cloud * Use [Spack](https://spack.io/) to install the Gromacs application and all of its dependencies @@ -13,10 +13,10 @@ In this tutorial you will use the HPC Toolkit to: cluster * Tear down the cluster -Estimated time to complete: -The tutorial takes 2 hr. to complete, -of which 1.5 hr is for installing software -(without cache). +Estimated time to complete: +The tutorial takes 2 hr. to complete, +of which 1.5 hr is for installing software +(without cache). > **_NOTE:_** With a complete Spack cache, the tutorial takes 30 min. @@ -75,7 +75,7 @@ which should be open in the Cloud Shell Editor (on the left). This file describes the cluster you will deploy. It defines: -* the existing default network from your project +* a vpc network * a monitoring dashboard with metrics on your cluster * a definition of a custom Spack installation * a startup script that @@ -84,7 +84,6 @@ This file describes the cluster you will deploy. It defines: * sets up a Spack environment including downloading an example input deck * places a submission script on a shared drive * a Slurm cluster - * a Slurm login node * a Slurm controller * An auto-scaling Slurm partition @@ -106,27 +105,13 @@ contains the terraform needed to deploy your cluster. ## Deploy the Cluster -Use the following commands to run terraform and deploy your cluster. +Use below command to deploy your cluster. 
```bash -terraform -chdir=spack-gromacs/primary init -terraform -chdir=spack-gromacs/primary apply +./ghpc deploy spack-gromacs ``` -The `terraform apply` command will generate a _plan_ that describes the Google -Cloud resources that will be deployed. - -You can review the plan and then start the deployment by typing -**`yes [enter]`**. - -The deployment will take about 30 seconds. There should be regular status updates -in the terminal. - -If the `apply` is successful, a message similar to the following will be -displayed: - - - +After the deployment is finished, you should see below message. ```shell Apply complete! Resources: xx added, 0 changed, 0 destroyed. @@ -144,30 +129,30 @@ controller. This command can be used to view progress and check for completion of the startup script: ```bash -gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project slurm-spack-gromacs-controller | grep google_metadata_script_runner +gcloud compute instances get-serial-port-output --port 1 --zone us-central1-c --project spackgroma-controller | grep google_metadata_script_runner ``` When the startup script has finished running you will see the following line as the final output from the above command: -> _`slurm-spack-gromacs-controller google_metadata_script_runner: Finished running startup scripts.`_ +> _`spackgroma-controller google_metadata_script_runner: Finished running startup scripts.`_ Optionally while you wait, you can see your deployed VMs on Google Cloud Console. Open the link below in a new window. Look for -`slurm-spack-gromacs-controller` and `slurm-spack-gromacs-login0`. If you don't +`spackgroma-controller`. If you don't see your VMs make sure you have the correct project selected (top left). ```text https://console.cloud.google.com/compute?project= ``` -## Connecting to the login node +## Connecting to the controller node -Once the startup script has completed, connect to the login node. +Once the startup script has completed, connect to the controller node. -Use the following command to ssh into the login node from cloud shell: +Use the following command to ssh into the controller node from cloud shell: ```bash -gcloud compute ssh slurm-spack-gromacs-login0 --zone us-central1-c --project +gcloud compute ssh spackgroma-controller --zone us-central1-c --project ``` You may be prompted to set up SSH. If so follow the prompts and if asked for a @@ -191,15 +176,15 @@ following instructions: https://console.cloud.google.com/compute?project= ``` -1. Click on the `SSH` button associated with the `slurm-spack-gromacs-login0` +1. Click on the `SSH` button associated with the `spackgroma-controller` instance. This will open a separate pop up window with a terminal into our newly - created Slurm login VM. + created Slurm controller VM. ## Run a Job on the Cluster - **The commands below should be run on the Slurm login node.** + **The commands below should be run on the Slurm controller node.** We will use the submission script (see line 122 of the blueprint) to submit a Gromacs job. @@ -213,7 +198,7 @@ Gromacs job. 2. Submit the job to Slurm to be scheduled: ```bash - sbatch /apps/gromacs/submit_gromacs.sh + sbatch /opt/apps/gromacs/submit_gromacs.sh ``` 3. Once submitted, you can watch the job progress by repeatedly calling the @@ -227,7 +212,7 @@ The `sbatch` command trigger Slurm to auto-scale up several nodes to run the job You can refresh the `Compute Engine` > `VM instances` page and see that additional VMs are being/have been created. 
These will be named something like -`slurm-spack-gromacs-compute-0-0`. +`spackgroma-comput-0`. When running `squeue`, observe the job status start as `CF` (configuring), change to `R` (running) once the compute VMs have been created, and finally `CG` @@ -247,8 +232,8 @@ about 5 minutes to run. Several files will have been generated in the `test_run/` folder you created. The `md.log` and `slurm-1.out` files have information on the run such as -performance. You can view these files by running the following commandsq on the -login node: +performance. You can view these files by running the following commands on the +controller node: ```bash cat slurm-*.out @@ -273,9 +258,9 @@ https://console.cloud.google.com/monitoring/dashboards?project= **_NOTE:_** If you are accessing the login node terminal via a separate pop-up +> **_NOTE:_** If you are accessing the controller node terminal via a separate pop-up > then make sure to call `exit` in the pop-up window. ```bash @@ -285,7 +270,7 @@ exit Run the following command in the cloud shell terminal to destroy the cluster: ```bash -terraform -chdir=spack-gromacs/primary destroy -auto-approve +./ghpc destroy spack-gromacs ``` When complete you should see something like: diff --git a/docs/tutorials/gromacs/spack-gromacs.yaml b/docs/tutorials/gromacs/spack-gromacs.yaml index fe5bf475b1..285443c0b8 100644 --- a/docs/tutorials/gromacs/spack-gromacs.yaml +++ b/docs/tutorials/gromacs/spack-gromacs.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: modules/network/pre-existing-vpc + source: modules/network/vpc - id: hpc_dash source: modules/monitoring/dashboard @@ -35,8 +35,8 @@ deployment_groups: - id: spack-setup source: community/modules/scripts/spack-setup settings: - install_dir: /apps/spack - spack_ref: v0.19.0 + install_dir: /opt/apps/spack + spack_ref: v0.20.0 - id: spack-execute source: community/modules/scripts/spack-execute @@ -88,7 +88,7 @@ deployment_groups: # fi # spack buildcache keys --install --trust - spack config --scope defaults add config:build_stage:/apps/spack/spack-stage + spack config --scope defaults add config:build_stage:/opt/apps/spack/spack-stage spack config --scope defaults add -f /tmp/projections-config.yaml spack config --scope site add -f /tmp/slurm-external-config.yaml @@ -107,22 +107,26 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: + # remove lustre client temporary to avoid startup failure due to known + # issue. 
+ - type: shell + destination: remove_lustre_client.sh + content: | + #!/bin/bash + rm /etc/yum.repos.d/lustre-client.repo - $(spack-execute.spack_runner) - type: shell destination: setup_gromacs.sh content: | #!/bin/bash - source /apps/spack/share/spack/setup-env.sh + source /opt/apps/spack/share/spack/setup-env.sh spack env activate gromacs - chmod -R a+rwX /apps/spack/var/spack/environments/gromacs - mkdir -p /apps/gromacs - chmod a+rwx /apps/gromacs - cd /apps/gromacs + cd /opt/apps/gromacs wget --no-verbose https://ftp.gromacs.org/pub/benchmarks/water_GMX50_bare.tar.gz tar xzf water_GMX50_bare.tar.gz - type: data - destination: /apps/gromacs/submit_gromacs.sh + destination: /opt/apps/gromacs/submit_gromacs.sh content: | #!/bin/bash #SBATCH -N 2 @@ -131,36 +135,36 @@ deployment_groups: # Size can be 0000.65 0000.96 0001.5 0003 0006 0012 0024 0048 0096 0192 0384 0768 1536 3072 # Type can be 'pme' or 'rf' - source /apps/spack/share/spack/setup-env.sh + source /opt/apps/spack/share/spack/setup-env.sh spack env activate gromacs # Check that gmx_mpi exists which gmx_mpi cd $SLURM_SUBMIT_DIR - cp /apps/gromacs/water-cut1.0_GMX50_bare/1536/* . + cp /opt/apps/gromacs/water-cut1.0_GMX50_bare/1536/* . scontrol show hostnames ${SLURM_JOB_NODELIST} > hostfile gmx_mpi grompp -f pme.mdp -c conf.gro -p topol.top -o input.tpr mpirun -n 60 -hostfile hostfile -ppn 30 gmx_mpi mdrun -notunepme -dlb yes -v -resethway -noconfout -nsteps 4000 -s input.tpr + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] + settings: + node_count_dynamic_max: 20 + bandwidth_tier: gvnic_enabled + - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [compute_nodeset] settings: partition_name: compute - max_node_count: 20 - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network1 - compute_partition settings: + disable_controller_public_ips: false + controller_startup_scripts_timeout: 21600 controller_startup_script: $(controller-setup.startup_script) - login_node_count: 1 - - - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - slurm_controller From 214fcbd4a22484ccf374459b5c0359cfcac4f7e9 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 19 Jan 2024 15:09:10 -0800 Subject: [PATCH 067/151] Slurm6. Advance to 6.3.1 (#2146) --- .../schedmd-slurm-gcp-v6-controller/README.md | 22 +++++++++---------- .../controller.tf | 8 +++---- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../partition.tf | 8 +++---- .../slurm_files.tf | 2 +- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 05d8feee3f..33b371f149 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -125,17 +125,17 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 3.0 | -| [cleanup\_compute\_nodes](#module\_cleanup\_compute\_nodes) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes | 6.2.1 | -| [cleanup\_resource\_policies](#module\_cleanup\_resource\_policies) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies | 6.2.1 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.2.1 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.2.1 | -| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.2.1 | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 6.2.1 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.2.1 | -| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.2.1 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.2.1 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.2.1 | -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 6.2.1 | +| [cleanup\_compute\_nodes](#module\_cleanup\_compute\_nodes) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes | 6.3.1_20240118 | +| [cleanup\_resource\_policies](#module\_cleanup\_resource\_policies) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies | 6.3.1_20240118 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.3.1_20240118 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.1_20240118 | +| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.3.1_20240118 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 6.3.1_20240118 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.1_20240118 | +| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.3.1_20240118 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | 
github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.1_20240118 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.3.1_20240118 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 6.3.1_20240118 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 9ef0b3f790..5db82e74da 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -35,7 +35,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.2.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.1_20240118" count = local.have_template ? 0 : 1 project_id = var.project_id @@ -92,7 +92,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.2.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.3.1_20240118" access_config = !var.disable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false @@ -148,7 +148,7 @@ resource "google_secret_manager_secret_iam_member" "cloudsql_secret_accessor" { # Destroy all compute nodes on `terraform destroy` module "cleanup_compute_nodes" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes?ref=6.2.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes?ref=6.3.1_20240118" count = var.enable_cleanup_compute ? 1 : 0 slurm_cluster_name = local.slurm_cluster_name @@ -164,7 +164,7 @@ module "cleanup_compute_nodes" { # Destroy all resource policies on `terraform destroy` module "cleanup_resource_policies" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies?ref=6.2.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies?ref=6.3.1_20240118" count = var.enable_cleanup_compute ? 
1 : 0 slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 09d1a0a5d2..594e96184c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.2.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.1_20240118" for_each = { for x in var.login_nodes : x.name_prefix => x @@ -59,7 +59,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=6.2.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=6.3.1_20240118" for_each = { for x in var.login_nodes : x.name_prefix => x } project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index b2e030cf59..0e0ab5a471 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -21,7 +21,7 @@ locals { # NODESET module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.2.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.1_20240118" for_each = local.nodeset_map project_id = var.project_id @@ -60,7 +60,7 @@ module "slurm_nodeset_template" { } module "slurm_nodeset" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.2.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.3.1_20240118" for_each = local.nodeset_map instance_template_self_link = module.slurm_nodeset_template[each.key].self_link @@ -79,7 +79,7 @@ module "slurm_nodeset" { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.2.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.3.1_20240118" for_each = local.nodeset_tpu_map project_id = var.project_id @@ -101,7 +101,7 @@ module "slurm_nodeset_tpu" { # PARTITION module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=6.2.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=6.3.1_20240118" for_each = local.partition_map partition_nodeset = [for x in each.value.partition_nodeset : module.slurm_nodeset[x].nodeset_name if try(module.slurm_nodeset[x], null) != null] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index f6a264198b..abadd4d454 100644 --- 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -87,7 +87,7 @@ locals { } module "slurm_files" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.2.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.3.1_20240118" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name From 416f39171756b94d4f4d3d45b36a9592ce846889 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Sun, 21 Jan 2024 18:12:24 -0800 Subject: [PATCH 068/151] Add commands to verify monitoring agents are active --- modules/scripts/startup-script/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index 8b42319d90..7d9e027943 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -161,6 +161,16 @@ sudo bash add-logging-agent-repo.sh --also-install sudo service stackdriver-agent start ``` +You can test if one of the agents is running using the following commands: + +```bash +# For Cloud Ops Agent +sudo systemctl status google-cloud-ops-agent + +# For Stackdriver Agent +sudo service stackdriver-agent status +``` + ### Example ```yaml From 098b0984e453a5c1c7b84f072f4a93f22cb12ac6 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Sun, 21 Jan 2024 18:16:51 -0800 Subject: [PATCH 069/151] Copies python binaries instead of symlink for more isolated venv --- modules/scripts/startup-script/files/install_ansible.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/scripts/startup-script/files/install_ansible.sh b/modules/scripts/startup-script/files/install_ansible.sh index ba75665eb9..7a619d5cc3 100644 --- a/modules/scripts/startup-script/files/install_ansible.sh +++ b/modules/scripts/startup-script/files/install_ansible.sh @@ -186,7 +186,7 @@ main() { fi # Create pip virtual environment for HPC Toolkit - ${python_path} -m venv "${venv_path}" + ${python_path} -m venv "${venv_path}" --copies venv_python_path=${venv_path}/bin/python3 # Upgrade pip if necessary From f87e38a7db5025a206baaaec63e6765c62be7d65 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Sun, 21 Jan 2024 22:30:48 -0800 Subject: [PATCH 070/151] Increase dynamic node count to a more reasonable default value --- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md | 2 +- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 3ba74f0637..f3fc27d3c1 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -179,7 +179,7 @@ No modules. | [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | | [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set | `string` | n/a | yes | | [node\_conf](#input\_node\_conf) | Map of Slurm node line configuration. | `map(any)` | `{}` | no | -| [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of dynamic nodes allowed in this partition. 
| `number` | `1` | no | +| [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of dynamic nodes allowed in this partition. | `number` | `10` | no | | [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy.

Note: Placement groups are not supported when on\_host\_maintenance is set to "MIGRATE" and will be deactivated regardless of the value of enable\_placement. To support enable\_placement, ensure on\_host\_maintenance is
set to "TERMINATE". | `string` | `"TERMINATE"` | no | | [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index a947c6a441..9f0313696d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -32,7 +32,7 @@ variable "node_count_static" { variable "node_count_dynamic_max" { description = "Maximum number of dynamic nodes allowed in this partition." type = number - default = 1 + default = 10 } ## VM Definition From f282a74f8ed1664cbd3035a3933880f4759b0be9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 22 Jan 2024 11:45:12 +0000 Subject: [PATCH 071/151] Bump google.golang.org/api from 0.155.0 to 0.157.0 Bumps [google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.155.0 to 0.157.0. - [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.155.0...v0.157.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 21 ++++++++++----------- go.sum | 43 ++++++++++++++++++++++--------------------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/go.mod b/go.mod index 570ad47bee..8df4663595 100644 --- a/go.mod +++ b/go.mod @@ -16,7 +16,7 @@ require ( github.com/spf13/cobra v1.8.0 github.com/zclconf/go-cty v1.14.1 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa - google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3 // indirect + google.golang.org/genproto v0.0.0-20240102182953-50ed04b92917 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) @@ -28,7 +28,7 @@ require ( github.com/hashicorp/terraform-exec v0.20.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b - google.golang.org/api v0.155.0 + google.golang.org/api v0.157.0 ) require ( @@ -50,16 +50,16 @@ require ( go.opentelemetry.io/otel/metric v1.21.0 // indirect go.opentelemetry.io/otel/trace v1.21.0 // indirect golang.org/x/mod v0.14.0 // indirect - golang.org/x/sync v0.5.0 // indirect + golang.org/x/sync v0.6.0 // indirect golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.15.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20231211222908-989df2bf70f3 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20231212172506-995d672761c0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240102182953-50ed04b92917 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240116215550-a9fa1716bcac // indirect gopkg.in/yaml.v2 v2.4.0 // indirect ) require ( - cloud.google.com/go v0.110.10 // indirect + cloud.google.com/go v0.111.0 // indirect cloud.google.com/go/compute/metadata v0.2.3 // indirect cloud.google.com/go/iam v1.1.5 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect @@ -95,14 +95,13 @@ require ( github.com/ulikunitz/xz v0.5.10 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.17.0 // indirect - 
golang.org/x/net v0.19.0 // indirect - golang.org/x/oauth2 v0.15.0 // indirect + golang.org/x/crypto v0.18.0 // indirect + golang.org/x/net v0.20.0 // indirect + golang.org/x/oauth2 v0.16.0 // indirect golang.org/x/sys v0.16.0 golang.org/x/text v0.14.0 // indirect - golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.8 // indirect google.golang.org/grpc v1.60.1 // indirect - google.golang.org/protobuf v1.31.0 // indirect + google.golang.org/protobuf v1.32.0 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect ) diff --git a/go.sum b/go.sum index 8f7fc6ae94..b561ae67b4 100644 --- a/go.sum +++ b/go.sum @@ -30,8 +30,8 @@ cloud.google.com/go v0.100.2/go.mod h1:4Xra9TjzAeYHrl5+oeLlzbM2k3mjVhZh4UqTZ//w9 cloud.google.com/go v0.102.0/go.mod h1:oWcCzKlqJ5zgHQt9YsaeTY9KzIvjyy0ArmiBUgpQ+nc= cloud.google.com/go v0.102.1/go.mod h1:XZ77E9qnTEnrgEOvr4xzfdX5TRo7fB4T2F4O6+34hIU= cloud.google.com/go v0.104.0/go.mod h1:OO6xxXdJyvuJPcEPBLN9BJPD+jep5G1+2U5B5gkRYtA= -cloud.google.com/go v0.110.10 h1:LXy9GEO+timppncPIAZoOj3l58LIU9k+kn48AN7IO3Y= -cloud.google.com/go v0.110.10/go.mod h1:v1OoFqYxiBkUrruItNM3eT4lLByNjxmJSV/xDKJNnic= +cloud.google.com/go v0.111.0 h1:YHLKNupSD1KqjDbQ3+LVdQ81h/UJbJyZG203cEfnQgM= +cloud.google.com/go v0.111.0/go.mod h1:0mibmpKP1TyOOFYQY5izo0LnT+ecvOQ0Sg3OdmMiNRU= cloud.google.com/go/aiplatform v1.22.0/go.mod h1:ig5Nct50bZlzV6NvKaTwmplLLddFx0YReh9WfTO5jKw= cloud.google.com/go/aiplatform v1.24.0/go.mod h1:67UUvRBKG6GTayHKV8DBv2RtR1t93YRu5B1P3x99mYY= cloud.google.com/go/analytics v0.11.0/go.mod h1:DjEWCu41bVbYcKyvlws9Er60YE4a//bK6mnhWvQeFNI= @@ -514,6 +514,7 @@ go.opentelemetry.io/otel v1.21.0 h1:hzLeKBZEL7Okw2mGzZ0cc4k/A7Fta0uoPgaJCr8fsFc= go.opentelemetry.io/otel v1.21.0/go.mod h1:QZzNPQPm1zLX4gZK4cMi+71eaorMSGT3A4znnUvNNEo= go.opentelemetry.io/otel/metric v1.21.0 h1:tlYWfeo+Bocx5kLEloTjbcDwBuELRrIFxwdQ36PlJu4= go.opentelemetry.io/otel/metric v1.21.0/go.mod h1:o1p3CA8nNHW8j5yuQLdc1eeqEaPfzug24uvsyIEJRWM= +go.opentelemetry.io/otel/sdk v1.19.0 h1:6USY6zH+L8uMH8L3t1enZPR3WFEmSTADlqldyHtJi3o= go.opentelemetry.io/otel/trace v1.21.0 h1:WD9i5gzvoUPuXIXH24ZNBudiarZDKuekPqi/E8fpfLc= go.opentelemetry.io/otel/trace v1.21.0/go.mod h1:LGbsEB0f9LGjN+OZaQQ26sohbOmiMR+BaslueVtS/qQ= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= @@ -526,8 +527,8 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.3.1-0.20221117191849-2c476679df9a/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= -golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k= -golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= +golang.org/x/crypto v0.18.0 h1:PGVlW0xEltQnzFZ55hkuX5+KLyrMYhHld1YHO4AKcdc= +golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -621,8 +622,8 @@ golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= golang.org/x/net v0.2.0/go.mod 
h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= -golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c= -golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= +golang.org/x/net v0.20.0 h1:aCL9BSgETF1k+blQaYUBx9hJ9LOGP3gAVemcZlf1Kpo= +golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -648,8 +649,8 @@ golang.org/x/oauth2 v0.0.0-20220822191816-0ebed06d0094/go.mod h1:h4gKUeWbJ4rQPri golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.1.0/go.mod h1:G9FE4dLTsbXUu90h/Pf85g4w1D+SSAgR+q46nJZ8M4A= -golang.org/x/oauth2 v0.15.0 h1:s8pnnxNVzjWyrvYdFUQq5llS1PX2zhPXmccZv99h7uQ= -golang.org/x/oauth2 v0.15.0/go.mod h1:q48ptWNTY5XWf+JNten23lcvHpLJ0ZSxF5ttTHKVCAM= +golang.org/x/oauth2 v0.16.0 h1:aDkGMBSYxElaoP81NpoUoz2oo2R2wHdZpGToUxfyQrQ= +golang.org/x/oauth2 v0.16.0/go.mod h1:hqZ+0LWXsiVoZpeld6jVt06P3adbS2Uu911W1SsJv2o= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -665,8 +666,8 @@ golang.org/x/sync v0.0.0-20220601150217-0de741cfad7f/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= -golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -746,7 +747,7 @@ golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= -golang.org/x/term v0.15.0 h1:y/Oo/a/q3IXu26lQgl04j/gjuBDOBlx7X6Om1j2CPW4= +golang.org/x/term v0.16.0 h1:m+B6fahuftsE9qjo0VWp2FW0mB3MTJvR0BaMQrq0pmE= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod 
h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -879,8 +880,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.155.0 h1:vBmGhCYs0djJttDNynWo44zosHlPvHmA0XiN2zP2DtA= -google.golang.org/api v0.155.0/go.mod h1:GI5qK5f40kCpHfPn6+YzGAByIKWv8ujFnmoWm7Igduk= +google.golang.org/api v0.157.0 h1:ORAeqmbrrozeyw5NjnMxh7peHO0UzV4wWYSwZeCUb20= +google.golang.org/api v0.157.0/go.mod h1:+z4v4ufbZ1WEpld6yMGHyggs+PmAHiaLNj5ytP3N01g= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -991,12 +992,12 @@ google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqw google.golang.org/genproto v0.0.0-20221014173430-6e2ab493f96b/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221014213838-99cd37c6964a/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71/go.mod h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s= -google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3 h1:1hfbdAfFbkmpg41000wDVqr7jUpK/Yo+LPnIxxGzmkg= -google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3/go.mod h1:5RBcpGRxr25RbDzY5w+dmaqpSEvl8Gwl1x2CICf60ic= -google.golang.org/genproto/googleapis/api v0.0.0-20231211222908-989df2bf70f3 h1:EWIeHfGuUf00zrVZGEgYFxok7plSAXBGcH7NNdMAWvA= -google.golang.org/genproto/googleapis/api v0.0.0-20231211222908-989df2bf70f3/go.mod h1:k2dtGpRrbsSyKcNPKKI5sstZkrNCZwpU/ns96JoHbGg= -google.golang.org/genproto/googleapis/rpc v0.0.0-20231212172506-995d672761c0 h1:/jFB8jK5R3Sq3i/lmeZO0cATSzFfZaJq1J2Euan3XKU= -google.golang.org/genproto/googleapis/rpc v0.0.0-20231212172506-995d672761c0/go.mod h1:FUoWkonphQm3RhTS+kOEhF8h0iDpm4tdXolVCeZ9KKA= +google.golang.org/genproto v0.0.0-20240102182953-50ed04b92917 h1:nz5NESFLZbJGPFxDT/HCn+V1mZ8JGNoY4nUpmW/Y2eg= +google.golang.org/genproto v0.0.0-20240102182953-50ed04b92917/go.mod h1:pZqR+glSb11aJ+JQcczCvgf47+duRuzNSKqE8YAQnV0= +google.golang.org/genproto/googleapis/api v0.0.0-20240102182953-50ed04b92917 h1:rcS6EyEaoCO52hQDupoSfrxI3R6C2Tq741is7X8OvnM= +google.golang.org/genproto/googleapis/api v0.0.0-20240102182953-50ed04b92917/go.mod h1:CmlNWB9lSezaYELKS5Ym1r44VrrbPUa7JTvw+6MbpJ0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240116215550-a9fa1716bcac h1:nUQEQmH/csSvFECKYRv6HWEyypysidKl2I6Qpsglq/0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240116215550-a9fa1716bcac/go.mod h1:daQN87bsDqDoe316QbbvX60nMoJQa4r6Ds0ZuoAe5yA= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -1050,8 +1051,8 @@ google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQ google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf 
v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= -google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= -google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.32.0 h1:pPC6BG5ex8PDFnkbrGU3EixyhKcQ2aDuBS36lqK/C7I= +google.golang.org/protobuf v1.32.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= From 021d0e2b808c5bffcb5d9899e8f04bb98fe9ed66 Mon Sep 17 00:00:00 2001 From: Mark Olson <115657904+mark-olson@users.noreply.github.com> Date: Mon, 22 Jan 2024 09:43:03 -0800 Subject: [PATCH 072/151] Update README.md with fixes from review. Signed-off-by: Mark Olson <115657904+mark-olson@users.noreply.github.com> --- community/examples/intel/README.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/community/examples/intel/README.md b/community/examples/intel/README.md index 8b0f4072d8..961000ac83 100644 --- a/community/examples/intel/README.md +++ b/community/examples/intel/README.md @@ -204,7 +204,7 @@ ls -lh "${HOME}/daos/cont1" A work-around for this issue to disable caching when mounting the container. -``` +```bash dfuse --singlethread --disable-caching --pool=pool1 --container=cont1 --mountpoint="${HOME}/daos/cont1" ``` @@ -234,7 +234,7 @@ See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#d Delete the remaining infrastructure -```shell +```bash ghpc destroy pfs-daos --auto-approve ``` @@ -436,16 +436,15 @@ See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#d ### Delete the DAOS/Slurm Cluster infrastructure when not in use -> **_NOTE:_** All data on the DAOS file system will be permanently lost after cluster deletion. - -> **_NOTE:_** If the Slurm controller is shut down before the auto-scale instances -> are destroyed those instances will be left running. +> **Note:** +> - Data on the DAOS file system will be permanently lost after cluster deletion. +> - If the Slurm controller is shut down before the auto-scale instances are destroyed, those compute instances will be left running. Open your browser to the VM instances page and ensure that instances named "compute" have been shutdown and deleted by the Slurm autoscaler. Delete the remaining infrastructure: -```shell +```bash ghpc destroy hpc-slurm-daos --auto-approve ``` From 7b7571992db59151fdb04b214391219ec56da829 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Sun, 21 Jan 2024 22:41:53 -0800 Subject: [PATCH 073/151] Add example of building Slurm on top of Rocky 8 Tom provided an example blueprint that demonstrated this methodology. 
Co-authored-by: Tom Downes --- community/examples/hpc-build-slurm-image.yaml | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 community/examples/hpc-build-slurm-image.yaml diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml new file mode 100644 index 0000000000..d978fdf341 --- /dev/null +++ b/community/examples/hpc-build-slurm-image.yaml @@ -0,0 +1,116 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +blueprint_name: hpc-build-slurm-image + +vars: + project_id: ns-playground-2023-01-19 ## Set GCP Project ID Here ## + deployment_name: build-slurm-1 + region: us-central1 + zone: us-central1-a + + image_build_machine_type: n2d-standard-32 + build_from_image_family: hpc-rocky-linux-8 + build_from_image_project: cloud-hpc-image-public + built_image_family: my-custom-slurm + built_instance_image: + family: $(vars.built_image_family) + project: $(vars.project_id) + instance_image_custom: true + +deployment_groups: +- group: setup + modules: + - id: network + source: modules/network/vpc + + - id: slurm-build-script + source: modules/scripts/startup-script + settings: + runners: + - type: shell + destination: prep-for-slurm-build.sh + content: | + #!/bin/bash + set -e -o pipefail + # Slurm build on Rocky8 will upgrade to python38 as part of build + # This is not compatible with ansible-local runner + dnf install -y python38 + alternatives --set python3 /usr/bin/python3.8 + python3 -m pip install pip --upgrade + python3 -m pip install ansible + python3 -m pip install selinux + export PATH=/usr/local/bin:$PATH + ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents + - type: data + destination: /var/tmp/slurm_vars.json + content: | + { + "reboot": false, + "slurm_version": "23.02.5", + "install_cuda": false, + "nvidia_version": "latest", + "install_ompi": true, + "install_lustre": false, + "install_gcsfuse": true + } + - type: shell + destination: install_slurm.sh + content: | + #!/bin/bash + set -e -o pipefail + ansible-pull \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C master \ + -i localhost, --limit localhost --connection=local \ + -e @/var/tmp/slurm_vars.json \ + ansible/playbook.yml + +- group: build-slurm + modules: + - id: slurm-custom-image + source: modules/packer/custom-image + kind: packer + settings: + machine_type: $(vars.image_build_machine_type) + source_image_family: $(vars.build_from_image_family) + source_image_project_id: [$(vars.build_from_image_project)] + image_family: $(vars.built_image_family) + use: + - network + - slurm-build-script + +- group: demo-cluster + modules: + + - id: debug_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + machine_type: n2d-standard-2 + instance_image: $(vars.built_instance_image) + + - id: debug_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [debug_nodeset] + settings: + partition_name: debug + + - id: 
slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + use: + - network + - debug_partition + settings: + machine_type: n2d-standard-4 + instance_image: $(vars.built_instance_image) From c283d736095fd83c464452650ab549fbe245f656 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 17 Jan 2024 23:26:24 +0000 Subject: [PATCH 074/151] Update hpc slurm gromac example and references to use Slurm V6 --- community/examples/hpc-slurm-gromacs.yaml | 73 +++++++++++-------- .../test-validation/test-spack.yml | 4 +- .../daily-tests/tests/spack-gromacs.yml | 11 ++- 3 files changed, 53 insertions(+), 35 deletions(-) diff --git a/community/examples/hpc-slurm-gromacs.yaml b/community/examples/hpc-slurm-gromacs.yaml index 9a45afc97d..c4f8780250 100644 --- a/community/examples/hpc-slurm-gromacs.yaml +++ b/community/examples/hpc-slurm-gromacs.yaml @@ -29,7 +29,7 @@ deployment_groups: - group: primary modules: - id: network1 - source: modules/network/pre-existing-vpc + source: modules/network/vpc ## Filesystems - id: appsfs @@ -49,6 +49,7 @@ deployment_groups: source: community/modules/scripts/spack-setup settings: install_dir: /sw/spack + spack_ref: v0.20.0 - id: spack-execute source: community/modules/scripts/spack-execute @@ -68,11 +69,11 @@ deployment_groups: projections: all: '{name}/{version}-{compiler.name}-{compiler.version}' commands: | - # Un-comment and update mirror_url to install from spack cache - # if ! spack mirror list | grep -q gcs_cache; then - # spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket - # fi - # spack buildcache keys --install --trust + ## Un-comment and update mirror_url to install from spack cache + ## if ! spack mirror list | grep -q gcs_cache; then + ## spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket + ## fi + ## spack buildcache keys --install --trust spack config --scope defaults add config:build_stage:/sw/spack/spack-stage spack config --scope defaults add -f /tmp/projections-config.yaml @@ -82,38 +83,52 @@ deployment_groups: spack compiler find --scope site spack install intel-mpi@2018.4.274%gcc@10.3.0 - spack install gromacs@2023.1 %gcc@10.3.0 ^intel-mpi@2018.4.274 ^cmake@3.26.3 %gcc@4.8.5 + spack install gromacs@2023.1 %gcc@10.3.0 ^intel-mpi@2018.4.274 ^cmake@3.26.3 %gcc@8.5.0 - - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs - - appsfs + - id: script + source: modules/scripts/startup-script settings: - partition_name: compute - max_node_count: 20 + runners: + # remove lustre client temporary to avoid startup failure due to known + # issue. 
+ - type: shell + destination: remove_lustre_client.sh + content: | + #!/bin/bash + rm /etc/yum.repos.d/lustre-client.repo + - $(spack-execute.spack_runner) + + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] + settings: + node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled - - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - appsfs - - compute_partition + - id: compute_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [compute_nodeset, homefs, appsfs] settings: - login_node_count: 1 + partition_name: compute + is_default: true - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network1] + settings: + name_prefix: login + machine_type: n2-standard-4 + disable_login_public_ips: false + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network1 + - compute_partition + - slurm_login - homefs - appsfs - - slurm_controller - - spack-execute settings: - login_machine_type: c2-standard-4 - login_scopes: - - https://www.googleapis.com/auth/cloud-platform + disable_controller_public_ips: false + login_startup_script: $(script.startup_script) + login_startup_scripts_timeout: 21600 diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-spack.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-spack.yml index b09d1a49a1..d5e435c04b 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-spack.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-spack.yml @@ -17,8 +17,8 @@ - name: Include wait for startup script ansible.builtin.include_tasks: "tasks/wait-for-startup-script.yml" vars: - vm_name: "{{ login_node }}" - timeout_seconds: 7200 + vm_name: "{{ image_name }}" + timeout_seconds: 21600 - name: Ensure spack is installed ansible.builtin.command: spack --version changed_when: False diff --git a/tools/cloud-build/daily-tests/tests/spack-gromacs.yml b/tools/cloud-build/daily-tests/tests/spack-gromacs.yml index 412ade50d4..0fd0a596d2 100644 --- a/tools/cloud-build/daily-tests/tests/spack-gromacs.yml +++ b/tools/cloud-build/daily-tests/tests/spack-gromacs.yml @@ -15,14 +15,17 @@ --- test_name: hpc-slurm-gromacs -deployment_name: "spack-gromacs-{{ build }}" +deployment_name: "groma-{{ build }}" +slurm_cluster_name: "groma{{ build[0:5] }}" zone: us-central1-c workspace: /workspace blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm-gromacs.yaml" -network: "default" +network: "{{ deployment_name }}-net" max_nodes: 5 -login_node: slurm-{{ deployment_name }}-login0 -controller_node: slurm-{{ deployment_name }}-controller +login_node: "{{ slurm_cluster_name }}-login-*" +# Image name to be used to filter logs from /var/log/messages for startup script. 
+image_name: "slurm-gcp-dev-hpc-rocky-linux-8-*" +controller_node: "{{ slurm_cluster_name }}-controller" post_deploy_tests: - test-validation/test-spack.yml custom_vars: From 568ccaea90c8a0764b59016be9f06feb1f653aaa Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Mon, 22 Jan 2024 20:25:15 -0800 Subject: [PATCH 075/151] Clarify that zone-finding isn't available for TPUs (#2156) --- .../modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md index f5219c6831..ac5dd7a4b3 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -70,7 +70,7 @@ No resources. | [service\_account](#input\_service\_account) | Service account to attach to the TPU-vm. If none is given, the default service account and scopes will be used. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The name of the subnetwork to attach the TPU-vm of this nodeset to. | `string` | n/a | yes | | [tf\_version](#input\_tf\_version) | Nodeset Tensorflow version, see https://cloud.google.com/tpu/docs/supported-tpu-configurations#tpu_vm for details. | `string` | `"2.9.1"` | no | -| [zone](#input\_zone) | Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones. | `string` | n/a | yes | +| [zone](#input\_zone) | Zone in which to create compute VMs. TPU partitions can only specify a single zone. | `string` | n/a | yes | ## Outputs diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf index 0295f596a2..a23e966c56 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf @@ -85,7 +85,7 @@ variable "preserve_tpu" { } variable "zone" { - description = "Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones." + description = "Zone in which to create compute VMs. TPU partitions can only specify a single zone." type = string } From f9ab5c9d2840c6e11bbaa916e9f5e620ac07e485 Mon Sep 17 00:00:00 2001 From: Mark Olson <115657904+mark-olson@users.noreply.github.com> Date: Tue, 23 Jan 2024 09:35:01 -0800 Subject: [PATCH 076/151] Fixed PyMarkdown issue in community/examples/intel/README.md Signed-off-by: Mark Olson <115657904+mark-olson@users.noreply.github.com> --- community/examples/intel/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/community/examples/intel/README.md b/community/examples/intel/README.md index 961000ac83..932f9e3b52 100644 --- a/community/examples/intel/README.md +++ b/community/examples/intel/README.md @@ -436,7 +436,8 @@ See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#d ### Delete the DAOS/Slurm Cluster infrastructure when not in use -> **Note:** +> **_NOTE:_** +> > - Data on the DAOS file system will be permanently lost after cluster deletion. > - If the Slurm controller is shut down before the auto-scale instances are destroyed, those compute instances will be left running. From 31db3c4158849bd94d40adb891f7b7acd9198af4 Mon Sep 17 00:00:00 2001 From: Mark Olson <115657904+mark-olson@users.noreply.github.com> Date: Tue, 23 Jan 2024 09:41:29 -0800 Subject: [PATCH 077/151] Fixed typos in community/examples/intel/README.md Signed-off-by: Mark Olson <115657904+mark-olson@users.noreply.github.com> --- community/examples/intel/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/community/examples/intel/README.md b/community/examples/intel/README.md index 932f9e3b52..0679b5fb01 100644 --- a/community/examples/intel/README.md +++ b/community/examples/intel/README.md @@ -149,7 +149,7 @@ Set ACLs to allow any user to create a container in *pool1*. sudo dmg pool update-acl -e A::EVERYONE@:rcta pool1 ``` -See the [Pool Operations](https://docs.daos.io/v2.4/admin/pool_operations) section of the of the DAOS Administration Guide for more information about creating pools. +See the [Pool Operations](https://docs.daos.io/v2.4/admin/pool_operations) section of the DAOS Administration Guide for more information about creating pools. 
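+
+As a quick sanity check, the pool's ACL can be printed back out. This is a
+minimal sketch, assuming your DAOS release provides the `dmg pool get-acl`
+subcommand:
+
+```bash
+sudo dmg pool get-acl pool1
+```
+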
#### Create a Container

@@ -162,7 +162,7 @@ For the purpose of this demo create the container without specifying ACLs. The c
 daos container create --type=POSIX --properties=rf:0 pool1 cont1
 ```
 
-See the [Container Management](https://docs.daos.io/v2.4/user/container) section of the of the DAOS User Guide for more information about creating containers.
+See the [Container Management](https://docs.daos.io/v2.4/user/container) section of the DAOS User Guide for more information about creating containers.
 
 #### Mount the DAOS Container
 
@@ -212,7 +212,7 @@ See the [File System](https://docs.daos.io/v2.4/user/filesystem/) section of the
 
 ### Unmount the DAOS Container
 
-The container will need to by unmounted before you log out. If this is not done it can leave open file handles and prevent the container from being mounted when you log in again.
+The container will need to be unmounted before you log out. If this is not done it can leave open file handles and prevent the container from being mounted when you log in again.
 
 Verify that the container is unmounted
 
From a14bee50775940bfd778efe786d9134c5cefed2c Mon Sep 17 00:00:00 2001
From: Ivan Orlov
Date: Tue, 23 Jan 2024 11:58:06 -0800
Subject: [PATCH 078/151] Bring `$(...)` functionality on par with `((...))` (#2053)

* Use token-replacement instead of string-replacement for expression updates;
* Translate any BP-expression to TF-expressions by transforming used traversals;
* Remove notion of `((...))` from documentation.
---
 .../google/wait-for-startup/README.md         |   2 +-
 examples/README.md                            |  80 ++++---------
 pkg/config/expression.go                      | 112 ++++++++++++------
 pkg/config/expression_test.go                 |  50 +++++++-
 pkg/modulewriter/modulewriter_test.go         |   3 +-
 pkg/modulewriter/tfwriter.go                  |  39 ++++--
 pkg/shell/terraform.go                        |   5 +-
 .../daily-tests/blueprints/lustre-vm.yaml     |   6 +-
 .../daily-tests/builds/batch-mpi.yaml         |   2 +-
 9 files changed, 184 insertions(+), 115 deletions(-)

diff --git a/community/front-end/ofe/infrastructure_files/workbench_tf/google/wait-for-startup/README.md b/community/front-end/ofe/infrastructure_files/workbench_tf/google/wait-for-startup/README.md
index 7d4032f1ab..c8561bc909 100644
--- a/community/front-end/ofe/infrastructure_files/workbench_tf/google/wait-for-startup/README.md
+++ b/community/front-end/ofe/infrastructure_files/workbench_tf/google/wait-for-startup/README.md
@@ -19,7 +19,7 @@ up a node.
   kind: terraform
   id: wait
   settings:
-    instance_name: ((module.workstation.name[0]))
+    instance_name: $(workstation.name[0])
 ```
 
 ## License
diff --git a/examples/README.md b/examples/README.md
index 4d7d9ad79a..50d928393f 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -48,10 +48,9 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /"
 * [Top Level Parameters](#top-level-parameters)
   * [Deployment Variables](#deployment-variables)
   * [Deployment Groups](#deployment-groups)
-* [Variables](#variables)
-  * [Blueprint Variables](#blueprint-variables)
-  * [Literal Variables](#literal-variables)
-  * [Escape Variables](#escape-variables)
+* [Variables and expressions](#variables-and-expressions)
+  * [Blueprint expressions](#blueprint-expressions)
+  * [Escape expressions](#escape-expressions)
 
 ## Instructions
 
@@ -1193,19 +1192,20 @@ default in the [modules](../modules/README.md) folder.
To learn more about how to refer to a module in a blueprint file, please consult the [modules README file.](../modules/README.md) -## Variables +## Variables and expressions Variables can be used to refer both to values defined elsewhere in the blueprint and to the output and structure of other modules. -### Blueprint Variables +### Blueprint expressions -Variables in a blueprint file can refer to deployment variables or the outputs -of other modules. For deployment and module variables, the syntax is as follows: +Expressions in a blueprint file can refer to deployment variables or the outputs +of other modules. The entire expression is wrapped in `$()`. The syntax is as follows: ```yaml vars: zone: us-central1-a + num_nodes: 2 deployment_groups: - group: primary @@ -1219,53 +1219,23 @@ deployment_groups: settings: key1: $(vars.zone) key2: $(resource1.name) + # access nested fields + key3: $(resource1.nodes[0].private_ip) + # arithmetic expression + key4: $(vars.num_nodes + 5) + # string interpolation + key5: $(resource1.name)_$(vars.zone) + # multiline string interpolation + key6: | + #!/bin/bash + echo "Hello $(vars.project_id) from $(vars.region)" + # use a function, supported by Terraform + key7: $(jsonencode(resource1.config)) ``` -The variable is referred to by the source, either vars for deploment variables -or the module ID for module variables, followed by the name of the value being -referenced. The entire variable is then wrapped in “$()”. - -### Literal Variables - -Literal variables should only be used by those familiar -with the underlying module technology (Terraform or Packer); -Literal variables are occasionally needed when calling a function or other complex statements. For example, to JSON-encode network storage metadata: - -```yaml -metadata: - network_storage: ((jsonencode([module.appfs.network_storage]))) -``` - -Here the network1 module is referenced, the terraform module name is the same as -the ID in the blueprint file. From the module we can refer to it's underlying -variables as deep as we need, in this case the self_link for it's -primary_subnetwork. - -The entire text of the variable is wrapped in double parentheses indicating that -everything inside will be provided as is to the module. - -Whenever possible, blueprint variables are preferred over literal variables. -`ghpc` will perform basic validation making sure all blueprint variables are -defined before creating a deployment, making debugging quicker and easier. - -### String Interpolation - -The `$(...)` expressions can be used within strings, see: - -```yaml -settings: - title: Magnificent $(vars.name) - script: | - #!/bin/bash - echo "Hello $(vars.project_id) from $(vars.region)" -``` - -### Escape Variables - -Under circumstances where the variable notation conflicts with the content of a setting or string, for instance when defining a startup-script runner that uses a subshell like in the example below, a non-quoted backslash (`\`) can be used as an escape character. It preserves the literal value of the next character that follows: +### Escape expressions -* `\$(not.bp_var)` evaluates to `$(not.bp_var)`. -* `\((not.literal_var))` evaluates to `((not.literal_var))`. +Under circumstances where the expression notation conflicts with the content of a setting or string, for instance when defining a startup-script runner that uses a subshell like in the example below, a non-quoted backslash (`\`) can be used as an escape character. 
It preserves the literal value of the next character that follows: `\$(not.bp_var)` evaluates to `$(not.bp_var)`. ```yaml deployment_groups: @@ -1273,12 +1243,6 @@ deployment_groups: modules: - id: resource1 source: path/to/module/1 - settings: - key1: \((not.literal_var)) ## Evaluates to "((not.literal_var))". - ... - - id: resource2 - source: path/to/module/2 - ... settings: key1: | #!/bin/bash diff --git a/pkg/config/expression.go b/pkg/config/expression.go index d3a5d4dc99..bf74670551 100644 --- a/pkg/config/expression.go +++ b/pkg/config/expression.go @@ -15,6 +15,7 @@ package config import ( + "bytes" "fmt" "regexp" "strings" @@ -56,52 +57,49 @@ func (r Reference) AsExpression() Expression { } // Takes traversal in "blueprint namespace" (e.g. `vars.zone` or `homefs.mount`) -// and transforms it to `Expression`. -func bpTraversalToExpression(t hcl.Traversal) (Expression, error) { +// and transforms it to "terraform namespace" (e.g. `var.zone` or `module.homefs.mount`). +func bpTraversalToTerraform(t hcl.Traversal) (hcl.Traversal, error) { if len(t) < 2 { return nil, fmt.Errorf(expectedVarFormat) } - attr, ok := t[1].(hcl.TraverseAttr) + _, ok := t[1].(hcl.TraverseAttr) if !ok { return nil, fmt.Errorf(expectedVarFormat) } - var ref Reference if t.RootName() == "vars" { - t[0] = hcl.TraverseRoot{Name: "var"} - ref = GlobalRef(attr.Name) + root := hcl.TraverseRoot{Name: "var"} + return append([]hcl.Traverser{root}, t[1:]...), nil } else { - mod := t.RootName() - t[0] = hcl.TraverseAttr{Name: mod} root := hcl.TraverseRoot{Name: "module"} - t = append(hcl.Traversal{root}, t...) - ref = ModuleRef(ModuleID(mod), attr.Name) + mod := hcl.TraverseAttr{Name: t.RootName()} + return append([]hcl.Traverser{root, mod}, t[1:]...), nil } - - return &BaseExpression{ - e: &hclsyntax.ScopeTraversalExpr{Traversal: t}, - toks: hclwrite.TokensForTraversal(t), - rs: []Reference{ref}, - }, nil } -// bpLitToExpression takes a content of `$(...)`-literal and transforms it to `Expression` -func bpLitToExpression(s string) (Expression, error) { - hexp, diag := hclsyntax.ParseExpression([]byte(s), "", hcl.Pos{}) +// BlueprintExpressionLiteralToExpression takes a content of `$(...)`-literal and transforms it to `Expression` +func BlueprintExpressionLiteralToExpression(s string) (Expression, error) { + bpExp, diag := hclsyntax.ParseExpression([]byte(s), "", hcl.Pos{}) if diag.HasErrors() { return nil, diag } - - switch texp := hexp.(type) { - case *hclsyntax.ScopeTraversalExpr: - exp, err := bpTraversalToExpression(texp.Traversal) + toks, err := parseHcl(s) + if err != nil { + return nil, err + } + for _, t := range bpExp.Variables() { + new, err := bpTraversalToTerraform(t) if err != nil { - return nil, fmt.Errorf("failed to parse variable %q: %w", s, err) + return nil, err } - return exp, nil - default: - return nil, fmt.Errorf("only traversal expressions are supported, got %q", s) + + toks = replaceTokens( + toks, + hclwrite.TokensForTraversal(t), + hclwrite.TokensForTraversal(new)) } + + return ParseExpression(string(toks.Bytes())) } // TraversalToReference takes HCL traversal and returns `Reference` @@ -157,6 +155,18 @@ type Expression interface { key() expressionKey } +func parseHcl(s string) (hclwrite.Tokens, error) { + sToks, diag := hclsyntax.LexExpression([]byte(s), "", hcl.Pos{}) + if diag.HasErrors() { + return nil, diag + } + wToks := make(hclwrite.Tokens, len(sToks)) + for i, st := range sToks { + wToks[i] = &hclwrite.Token{Type: st.Type, Bytes: st.Bytes} + } + return wToks, nil +} + // ParseExpression 
returns Expression
 // Expects expression in "terraform namespace" (e.g. `var.zone` or `module.homefs.mount`)
 func ParseExpression(s string) (Expression, error) {
@@ -164,10 +174,9 @@ func ParseExpression(s string) (Expression, error) {
 	if diag.HasErrors() {
 		return nil, diag
 	}
-	sToks, _ := hclsyntax.LexExpression([]byte(s), "", hcl.Pos{})
-	wToks := make(hclwrite.Tokens, len(sToks))
-	for i, st := range sToks {
-		wToks[i] = &hclwrite.Token{Type: st.Type, Bytes: st.Bytes}
+	toks, err := parseHcl(s)
+	if err != nil {
+		return nil, err
 	}
 
 	ts := e.Variables()
@@ -178,7 +187,7 @@ func ParseExpression(s string) (Expression, error) {
 			return nil, err
 		}
 	}
-	return BaseExpression{e: e, toks: wToks, rs: rs}, nil
+	return BaseExpression{e: e, toks: toks, rs: rs}, nil
 }
 
 // MustParseExpression is "errorless" version of ParseExpression
@@ -324,7 +333,6 @@ func TokensForValue(val cty.Value) hclwrite.Tokens {
 			tl = append(tl, hclwrite.ObjectAttrTokens{Name: kt, Value: vt})
 		}
 		return hclwrite.TokensForObject(tl)
-
 	}
 	return hclwrite.TokensForValue(val) // rely on hclwrite implementation
 }
@@ -463,7 +471,7 @@ func greedyParseHcl(s string) (Expression, string, error) {
 	}
 	_, diag := hclsyntax.ParseExpression([]byte(s[:i]), "", hcl.Pos{})
 	if !diag.HasErrors() { // found an expression
-		exp, err := bpLitToExpression(s[:i])
+		exp, err := BlueprintExpressionLiteralToExpression(s[:i])
 		return exp, s[i+1:], err
 	}
 	err = diag // save error, try to find another closing bracket
@@ -498,3 +506,39 @@ func buildStringInterpolation(pts []pToken) (Expression, error) {
 		Bytes: []byte(`"`)})
 	return ParseExpression(string(toks.Bytes()))
 }
+
+func trimEOF(ts hclwrite.Tokens) hclwrite.Tokens {
+	if len(ts) > 0 && ts[len(ts)-1].Type == hclsyntax.TokenEOF {
+		return ts[:len(ts)-1]
+	}
+	return ts
+}
+
+func replaceTokens(body, old, new hclwrite.Tokens) hclwrite.Tokens {
+	old, new = trimEOF(old), trimEOF(new)
+	if len(old) == 0 {
+		return body
+	}
+
+	r := hclwrite.Tokens{}
+
+	p := hclwrite.Tokens{} // matching prefix of `old`
+	for _, t := range body {
+		c := old[len(p)]
+		p = append(p, t)
+		if t.Type != c.Type || !bytes.Equal(t.Bytes, c.Bytes) { // t != c
+			r = append(r, p...) // stop comparison and flush the prefix
+			p = hclwrite.Tokens{}
+		}
+		if len(p) == len(old) { // gathered enough tokens
+			p = hclwrite.Tokens{}
+			r = append(r, new...)
+		}
+	}
+	return append(r, p...)
+} + +func ReplaceSubExpressions(body, old, new Expression) (Expression, error) { + r := replaceTokens(body.Tokenize(), old.Tokenize(), new.Tokenize()) + return ParseExpression(string(r.Bytes())) +} diff --git a/pkg/config/expression_test.go b/pkg/config/expression_test.go index e3cd7dc1d8..d8af75c209 100644 --- a/pkg/config/expression_test.go +++ b/pkg/config/expression_test.go @@ -15,6 +15,7 @@ package config import ( + "fmt" "testing" "github.com/google/go-cmp/cmp" @@ -101,13 +102,12 @@ $(vars.here)`, `"4gold\n${var.here}"`, false}, // quoted strings may not be spli {`5gold was here`, `"5gold\nwas here"`, false}, - {"6gold $(vars.here", ``, true}, // missing close parenthesis - {"7gold $(vars.here + 2)", ``, true}, // unsupported expression + {"6gold $(vars.here", ``, true}, // missing close parenthesis {`#!/bin/bash echo "Hello $(vars.project_id) from $(vars.region)"`, `"#!/bin/bash\necho \"Hello ${var.project_id} from ${var.region}\""`, false}, {"", `""`, false}, - {`$(try(vars.this) + one(vars.time))`, "", true}, // fails because of unsupported expression, but it should be parsed + {`$(try(vars.this) + one(vars.time))`, "try(var.this)+one(var.time)", false}, // Escaping {`q $(vars.t)`, `"q ${var.t}"`, false}, // no escaping @@ -117,6 +117,13 @@ echo "Hello $(vars.project_id) from $(vars.region)"`, `"#!/bin/bash\necho \"Hell {`q \\\\$(vars.t)`, `"q \\\\${var.t}"`, false}, // escaped `\\` {`q \\\\\$(vars.t)`, `"q \\\\$(vars.t)"`, false}, // escaped both `\\` and `$(` + // Translation of complex expressions BP -> Terraform + {"$(vars.green + amber.blue)", "var.green+module.amber.blue", false}, + {"$(5 + vars.blue)", "5+var.blue", false}, + {"$(5)", "5", false}, + {`$("${vars.green}_${vars.sleeve}")`, `"${var.green}_${var.sleeve}"`, false}, + {"$(fun(vars.green))", "fun(var.green)", false}, + // Untranslatable expressions {"$(vars)", "", true}, {"$(sleeve)", "", true}, @@ -124,6 +131,7 @@ echo "Hello $(vars.project_id) from $(vars.region)"`, `"#!/bin/bash\necho \"Hell {`$(box["green"])`, "", true}, // can't index module {"$(vars[3]])", "", true}, // can't index vars {`$(vars["green"])`, "", true}, // can't index module + } for _, tc := range tests { t.Run(tc.input, func(t *testing.T) { @@ -223,3 +231,39 @@ func TestMergeFunctionCallExpression(t *testing.T) { t.Errorf("diff (-want +got):\n%s", diff) } } + +func TestReplaceTokens(t *testing.T) { + type test struct { + body string + old string + new string + want string + } + tests := []test{ + {"var.green", "var.green", "var.blue", "var.blue"}, + {"var.green + var.green", "var.green", "var.blue", "var.blue+var.blue"}, + {"vars.green + 5", "vars.green", "var.green", "var.green+5"}, + {"var.green + var.blue", "vars.gold", "var.silver", "var.green+var.blue"}, + } + for _, tc := range tests { + t.Run(fmt.Sprintf("s/%s/%s/%s", tc.old, tc.new, tc.body), func(t *testing.T) { + b, err := parseHcl(tc.body) + if err != nil { + t.Fatal(err) + } + o, err := parseHcl(tc.old) + if err != nil { + t.Fatal(err) + } + n, err := parseHcl(tc.new) + if err != nil { + t.Fatal(err) + } + + got := replaceTokens(b, o, n) + if diff := cmp.Diff(tc.want, string(got.Bytes())); diff != "" { + t.Errorf("diff (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/modulewriter/modulewriter_test.go b/pkg/modulewriter/modulewriter_test.go index cd38aa07f5..2e716b0394 100644 --- a/pkg/modulewriter/modulewriter_test.go +++ b/pkg/modulewriter/modulewriter_test.go @@ -593,12 +593,13 @@ func (s *zeroSuite) TestSubstituteIgcReferencesInModule(c *C) { 
config.MustParseExpression(`module.golf.red + 6 + module.golf.green`).AsValue(), config.MustParseExpression(`module.tennis.brown`).AsValue(), })) - m := SubstituteIgcReferencesInModule( + m, err := SubstituteIgcReferencesInModule( config.Module{Settings: d}, map[config.Reference]modulereader.VarInfo{ config.ModuleRef("golf", "red"): {Name: "pink"}, config.ModuleRef("golf", "green"): {Name: "lime"}, }) + c.Assert(err, IsNil) c.Check(m.Settings.Items(), DeepEquals, map[string]cty.Value{"fold": cty.TupleVal([]cty.Value{ cty.StringVal("zebra"), config.MustParseExpression(`var.pink + 6 + var.lime`).AsValue(), diff --git a/pkg/modulewriter/tfwriter.go b/pkg/modulewriter/tfwriter.go index 563b15d015..60179f7223 100644 --- a/pkg/modulewriter/tfwriter.go +++ b/pkg/modulewriter/tfwriter.go @@ -308,7 +308,10 @@ func (w TFWriter) writeDeploymentGroup( } // Write main.tf file - doctoredModules := substituteIgcReferences(g.Modules, intergroupVars) + doctoredModules, err := substituteIgcReferences(g.Modules, intergroupVars) + if err != nil { + return fmt.Errorf("error substituting intergroup references in deployment group %s: %w", g.Name, err) + } if err := writeMain(doctoredModules, g.TerraformBackend, groupPath); err != nil { return fmt.Errorf("error writing main.tf file for deployment group %s: %w", g.Name, err) } @@ -403,36 +406,46 @@ func getUsedDeploymentVars(group config.DeploymentGroup, bp config.Blueprint) ma return filteredVars } -func substituteIgcReferences(mods []config.Module, igcRefs map[config.Reference]modulereader.VarInfo) []config.Module { +func substituteIgcReferences(mods []config.Module, igcRefs map[config.Reference]modulereader.VarInfo) ([]config.Module, error) { doctoredMods := make([]config.Module, len(mods)) for i, mod := range mods { - doctoredMods[i] = SubstituteIgcReferencesInModule(mod, igcRefs) + dm, err := SubstituteIgcReferencesInModule(mod, igcRefs) + if err != nil { + return nil, err + } + doctoredMods[i] = dm } - return doctoredMods + return doctoredMods, nil } // SubstituteIgcReferencesInModule updates expressions in Module settings to use // special IGC var name instead of the module reference -func SubstituteIgcReferencesInModule(mod config.Module, igcRefs map[config.Reference]modulereader.VarInfo) config.Module { - v, _ := cty.Transform(mod.Settings.AsObject(), func(p cty.Path, v cty.Value) (cty.Value, error) { +func SubstituteIgcReferencesInModule(mod config.Module, igcRefs map[config.Reference]modulereader.VarInfo) (config.Module, error) { + v, err := cty.Transform(mod.Settings.AsObject(), func(p cty.Path, v cty.Value) (cty.Value, error) { e, is := config.IsExpressionValue(v) if !is { return v, nil } - ue := string(e.Tokenize().Bytes()) - for _, r := range e.References() { + refs := e.References() + for _, r := range refs { oi, exists := igcRefs[r] if !exists { continue } - s := fmt.Sprintf("module.%s.%s", r.Module, r.Name) - rs := fmt.Sprintf("var.%s", oi.Name) - ue = strings.ReplaceAll(ue, s, rs) + old := r.AsExpression() + new := config.GlobalRef(oi.Name).AsExpression() + var err error + if e, err = config.ReplaceSubExpressions(e, old, new); err != nil { + return cty.NilVal, err + } } - return config.MustParseExpression(ue).AsValue(), nil + return e.AsValue(), nil }) + if err != nil { + return config.Module{}, err + } mod.Settings = config.NewDict(v.AsValueMap()) - return mod + return mod, nil } // FindIntergroupVariables returns all unique intergroup references made by diff --git a/pkg/shell/terraform.go b/pkg/shell/terraform.go index 2d51a078b7..a06a4b0a93 
100644 --- a/pkg/shell/terraform.go +++ b/pkg/shell/terraform.go @@ -413,7 +413,10 @@ func ImportInputs(deploymentGroupDir string, artifactsDir string, expandedBluepr } igcVars := modulewriter.FindIntergroupVariables(g, bp) - newModule := modulewriter.SubstituteIgcReferencesInModule(config.Module{Settings: intergroupSettings}, igcVars) + newModule, err := modulewriter.SubstituteIgcReferencesInModule(config.Module{Settings: intergroupSettings}, igcVars) + if err != nil { + return err + } if err := mergeMapsWithoutLoss(inputs, bp.Vars.Items()); err != nil { return err diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml index b7880219bf..3d227e9c69 100644 --- a/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml +++ b/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml @@ -110,7 +110,7 @@ deployment_groups: - id: wait-centos source: community/modules/scripts/wait-for-startup settings: - instance_name: ((module.workstation-centos.name[0])) + instance_name: $(workstation-centos.name[0]) timeout: 7200 - id: workstation-rocky @@ -128,7 +128,7 @@ deployment_groups: - id: wait-rocky source: community/modules/scripts/wait-for-startup settings: - instance_name: ((module.workstation-rocky.name[0])) + instance_name: $(workstation-rocky.name[0]) timeout: 7200 # - id: workstation-ubuntu @@ -145,5 +145,5 @@ deployment_groups: # - id: wait-ubuntu # source: community/modules/scripts/wait-for-startup # settings: - # instance_name: ((module.workstation-ubuntu.name[0])) + # instance_name: $(workstation-ubuntu.name[0]) # timeout: 7200 diff --git a/tools/cloud-build/daily-tests/builds/batch-mpi.yaml b/tools/cloud-build/daily-tests/builds/batch-mpi.yaml index e6a6b589e9..618393c2c9 100644 --- a/tools/cloud-build/daily-tests/builds/batch-mpi.yaml +++ b/tools/cloud-build/daily-tests/builds/batch-mpi.yaml @@ -62,7 +62,7 @@ steps: echo ' - id: wait' >> $${SG_EXAMPLE} echo ' source: community/modules/scripts/wait-for-startup' >> $${SG_EXAMPLE} echo ' settings:' >> $${SG_EXAMPLE} - echo ' instance_name: ((module.spack-builder.name[0]))' >> $${SG_EXAMPLE} + echo ' instance_name: $(spack-builder.name[0])' >> $${SG_EXAMPLE} echo ' timeout: 2400' >> $${SG_EXAMPLE} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ From 858f9f3d9a6544ae103a5c830e3ec0e1d3b988ae Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Tue, 23 Jan 2024 12:13:15 -0800 Subject: [PATCH 079/151] Address feedback from #2150 --- modules/scripts/startup-script/README.md | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index 7d9e027943..e88bd6301f 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -165,12 +165,25 @@ You can test if one of the agents is running using the following commands: ```bash # For Cloud Ops Agent -sudo systemctl status google-cloud-ops-agent - -# For Stackdriver Agent -sudo service stackdriver-agent status +$ sudo systemctl is-active google-cloud-ops-agent"*" +active +active +active +active + +# For Legacy Monitoring and Logging Agents +$ sudo service stackdriver-agent status +stackdriver-agent is running [ OK ] +$ sudo service google-fluentd status +google-fluentd is running [ OK ] ``` +For official documentation see troubleshooting docs: + +- [Cloud Ops 
Agent](https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/troubleshoot-install-startup)
+- [Legacy Monitoring Agent](https://cloud.google.com/stackdriver/docs/solutions/agents/monitoring/troubleshooting)
+- [Legacy Logging Agent](https://cloud.google.com/stackdriver/docs/solutions/agents/logging/troubleshooting)
+
 ### Example
 
 ```yaml

From 582bcc35e5c4763aa2839a5098162b9bc9e8ff55 Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Tue, 23 Jan 2024 12:14:23 -0800
Subject: [PATCH 080/151] Start legacy monitoring agent after installing

---
 .../scripts/startup-script/files/install_monitoring_agent.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/scripts/startup-script/files/install_monitoring_agent.sh b/modules/scripts/startup-script/files/install_monitoring_agent.sh
index 569ed478c6..eb4bf899b8 100644
--- a/modules/scripts/startup-script/files/install_monitoring_agent.sh
+++ b/modules/scripts/startup-script/files/install_monitoring_agent.sh
@@ -71,6 +71,7 @@ handle_debian() {
 		install_with_retry "${LEGACY_MONITORING_SCRIPT_URL}"
 		install_with_retry "${LEGACY_LOGGING_SCRIPT_URL}"
 		service stackdriver-agent start
+		service google-fluentd start
 	}
 }
 
@@ -102,6 +103,7 @@ handle_redhat() {
 		curl -sS "${LEGACY_MONITORING_SCRIPT_URL}" | bash -s -- --also-install
 		curl -sS "${LEGACY_LOGGING_SCRIPT_URL}" | bash -s -- --also-install
 		service stackdriver-agent start
+		service google-fluentd start
 	}
 }
 
From de4bd119c9fe0a6ab8b3a12b0c00de3808d6e32b Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Tue, 23 Jan 2024 13:26:51 -0800
Subject: [PATCH 081/151] Address feedback: be explicit about Ansible install

---
 community/examples/hpc-build-slurm-image.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml
index d978fdf341..7fb3e1b25d 100644
--- a/community/examples/hpc-build-slurm-image.yaml
+++ b/community/examples/hpc-build-slurm-image.yaml
@@ -38,6 +38,8 @@ deployment_groups:
   - id: slurm-build-script
     source: modules/scripts/startup-script
     settings:
+      # Do not create an Ansible virtual env; install system-wide Ansible below.
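+      # (The shell runner below installs its own pinned, system-wide Ansible
+      # and then applies the Slurm playbook via ansible-pull.)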
+      install_ansible: false
       runners:
       - type: shell
         destination: prep-for-slurm-build.sh
@@ -49,9 +51,10 @@
           dnf install -y python38
           alternatives --set python3 /usr/bin/python3.8
           python3 -m pip install pip --upgrade
-          python3 -m pip install ansible
+          python3 -m pip install ansible==4.10.0
           python3 -m pip install selinux
           export PATH=/usr/local/bin:$PATH
+          ansible --version
           ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents
       - type: data
         destination: /var/tmp/slurm_vars.json

From 236366f872f4b8e666a49f5803b00438545efe5c Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Tue, 23 Jan 2024 17:11:57 -0800
Subject: [PATCH 082/151] Add documentation for Slurm building example

---
 examples/README.md | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/examples/README.md b/examples/README.md
index 4d7d9ad79a..57f11505fd 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -20,6 +20,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /"
   * [serverless-batch-mpi.yaml](#serverless-batch-mpiyaml-) ![core-badge]
   * [pfs-lustre.yaml](#pfs-lustreyaml-) ![core-badge]
   * [cae-slurm.yaml](#cae-slurmyaml-) ![core-badge]
+  * [hpc-build-slurm-image.yaml](#hpc-build-slurm-imageyaml-) ![community-badge]
   * [hpc-slurm-ubuntu2004.yaml](#hpc-slurm-ubuntu2004yaml-) ![community-badge]
   * [pfs-daos.yaml](#pfs-daosyaml-) ![community-badge]
   * [hpc-slurm-daos.yaml](#hpc-slurm-daosyaml-) ![community-badge]
@@ -579,6 +580,25 @@ For this example the following is needed in the selected region:
 
 [cae-slurm.yaml]: ../examples/cae/cae-slurm.yaml
 
+### [hpc-build-slurm-image.yaml] ![community-badge]
+
+This blueprint demonstrates how to use the HPC Toolkit to build a Slurm image on
+top of an existing image, `hpc-rocky-linux-8` in the case of this example.
+
+The blueprint contains 3 groups:
+
+1. The first group creates a network and generates the scripts that will install
+   Slurm. This uses the Ansible Playbook contained in the
+   [Slurm on GCP](https://github.com/GoogleCloudPlatform/slurm-gcp) repo.
+2. The second group executes the build using Packer to run the scripts from the
+   first group. This can take ~30 min and will generate a custom Slurm image in
+   your project.
+3. The third group deploys a demo cluster that uses the newly built image. For a
+   real-world use case the demo cluster can be swapped out for a more powerful
+   Slurm cluster from other examples (see the sketch below).
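+
+As a sketch of the end-to-end flow (assuming `ghpc` has been built at the repo
+root and a valid `project_id` is supplied), all three groups can be run with a
+single command:
+
+```bash
+./ghpc deploy community/examples/hpc-build-slurm-image.yaml \
+  --vars project_id=<project_id>
+```
+
+`ghpc deploy` applies the groups in order, so the custom image is built before
+the demo cluster that consumes it is created.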
+ +[hpc-build-slurm-image.yaml]: ../community/examples/hpc-build-slurm-image.yaml + ### [hpc-slurm-ubuntu2004.yaml] ![community-badge] > **Warning**: The variables `enable_reconfigure`, From 790b45cc7e91b4c80d5838cfa331c7ea3136a963 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Tue, 23 Jan 2024 17:44:37 -0800 Subject: [PATCH 083/151] Adding test for building slurm image --- .../builds/hpc-build-slurm-image.yaml | 53 +++++++++++++++++++ .../tests/hpc-build-slurm-image.yml | 25 +++++++++ 2 files changed, 78 insertions(+) create mode 100644 tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml create mode 100644 tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml diff --git a/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml b/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml new file mode 100644 index 0000000000..d71788e2a6 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml @@ -0,0 +1,53 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +timeout: 5400s # 1.5h +steps: +## Test simple golang build +- id: build_ghpc + waitFor: ["-"] + name: "golang:bullseye" + entrypoint: /bin/bash + args: + - -c + - | + cd /workspace + make +- id: fetch_builder + waitFor: ["-"] + name: >- + us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + args: + - -c + - echo "done fetching builder" + +- id: hpc-build-slurm-image + waitFor: ["fetch_builder", "build_ghpc"] + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml" diff --git a/tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml b/tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml new file mode 100644 index 0000000000..f6f2b93ef0 --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml @@ -0,0 +1,25 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+---
+
+test_name: hpc-build-slurm-image
+deployment_name: build-slurm-{{ build }}
+zone: us-central1-c
+workspace: /workspace
+blueprint_yaml: "{{ workspace }}/community/examples/hpc-build-slurm-image.yaml"
+network: "{{ deployment_name }}-net"
+cli_deployment_vars:
+  network_name: "{{ network }}"
+  subnetwork_name: "{{ network }}-sub"

From 790b45cc7e91b4c80d5838cfa331c7ea3136a963 Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Tue, 23 Jan 2024 19:36:30 -0800
Subject: [PATCH 084/151] Create variable to pass the Packer group name

---
 .../ansible_playbooks/multigroup-integration-test.yml         | 2 +-
 tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml | 2 ++
 tools/cloud-build/daily-tests/tests/packer.yml                | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml
index af3383d517..3245b2a0f5 100644
--- a/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml
+++ b/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml
@@ -49,7 +49,7 @@
       set -e -o pipefail
       gcloud compute images delete --project={{ project }} --quiet $(jq -r '.builds[-1].artifact_id' packer-manifest.json | cut -d ":" -f2)
     args:
-      chdir: "{{ workspace }}/{{ deployment_name }}/packer/custom-image"
+      chdir: "{{ workspace }}/{{ deployment_name }}/{{ packer_group_name }}/{{ packer_module_id }}"
       executable: /bin/bash
 - name: Trigger Cloud Build failure
   when: ghpc_destroy.failed or image_deletion.failed
diff --git a/tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml b/tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml
index f6f2b93ef0..deb3d3eedc 100644
--- a/tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml
+++ b/tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml
@@ -20,6 +20,8 @@ zone: us-central1-c
 workspace: /workspace
 blueprint_yaml: "{{ workspace }}/community/examples/hpc-build-slurm-image.yaml"
 network: "{{ deployment_name }}-net"
+packer_group_name: build-slurm
+packer_module_id: slurm-custom-image
 cli_deployment_vars:
   network_name: "{{ network }}"
   subnetwork_name: "{{ network }}-sub"
diff --git a/tools/cloud-build/daily-tests/tests/packer.yml b/tools/cloud-build/daily-tests/tests/packer.yml
index 54fb9ddd28..70dd6c9597 100644
--- a/tools/cloud-build/daily-tests/tests/packer.yml
+++ b/tools/cloud-build/daily-tests/tests/packer.yml
@@ -20,6 +20,8 @@ zone: us-central1-c
 workspace: /workspace
 blueprint_yaml: "{{ workspace }}/examples/image-builder.yaml"
 network: "{{ deployment_name }}-net"
+packer_group_name: packer
+packer_module_id: custom-image
 cli_deployment_vars:
   network_name: "{{ network }}"
   subnetwork_name: "{{ network }}-sub"

From 8fc78d30c68a6053d36c011176cc8171e5ce8d20 Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Tue, 23 Jan 2024 19:39:02 -0800
Subject: [PATCH 085/151] Fix: old ansible was not compatible with selinux package, pin to latest

---
 community/examples/hpc-build-slurm-image.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml
index 7fb3e1b25d..6a82fe8d9f 100644
--- a/community/examples/hpc-build-slurm-image.yaml
+++ b/community/examples/hpc-build-slurm-image.yaml
@@ -51,7 +51,7 @@
           dnf install -y python38
           alternatives --set python3 /usr/bin/python3.8
           python3 -m pip install pip --upgrade
-          python3 -m pip install 
ansible==4.10.0 + python3 -m pip install ansible==6.7.0 python3 -m pip install selinux export PATH=/usr/local/bin:$PATH ansible --version From f03c2ca7d46300aa028a05c542eaa79d284e4648 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 24 Jan 2024 15:41:27 -0800 Subject: [PATCH 086/151] Fix false-positive `test_deployment_variable_not_used` (#2164) Preserve original state of `Vars` --- pkg/config/config.go | 14 ++++++++------ pkg/config/config_test.go | 33 +++++++++++++++++++++------------ 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index 39087a31d2..0b733cc015 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -245,6 +245,10 @@ type Blueprint struct { Vars Dict DeploymentGroups []DeploymentGroup `yaml:"deployment_groups"` TerraformBackendDefaults TerraformBackend `yaml:"terraform_backend_defaults,omitempty"` + + // Preserves the original values of `Vars` (as defined by the user), + // while `Vars` can mutate (add `labels`, evaluate values). + origVars Dict } // DeploymentConfig is a container for the imported YAML data and supporting data for @@ -255,6 +259,7 @@ type DeploymentConfig struct { // ExpandConfig expands the yaml config in place func (dc *DeploymentConfig) ExpandConfig() error { + dc.Config.origVars = NewDict(dc.Config.Vars.Items()) // copy dc.Config.setGlobalLabels() dc.Config.addKindToModules() @@ -313,7 +318,7 @@ func GetUsedDeploymentVars(val cty.Value) []string { func (bp Blueprint) ListUnusedVariables() []string { // Gather all scopes where variables are used ns := map[string]cty.Value{ - "vars": bp.Vars.AsObject(), + "vars": bp.origVars.AsObject(), } bp.WalkModulesSafe(func(_ ModulePath, m *Module) { ns["module_"+string(m.ID)] = m.Settings.AsObject() @@ -322,22 +327,19 @@ func (bp Blueprint) ListUnusedVariables() []string { ns["validator_"+v.Validator] = v.Inputs.AsObject() } - // these variables are required or automatically added; var used = map[string]bool{ - "labels": true, - "deployment_name": true, + "deployment_name": true, // required => always used } for _, v := range GetUsedDeploymentVars(cty.ObjectVal(ns)) { used[v] = true } unused := []string{} - for k := range bp.Vars.Items() { + for k := range bp.origVars.Items() { if _, ok := used[k]; !ok { unused = append(unused, k) } } - return unused } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 99ec466f3c..a04ff26a24 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -343,18 +343,27 @@ func (s *zeroSuite) TestListUnusedModules(c *C) { } } -func (s *MySuite) TestListUnusedVariables(c *C) { - dc := s.getDeploymentConfigForTest() - dc.applyGlobalVariables() - - unusedVars := dc.Config.ListUnusedVariables() - c.Assert(unusedVars, DeepEquals, []string{"project_id"}) - - dc = s.getMultiGroupDeploymentConfig() - dc.applyGlobalVariables() - - unusedVars = dc.Config.ListUnusedVariables() - c.Assert(unusedVars, DeepEquals, []string{"unused_key"}) +func (s *zeroSuite) TestListUnusedVariables(c *C) { + bp := Blueprint{ + Vars: NewDict(map[string]cty.Value{ + "deployment_name": cty.StringVal("green"), + "flathead_screw": cty.NumberIntVal(1), + "pony": cty.NumberIntVal(2), + "stripes": cty.NumberIntVal(3), + "zebra": MustParseExpression("var.pony + var.stripes").AsValue(), + }), + DeploymentGroups: []DeploymentGroup{{Modules: []Module{{ + Settings: NewDict(map[string]cty.Value{ + "circus": GlobalRef("pony").AsExpression().AsValue(), + }), + }}}}, + Validators: []Validator{{ + Inputs: 
NewDict(map[string]cty.Value{ + "savannah": GlobalRef("zebra").AsExpression().AsValue(), + })}}} + bp.origVars = NewDict(bp.Vars.Items()) + + c.Check(bp.ListUnusedVariables(), DeepEquals, []string{"flathead_screw"}) } func (s *zeroSuite) TestAddKindToModules(c *C) { From f8c497fd50b0699283d6927049595f00e6807bc9 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 24 Jan 2024 15:41:44 -0800 Subject: [PATCH 087/151] Bump test coverage for `pkg/modulewriter` (#2163) --- pkg/modulewriter/hcl_utils.go | 15 +--- pkg/modulewriter/modulewriter_test.go | 42 ++++------ pkg/modulewriter/tfversions.go | 32 -------- pkg/modulewriter/tfwriter.go | 111 +++++++++----------------- 4 files changed, 54 insertions(+), 146 deletions(-) delete mode 100644 pkg/modulewriter/tfversions.go diff --git a/pkg/modulewriter/hcl_utils.go b/pkg/modulewriter/hcl_utils.go index 74e0f33cb2..891d421867 100644 --- a/pkg/modulewriter/hcl_utils.go +++ b/pkg/modulewriter/hcl_utils.go @@ -15,9 +15,6 @@ package modulewriter import ( - "fmt" - "path/filepath" - "hpc-toolkit/pkg/config" "github.com/hashicorp/hcl/v2/hclwrite" @@ -26,10 +23,6 @@ import ( // WriteHclAttributes writes tfvars/pkvars.hcl files func WriteHclAttributes(vars map[string]cty.Value, dst string) error { - if err := createBaseFile(dst); err != nil { - return fmt.Errorf("error creating variables file %v: %v", filepath.Base(dst), err) - } - hclFile := hclwrite.NewEmptyFile() hclBody := hclFile.Body() for _, k := range orderKeys(vars) { @@ -37,11 +30,5 @@ func WriteHclAttributes(vars map[string]cty.Value, dst string) error { toks := config.TokensForValue(vars[k]) hclBody.SetAttributeRaw(k, toks) } - - hclBytes := hclFile.Bytes() - err := appendHCLToFile(dst, hclBytes) - if err != nil { - return fmt.Errorf("error writing HCL to %v: %v", filepath.Base(dst), err) - } - return err + return writeHclFile(dst, hclFile) } diff --git a/pkg/modulewriter/modulewriter_test.go b/pkg/modulewriter/modulewriter_test.go index 2e716b0394..9e9ce55b78 100644 --- a/pkg/modulewriter/modulewriter_test.go +++ b/pkg/modulewriter/modulewriter_test.go @@ -263,12 +263,15 @@ func TestGetTypeTokensRelaxed(t *testing.T) { } } -func (s *MySuite) TestCreateBaseFile(c *C) { +func (s *MySuite) TestWriteHclFile(c *C) { + hclF := hclwrite.NewEmptyFile() + hclF.Body().SetAttributeValue("zebra", cty.NumberIntVal(0)) + // Success baseFilename := "main.tf_TestCreateBaseFile" goodPath := filepath.Join(s.testDir, baseFilename) - err := createBaseFile(goodPath) - c.Assert(err, IsNil) + c.Assert(writeHclFile(goodPath, hclF), IsNil) + fi, err := os.Stat(goodPath) c.Assert(err, IsNil) c.Assert(fi.Name(), Equals, baseFilename) @@ -277,26 +280,11 @@ func (s *MySuite) TestCreateBaseFile(c *C) { b, _ := os.ReadFile(goodPath) c.Assert(strings.Contains(string(b), "Licensed under the Apache License"), Equals, true) + c.Assert(strings.Contains(string(b), "zebra"), Equals, true) // Error: not a correct path fakePath := filepath.Join("not/a/real/dir", "main.tf_TestCreateBaseFile") - err = createBaseFile(fakePath) - c.Assert(err, ErrorMatches, ".* no such file or directory") -} - -func (s *MySuite) TestAppendHCLToFile(c *C) { - // Setup - testFilename := "main.tf_TestAppendHCLToFile" - testPath := filepath.Join(s.testDir, testFilename) - _, err := os.Create(testPath) - c.Assert(err, IsNil) - hclFile := hclwrite.NewEmptyFile() - hclBody := hclFile.Body() - hclBody.SetAttributeValue("dummyAttributeName", cty.NumberIntVal(0)) - - // Success - err = appendHCLToFile(testPath, hclFile.Bytes()) - c.Assert(err, IsNil) + 
c.Assert(writeHclFile(fakePath, hclF), ErrorMatches, ".* no such file or directory") } func stringExistsInFile(str string, filename string) (bool, error) { @@ -373,7 +361,10 @@ func (s *MySuite) TestWriteOutputs(c *C) { // Success: Outputs added outputList := []modulereader.OutputInfo{ {Name: "output1"}, - {Name: "output2"}, + { + Name: "output2", + Sensitive: true, + }, } moduleWithOutputs := config.Module{Outputs: outputList, ID: "testMod"} testModules = []config.Module{moduleWithOutputs} @@ -389,7 +380,7 @@ func (s *MySuite) TestWriteOutputs(c *C) { // Failure: Bad path err = writeOutputs(testModules, "not/a/real/path") - c.Assert(err, ErrorMatches, "error creating outputs.tf file: .*") + c.Assert(err, ErrorMatches, ".*outputs.tf.*") } @@ -410,7 +401,7 @@ func (s *MySuite) TestWriteVariables(c *C) { // Failure: Bad path err = writeVariables(testVars, noIntergroupVars, "not/a/real/path") - c.Assert(err, ErrorMatches, "error creating variables.tf file: .*") + c.Assert(err, NotNil) // Success, common vars testVars["deployment_name"] = cty.StringVal("test_deployment") @@ -445,8 +436,7 @@ func (s *MySuite) TestWriteProviders(c *C) { c.Assert(exists, Equals, false) // Failure: Bad Path - err = writeProviders(testVars, "not/a/real/path") - c.Assert(err, ErrorMatches, "error creating providers.tf file: .*") + c.Assert(writeProviders(testVars, "not/a/real/path"), NotNil) // Success: All vars testVars["project_id"] = cty.StringVal("test_project") @@ -520,7 +510,7 @@ func (s *MySuite) TestWritePackerAutoVars(c *C) { // fail writing to a bad path badDestPath := "not/a/real/path" err := writePackerAutovars(vars.Items(), badDestPath) - expErr := fmt.Sprintf("error creating variables file %s:.*", packerAutoVarFilename) + expErr := fmt.Sprintf(".*%s.*", packerAutoVarFilename) c.Assert(err, ErrorMatches, expErr) // success diff --git a/pkg/modulewriter/tfversions.go b/pkg/modulewriter/tfversions.go deleted file mode 100644 index 6c1992034b..0000000000 --- a/pkg/modulewriter/tfversions.go +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2022 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package modulewriter - -const tfversions string = ` -terraform { - required_version = ">= 1.2" - - required_providers { - google = { - source = "hashicorp/google" - version = "~> 4.84.0" - } - google-beta = { - source = "hashicorp/google-beta" - version = "~> 4.84.0" - } - } -} -` diff --git a/pkg/modulewriter/tfwriter.go b/pkg/modulewriter/tfwriter.go index 60179f7223..dc1a9975a2 100644 --- a/pkg/modulewriter/tfwriter.go +++ b/pkg/modulewriter/tfwriter.go @@ -42,26 +42,17 @@ const ( // TFWriter writes terraform to the blueprint folder type TFWriter struct{} -// createBaseFile creates a baseline file for all terraform/hcl including a -// license and any other boilerplate -func createBaseFile(path string) error { - baseFile, err := os.Create(path) +func writeHclFile(path string, hclFile *hclwrite.File) error { + f, err := os.Create(path) if err != nil { - return err + return fmt.Errorf("error writing %q: %v", path, err) } - defer baseFile.Close() - _, err = baseFile.WriteString(license) - return err -} - -func appendHCLToFile(path string, hclBytes []byte) error { - file, err := os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0644) - if err != nil { - return err + defer f.Close() + if _, err := f.WriteString(license); err != nil { + return fmt.Errorf("error writing %q: %v", path, err) } - defer file.Close() - if _, err = file.Write(hclBytes); err != nil { - return err + if _, err := f.Write(hclwrite.Format(hclFile.Bytes())); err != nil { + return fmt.Errorf("error writing %q: %v", path, err) } return nil } @@ -101,16 +92,7 @@ func writeOutputs( if len(outputs) == 0 { return nil } - hclBytes := hclFile.Bytes() - outputsPath := filepath.Join(dst, "outputs.tf") - if err := createBaseFile(outputsPath); err != nil { - return fmt.Errorf("error creating outputs.tf file: %v", err) - } - err := appendHCLToFile(outputsPath, hclBytes) - if err != nil { - return fmt.Errorf("error writing HCL to outputs.tf file: %v", err) - } - return nil + return writeHclFile(filepath.Join(dst, "outputs.tf"), hclFile) } func writeTfvars(vars map[string]cty.Value, dst string) error { @@ -135,12 +117,6 @@ func getTypeTokens(ty cty.Type) hclwrite.Tokens { } func writeVariables(vars map[string]cty.Value, extraVars []modulereader.VarInfo, dst string) error { - // Create file - variablesPath := filepath.Join(dst, "variables.tf") - if err := createBaseFile(variablesPath); err != nil { - return fmt.Errorf("error creating variables.tf file: %v", err) - } - var inputs []modulereader.VarInfo for k, v := range vars { inputs = append(inputs, modulereader.VarInfo{ @@ -165,11 +141,7 @@ func writeVariables(vars map[string]cty.Value, extraVars []modulereader.VarInfo, blockBody.SetAttributeRaw("type", getTypeTokens(k.Type)) } - // Write file - if err := appendHCLToFile(variablesPath, hclFile.Bytes()); err != nil { - return fmt.Errorf("error writing HCL to variables.tf file: %v", err) - } - return nil + return writeHclFile(filepath.Join(dst, "variables.tf"), hclFile) } func writeMain( @@ -177,13 +149,6 @@ func writeMain( tfBackend config.TerraformBackend, dst string, ) error { - // Create file - mainPath := filepath.Join(dst, "main.tf") - if err := createBaseFile(mainPath); err != nil { - return fmt.Errorf("error creating main.tf file: %v", err) - } - - // Create HCL Body hclFile := hclwrite.NewEmptyFile() hclBody := hclFile.Body() @@ -218,25 +183,13 @@ func writeMain( moduleBody.SetAttributeRaw(setting, config.TokensForValue(value)) } } - // Write file - hclBytes := hclFile.Bytes() - hclBytes = hclwrite.Format(hclBytes) - if err := 
appendHCLToFile(mainPath, hclBytes); err != nil { - return fmt.Errorf("error writing HCL to main.tf file: %v", err) - } - return nil + + return writeHclFile(filepath.Join(dst, "main.tf"), hclFile) } var simpleTokens = hclwrite.TokensForIdentifier func writeProviders(vars map[string]cty.Value, dst string) error { - // Create file - providersPath := filepath.Join(dst, "providers.tf") - if err := createBaseFile(providersPath); err != nil { - return fmt.Errorf("error creating providers.tf file: %v", err) - } - - // Create HCL Body hclFile := hclwrite.NewEmptyFile() hclBody := hclFile.Body() @@ -254,26 +207,36 @@ func writeProviders(vars map[string]cty.Value, dst string) error { provBody.SetAttributeRaw("region", simpleTokens("var.region")) } } - - // Write file - hclBytes := hclFile.Bytes() - if err := appendHCLToFile(providersPath, hclBytes); err != nil { - return fmt.Errorf("error writing HCL to providers.tf file: %v", err) - } - return nil + return writeHclFile(filepath.Join(dst, "providers.tf"), hclFile) } func writeVersions(dst string) error { - // Create file - versionsPath := filepath.Join(dst, "versions.tf") - if err := createBaseFile(versionsPath); err != nil { - return fmt.Errorf("error creating versions.tf file: %v", err) + f := hclwrite.NewEmptyFile() + body := f.Body() + body.AppendNewline() + tfb := body.AppendNewBlock("terraform", []string{}).Body() + tfb.SetAttributeValue("required_version", cty.StringVal(">= 1.2")) + tfb.AppendNewline() + + type provider struct { + alias string + source string + version string } - // Write hard-coded version information - if err := appendHCLToFile(versionsPath, []byte(tfversions)); err != nil { - return fmt.Errorf("error writing HCL to versions.tf file: %v", err) + providers := []provider{ + {"google", "hashicorp/google", "~> 4.84.0"}, + {"google-beta", "hashicorp/google-beta", "~> 4.84.0"}, } - return nil + + pb := tfb.AppendNewBlock("required_providers", []string{}).Body() + + for _, p := range providers { + pb.SetAttributeValue(p.alias, cty.ObjectVal(map[string]cty.Value{ + "source": cty.StringVal(p.source), + "version": cty.StringVal(p.version), + })) + } + return writeHclFile(filepath.Join(dst, "versions.tf"), f) } func writeTerraformInstructions(w io.Writer, grpPath string, n config.GroupName, printExportOutputs bool, printImportInputs bool) { From f1697649edeee3a5eb742f661ccbe36d24a61654 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 24 Jan 2024 16:44:10 -0800 Subject: [PATCH 088/151] Add `--force` flag to `ghpc create` (#2162) --- cmd/create.go | 29 +++++++++++++----- cmd/create_test.go | 75 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 73 insertions(+), 31 deletions(-) diff --git a/cmd/create.go b/cmd/create.go index c76c640a2a..299a777926 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -52,6 +52,10 @@ func init() { "Note: Terraform state IS preserved. \n"+ "Note: Terraform workspaces are NOT supported (behavior undefined). \n"+ "Note: Packer is NOT supported.") + createCmd.Flags().BoolVar(&forceOverwrite, "force", false, + "Forces overwrite of existing deployment directory. \n"+ + "If set, --overwrite-deployment is implied. 
\n"+ + "No validation is performed on the existing deployment directory.") rootCmd.AddCommand(createCmd) } @@ -62,6 +66,7 @@ var ( cliBEConfigVars []string overwriteDeployment bool + forceOverwrite bool validationLevel string validationLevelDesc = "Set validation level to one of (\"ERROR\", \"WARNING\", \"IGNORE\")" validatorsToSkip []string @@ -80,7 +85,7 @@ var ( func runCreateCmd(cmd *cobra.Command, args []string) { dc := expandOrDie(args[0]) deplDir := filepath.Join(outputDir, dc.Config.DeploymentName()) - checkErr(checkOverwriteAllowed(deplDir, dc.Config, overwriteDeployment)) + checkErr(checkOverwriteAllowed(deplDir, dc.Config, overwriteDeployment, forceOverwrite)) checkErr(modulewriter.WriteDeployment(dc, deplDir)) logging.Info("To deploy your infrastructure please run:") @@ -235,29 +240,37 @@ func filterYaml(cmd *cobra.Command, args []string, toComplete string) ([]string, return []string{"yaml", "yml"}, cobra.ShellCompDirectiveFilterFileExt } +func forceErr(err error) error { + return config.HintError{ + Err: err, + Hint: "Use `--force` to overwrite the deployment anyway. Proceed at your own risk."} +} + // Determines if overwrite is allowed -func checkOverwriteAllowed(depDir string, bp config.Blueprint, overwriteFlag bool) error { - if _, err := os.Stat(depDir); os.IsNotExist(err) { +func checkOverwriteAllowed(depDir string, bp config.Blueprint, overwriteFlag bool, forceFlag bool) error { + if _, err := os.Stat(depDir); os.IsNotExist(err) || forceFlag { return nil // all good, no previous deployment } if _, err := os.Stat(modulewriter.HiddenGhpcDir(depDir)); os.IsNotExist(err) { // hidden ghpc dir does not exist - return fmt.Errorf("folder %q already exists, and it is not a valid GHPC deployment folder", depDir) + return forceErr(fmt.Errorf("folder %q already exists, and it is not a valid GHPC deployment folder", depDir)) } // try to get previous deployment expPath := filepath.Join(modulewriter.ArtifactsDir(depDir), modulewriter.ExpandedBlueprintName) if _, err := os.Stat(expPath); os.IsNotExist(err) { - return fmt.Errorf("expanded blueprint file %q is missing, this could be a result of changing GHPC version between consecutive deployments", expPath) + return forceErr(fmt.Errorf("expanded blueprint file %q is missing, this could be a result of changing GHPC version between consecutive deployments", expPath)) } prev, _, err := config.NewDeploymentConfig(expPath) if err != nil { - return err + return forceErr(err) } if prev.Config.GhpcVersion != bp.GhpcVersion { - logging.Info("WARNING: ghpc_version has changed from %q to %q, using different versions of GHPC to update a live deployment is not officially supported. 
Proceed at your own risk", prev.Config.GhpcVersion, bp.GhpcVersion) + return forceErr(fmt.Errorf( + "ghpc_version has changed from %q to %q, using different versions of GHPC to update a live deployment is not officially supported", + prev.Config.GhpcVersion, bp.GhpcVersion)) } if !overwriteFlag { @@ -271,7 +284,7 @@ func checkOverwriteAllowed(depDir string, bp config.Blueprint, overwriteFlag boo for _, g := range prev.Config.DeploymentGroups { if !newGroups[g.Name] { - return fmt.Errorf("you are attempting to remove a deployment group %q, which is not supported", g.Name) + return forceErr(fmt.Errorf("you are attempting to remove a deployment group %q, which is not supported", g.Name)) } } diff --git a/cmd/create_test.go b/cmd/create_test.go index b3f11714ee..25d6b6d11e 100644 --- a/cmd/create_test.go +++ b/cmd/create_test.go @@ -146,16 +146,20 @@ func (s *MySuite) TestIsOverwriteAllowed_Absent(c *C) { depDir := filepath.Join(testDir, "casper") bp := config.Blueprint{} - c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/), IsNil) - c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/), IsNil) + c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/, false /*forceOverwrite*/), IsNil) + c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/, false /*forceOverwrite*/), IsNil) } func (s *MySuite) TestIsOverwriteAllowed_NotGHPC(c *C) { depDir := c.MkDir() // empty deployment folder considered malformed bp := config.Blueprint{} - c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/), ErrorMatches, ".* not a valid GHPC deployment folder") - c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/), ErrorMatches, ".* not a valid GHPC deployment folder") + c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/, false /*forceOverwrite*/), + ErrorMatches, ".* not a valid GHPC deployment folder.*") + c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/, false /*forceOverwrite*/), + ErrorMatches, ".* not a valid GHPC deployment folder.*") + + c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/, true /*forceOverwrite*/), IsNil) } func (s *MySuite) TestIsOverwriteAllowed_NoExpanded(c *C) { @@ -165,8 +169,12 @@ func (s *MySuite) TestIsOverwriteAllowed_NoExpanded(c *C) { } bp := config.Blueprint{} - c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/), ErrorMatches, ".* changing GHPC version.*") - c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/), ErrorMatches, ".* changing GHPC version.*") + c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/, false /*forceOverwrite*/), + ErrorMatches, ".* changing GHPC version.*") + c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/, false /*forceOverwrite*/), + ErrorMatches, ".* changing GHPC version.*") + + c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/, true /*forceOverwrite*/), IsNil) } func (s *MySuite) TestIsOverwriteAllowed_Malformed(c *C) { @@ -180,36 +188,57 @@ func (s *MySuite) TestIsOverwriteAllowed_Malformed(c *C) { } bp := config.Blueprint{} - c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/), NotNil) - c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/), NotNil) + c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/, false /*forceOverwrite*/), NotNil) + c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/, false /*forceOverwrite*/), NotNil) + // force + c.Check(checkOverwriteAllowed(depDir, bp, false /*overwriteFlag*/, true 
/*forceOverwrite*/), IsNil)
+	c.Check(checkOverwriteAllowed(depDir, bp, true /*overwriteFlag*/, true /*forceOverwrite*/), IsNil)
 }
 
 func (s *MySuite) TestIsOverwriteAllowed_Present(c *C) {
-	depDir := c.MkDir()
-	artDir := modulewriter.ArtifactsDir(depDir)
+	p := c.MkDir()
+	artDir := modulewriter.ArtifactsDir(p)
 	if err := os.MkdirAll(artDir, 0755); err != nil {
 		c.Fatal(err)
 	}
 	prev := config.DeploymentConfig{
 		Config: config.Blueprint{
-			GhpcVersion: "TaleOdBygoneYears",
+			GhpcVersion: "TaleOfBygoneYears",
 			DeploymentGroups: []config.DeploymentGroup{
 				{Name: "isildur"}}}}
 	if err := prev.ExportBlueprint(filepath.Join(artDir, "expanded_blueprint.yaml")); err != nil {
 		c.Fatal(err)
 	}
+	noW, yesW, noForce, yesForce := false, true, false, true
+
+	{ // Superset
+		bp := config.Blueprint{
+			GhpcVersion: "TaleOfBygoneYears",
+			DeploymentGroups: []config.DeploymentGroup{
+				{Name: "isildur"},
+				{Name: "elendil"}}}
+		c.Check(checkOverwriteAllowed(p, bp, noW, noForce), ErrorMatches, ".* already exists, use -w to overwrite")
+		c.Check(checkOverwriteAllowed(p, bp, yesW, noForce), IsNil)
+	}
 
-	super := config.Blueprint{
-		DeploymentGroups: []config.DeploymentGroup{
-			{Name: "isildur"},
-			{Name: "elendil"}}}
-	c.Check(checkOverwriteAllowed(depDir, super, false /*overwriteFlag*/), ErrorMatches, ".* already exists, use -w to overwrite")
-	c.Check(checkOverwriteAllowed(depDir, super, true /*overwriteFlag*/), IsNil)
-
-	sub := config.Blueprint{
-		DeploymentGroups: []config.DeploymentGroup{
-			{Name: "aragorn"}}}
-	c.Check(checkOverwriteAllowed(depDir, sub, false /*overwriteFlag*/), ErrorMatches, `.* already exists, use -w to overwrite`)
-	c.Check(checkOverwriteAllowed(depDir, sub, true /*overwriteFlag*/), ErrorMatches, `.*remove a deployment group "isildur".*`)
+	{ // Version mismatch
+		bp := config.Blueprint{
+			GhpcVersion: "TheAlloyOfLaw",
+			DeploymentGroups: []config.DeploymentGroup{
+				{Name: "isildur"}}}
+		c.Check(checkOverwriteAllowed(p, bp, noW, noForce), ErrorMatches, ".*ghpc_version has changed.*")
+		c.Check(checkOverwriteAllowed(p, bp, yesW, noForce), ErrorMatches, ".*ghpc_version has changed.*")
+		c.Check(checkOverwriteAllowed(p, bp, noW, yesForce), IsNil)
+	}
+
+	{ // Subset
+		bp := config.Blueprint{
+			GhpcVersion: "TaleOfBygoneYears",
+			DeploymentGroups: []config.DeploymentGroup{
+				{Name: "aragorn"}}}
+		c.Check(checkOverwriteAllowed(p, bp, noW, noForce), ErrorMatches, `.* already exists, use -w to overwrite`)
+		c.Check(checkOverwriteAllowed(p, bp, yesW, noForce), ErrorMatches, `.*remove a deployment group "isildur".*`)
+		c.Check(checkOverwriteAllowed(p, bp, noW, yesForce), IsNil)
+	}
 }

From fe99ac8f1d4496d1c81f23b19dc7d541a2eeff6e Mon Sep 17 00:00:00 2001
From: Ivan Orlov
Date: Wed, 24 Jan 2024 17:00:30 -0800
Subject: [PATCH 089/151] Improve error logging for expressions parsing
 (#2078)

* Show a snippet with a pointer to the offending column;

```sh
Error: :0,21-22: Invalid character; This character is not used within the language., and 1 other diagnostic(s)

34: content: |
Error: Invalid character; This character is not used within the language.
  echo "Hello $(vars.project_id from $(vars.region)"
                       ^
33: content: |
```

* Prevent line-breaks within expressions. This constraint existed before, but was accidentally relaxed by a recent PR.
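To make the new rendering concrete before the diff: a minimal, self-contained Go sketch of the same caret technique. The helper name `renderParseErr`, the sample message, and the hard-coded column are illustrative stand-ins for the `prepareParseHclErr` helper introduced below, not part of the actual change.

```go
package main

import (
	"fmt"
	"strings"
)

// renderParseErr formats an error message followed by the offending line and
// a caret (^) under the zero-indexed column where parsing failed, mirroring
// the format string used by prepareParseHclErr in this patch.
func renderParseErr(msg, line string, col int) string {
	return fmt.Sprintf("%s\n  %s\n  %s^", msg, line, strings.Repeat(" ", col))
}

func main() {
	// Column 21 matches the ":0,21-22" position from the example above.
	line := `echo "Hello $(vars.project_id from $(vars.region)"`
	fmt.Println(renderParseErr("Invalid character; This character is not used within the language.", line, 21))
}
```

Rendering the caret against the single offending line sidesteps the missing global YAML position entirely: the pointer stays accurate no matter how the string was quoted or indented in the blueprint.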
---
 pkg/config/expression.go      | 40 +++++++++++++++++++++++++++++++--
 pkg/config/expression_test.go |  3 +++
 pkg/config/yaml.go            |  2 +-
 3 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/pkg/config/expression.go b/pkg/config/expression.go
index bf74670551..d62539d7ac 100644
--- a/pkg/config/expression.go
+++ b/pkg/config/expression.go
@@ -384,7 +384,8 @@ type pToken struct {
 	e Expression
 }
 
-func tokenizeBpString(s string) ([]pToken, error) {
+func tokenizeBpLine(s string) ([]pToken, error) {
+	line := s // copy
 	toks := []pToken{}
 	var exp Expression
 	var err error
@@ -405,9 +406,10 @@ func tokenizeBpString(s string) ([]pToken, error) {
 		if bs%2 == 1 { // escaped $(
 			toks = append(toks, pToken{s: "$("}) // add "$("
 		} else { // found beginning of expression
+			offset := len(line) - len(s)
 			exp, s, err = greedyParseHcl(s) // parse after "$("
 			if err != nil {
-				return nil, err
+				return nil, prepareParseHclErr(err, line, offset)
 			}
 			toks = append(toks, pToken{e: exp}) // add expression
 		}
@@ -415,6 +417,40 @@ func tokenizeBpString(s string) ([]pToken, error) {
 	return toks, nil
 }
 
+// One can't translate an HCL diagnostic's position into a global YAML position:
+// the YAML string style (e.g. double-quoted, plain, folded, etc.) is not known here,
+// nor are the string's start position and indentation within the YAML document.
+// Render the error in the scope of the single offending line instead.
+func prepareParseHclErr(err error, line string, offset int) error {
+	var col int
+	if diag, is := err.(hcl.Diagnostics); is {
+		derr, _ := diag.Errs()[0].(*hcl.Diagnostic)
+		col = offset + derr.Subject.Start.Column
+		err = fmt.Errorf("%s; %s", derr.Summary, derr.Detail)
+	} else {
+		col = offset // point at the beginning of the expression
+	}
+	return fmt.Errorf("%s\n  %s\n  %s^", err, line, strings.Repeat(" ", col))
+}
+
+func tokenizeBpString(s string) ([]pToken, error) {
+	toks := []pToken{}
+
+	// can't use `bufio.NewScanner` as it doesn't preserve trailing empty lines
+	lines := regexp.MustCompile("\r?\n").Split(s, -1)
+	for _, line := range lines {
+		if len(toks) > 0 {
+			toks = append(toks, pToken{s: "\n"})
+		}
+		ltoks, err := tokenizeBpLine(line)
+		if err != nil {
+			return nil, err
+		}
+		toks = append(toks, ltoks...)
+	}
+	return toks, nil
+}
+
 func compactTokens(toks []pToken) []pToken {
 	res := []pToken{}
 	for _, t := range toks {
diff --git a/pkg/config/expression_test.go b/pkg/config/expression_test.go
index d8af75c209..39b0219b68 100644
--- a/pkg/config/expression_test.go
+++ b/pkg/config/expression_test.go
@@ -106,6 +106,9 @@ was here`, `"5gold\nwas here"`, false},
 		{`#!/bin/bash
 echo "Hello $(vars.project_id) from $(vars.region)"`, `"#!/bin/bash\necho \"Hello ${var.project_id} from ${var.region}\""`, false},
+		{`#!/bin/bash
+echo "Hello $(vars.project_id)"
+`, `"#!/bin/bash\necho \"Hello ${var.project_id}\"\n"`, false},
 		{"", `""`, false},
 		{`$(try(vars.this) + one(vars.time))`, "try(var.this)+one(var.time)", false},
diff --git a/pkg/config/yaml.go b/pkg/config/yaml.go
index 264cc6daaf..05144fac42 100644
--- a/pkg/config/yaml.go
+++ b/pkg/config/yaml.go
@@ -388,7 +388,7 @@ func parseYamlV3Error(err error) error {
 // If no position can be extracted, returns error without position.
 // Else returns PosError{Pos{Line: line_number}, error_message}.
func parseYamlV3ErrorString(s string) error { - match := regexp.MustCompile(`^(yaml: )?(line (\d+): )?(.*)$`).FindStringSubmatch(s) + match := regexp.MustCompile(`^(yaml: )?(line (\d+): )?((.|\n)*)$`).FindStringSubmatch(s) if match == nil { return errors.New(s) } From 31ba8e1bbf30daa2fb4b14392c0e6470725e75df Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 25 Jan 2024 11:40:27 -0600 Subject: [PATCH 090/151] Remove quantum circuit simulator example --- .../examples/quantum-circuit-simulator.yaml | 145 ------------------ examples/README.md | 22 --- .../daily-tests/builds/quantum-circuit.yaml | 55 ------- tools/cloud-build/daily-tests/tests/qsim.yml | 24 --- 4 files changed, 246 deletions(-) delete mode 100644 community/examples/quantum-circuit-simulator.yaml delete mode 100644 tools/cloud-build/daily-tests/builds/quantum-circuit.yaml delete mode 100644 tools/cloud-build/daily-tests/tests/qsim.yml diff --git a/community/examples/quantum-circuit-simulator.yaml b/community/examples/quantum-circuit-simulator.yaml deleted file mode 100644 index 2876ae5b16..0000000000 --- a/community/examples/quantum-circuit-simulator.yaml +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -blueprint_name: quantum-circuit - -# Please review https://cloud.google.com/compute/docs/regions-zones -# for availability of A2 machine types -vars: - project_id: ## Set project id here - deployment_name: qsim-demo - region: us-central1 - zone: us-central1-f - -# Documentation for each of the modules used below can be found at -# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/vpc - - - id: quantum-simulator-setup - source: modules/scripts/startup-script - settings: - runners: - - type: shell - destination: install-qsim.sh - content: | - #!/bin/bash - # This script implements https://quantumai.google/qsim/tutorials/gcp_gpu - # Disable any user interactive prompt during upgrade script. - export DEBIAN_FRONTEND=noninteractive - set -e -o pipefail - curl -O https://raw.githubusercontent.com/GoogleCloudPlatform/compute-gpu-installation/main/linux/install_gpu_driver.py - python3 install_gpu_driver.py - curl -O https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-Linux-x86_64.sh - bash Miniconda3-py39_4.12.0-Linux-x86_64.sh -b -p /opt/conda - source /opt/conda/bin/activate base - conda init --system - conda config --system --set auto_activate_base False - # following channel ordering is important! use strict_priority! 
- # cuquantum comes from cuquantum label in nvidia channel - # libcutensor comes from main (default) label in nvidia channel - # cuda and all toolkit comes from cuda-11.5.2 label in nvidia channel - # everything else comes from conda-forge - conda config --system --set channel_priority strict - conda config --system --remove channels defaults - conda config --system --add channels conda-forge - conda config --system --add channels nvidia - conda config --system --add channels nvidia/label/cuda-11.5.2 - conda config --system --add channels nvidia/label/cuquantum-22.07.1 - conda update -n base conda --yes - conda create -n qsim python=3.9 --yes - conda install -n qsim cuda cuquantum make cmake cxx-compiler=1.5.1 --yes - echo "cuda ==11.5.*" > /opt/conda/envs/qsim/conda-meta/pinned - conda clean -p -t --yes - conda activate qsim - pip install pybind11 cirq - git clone -b v0.18.0 https://github.com/quantumlib/qsim.git /opt/qsim - cd /opt/qsim - export CUQUANTUM_ROOT=/opt/conda/envs/qsim - make - pip install . - - type: data - destination: /var/tmp/qsim-example.py - content: | - import sys - import time - import cirq, cirq_google - import qsimcirq - - def sim(width: int, height: int, reps: int, use_gpu: bool, gpu_mode: int): - rqc_fn = cirq.experiments.random_rotations_between_grid_interaction_layers_circuit - qvm_fn = cirq_google.engine.create_default_noisy_quantum_virtual_machine - - qubits = cirq.GridQubit.rect(width, height, 3, 2) - circuit = rqc_fn(qubits, depth=10, seed=0) + cirq.measure(*qubits, key="final_state") - - processor_id = "weber" - qsim_options = qsimcirq.QSimOptions(use_gpu=use_gpu, gpu_mode=gpu_mode) - # we do not recommend using seed=0 in production usage; in this - # example it helps compare performance between runs - qvm = qvm_fn(processor_id, qsimcirq.QSimSimulator, seed=0, qsim_options=qsim_options) - - start = time.time() - results = qvm.get_sampler(processor_id).run(circuit, repetitions=reps) - print(results) - print(f"elapsed: {time.time() - start:.03f}s") - - - if __name__ == "__main__": - width, height, reps = 5, 5, 10 - - print("This series of simulations should last approximately 1 minute on an A2 series VM\n") - print("Running on CPU:") - sys.stdout.flush() - sim(width=width, height=height, reps=reps, use_gpu=False, gpu_mode=0) - print("\nRunning on GPU (CUDA):") - sys.stdout.flush() - sim(width=width, height=height, reps=reps, use_gpu=True, gpu_mode=0) - print("\nRunning on GPU (CUDA + cuQuantum):") - sys.stdout.flush() - sim(width=width, height=height, reps=reps, use_gpu=True, gpu_mode=1) - - type: shell - destination: run-qsim.sh - content: | - #!/bin/bash -i - # The -i above (for interactive) is required so that conda command will be accessible. 
- # this script demonstrates how to run the qsim example application and - # also "warms up" the GPU to give reliable performance metrics - conda activate qsim - python /var/tmp/qsim-example.py - - - id: qsimvm - source: modules/compute/vm-instance - use: - - network1 - - quantum-simulator-setup - settings: - machine_type: n1-standard-32 - guest_accelerator: - - type: nvidia-tesla-t4 - count: 1 - instance_image: - project: ubuntu-os-cloud - family: ubuntu-2004-lts - - - id: wait - source: community/modules/scripts/wait-for-startup - settings: - instance_name: $(qsimvm.name[0]) - timeout: 2400 diff --git a/examples/README.md b/examples/README.md index 76368a0f05..be8eacbed6 100644 --- a/examples/README.md +++ b/examples/README.md @@ -25,7 +25,6 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [pfs-daos.yaml](#pfs-daosyaml-) ![community-badge] * [hpc-slurm-daos.yaml](#hpc-slurm-daosyaml-) ![community-badge] * [hpc-amd-slurm.yaml](#hpc-amd-slurmyaml-) ![community-badge] - * [quantum-circuit-simulator.yaml](#quantum-circuit-simulatoryaml-) ![community-badge] * [client-google-cloud-storage.yaml](#client-google-cloud-storageyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-gromacs.yaml](#hpc-slurm-gromacsyaml--) ![community-badge] ![experimental-badge] * [omnia-cluster.yaml](#omnia-clusteryaml--) ![community-badge] ![experimental-badge] @@ -666,27 +665,6 @@ examples][amd-examples-readme]. [AOCC]: https://developer.amd.com/amd-aocc/ [amd-examples-readme]: ../community/examples/AMD/README.md -### [quantum-circuit-simulator.yaml] ![community-badge] - -This blueprint provisions a [N1 series VM with NVIDIA T4 GPU accelerator][t4] -and compiles [qsim], a [Google Quantum AI][gqai]-developed tool that simulates -quantum circuits using CPUs and GPUs. The installation of qsim, the [CUDA -Toolkit][cudatk], and the [cuQuantum SDK][cqsdk] is fully automated but takes a -significant time (approx. 20 minutes). Once complete, a qsim example can be run -by connecting to the VM by SSH and running - -```shell -conda activate qsim -python /var/tmp/qsim-example.py -``` - -[gqai]: https://quantumai.google/ -[quantum-circuit-simulator.yaml]: ../community/examples/quantum-circuit-simulator.yaml -[t4]: https://cloud.google.com/compute/docs/gpus#nvidia_t4_gpus -[qsim]: https://quantumai.google/qsim -[cqsdk]: https://developer.nvidia.com/cuquantum-sdk -[cudatk]: https://developer.nvidia.com/cuda-toolkit - ### [client-google-cloud-storage.yaml] ![community-badge] ![experimental-badge] [client-google-cloud-storage.yaml]: ../community/examples/client-google-cloud-storage.yaml diff --git a/tools/cloud-build/daily-tests/builds/quantum-circuit.yaml b/tools/cloud-build/daily-tests/builds/quantum-circuit.yaml deleted file mode 100644 index 2d72aee381..0000000000 --- a/tools/cloud-build/daily-tests/builds/quantum-circuit.yaml +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -timeout: 14400s # 4hr -steps: -## Test simple golang build -- id: build_ghpc - waitFor: ["-"] - name: "golang:bullseye" - entrypoint: /bin/bash - args: - - -c - - | - cd /workspace - make -- id: fetch_builder - waitFor: ["-"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - -c - - echo "done fetching builder" - -# test quantum circuit simulator example (startup-script runs example code after -# compiling libraries) -- id: quantum-circuit - waitFor: ["fetch_builder", "build_ghpc"] - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/qsim.yml" diff --git a/tools/cloud-build/daily-tests/tests/qsim.yml b/tools/cloud-build/daily-tests/tests/qsim.yml deleted file mode 100644 index e6eca65caa..0000000000 --- a/tools/cloud-build/daily-tests/tests/qsim.yml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -test_name: quantum-circuit -deployment_name: "qsim-{{ build }}" -zone: us-central1-f -workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/quantum-circuit-simulator.yaml" -network: "{{ deployment_name }}-net" -remote_node: "{{ deployment_name }}-0" -post_deploy_tests: [] From 615ac7a7b178178a399b46bd80c3781a397213c4 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 17 Jan 2024 21:25:11 +0000 Subject: [PATCH 091/151] Update hpc-slurm-legacy-sharedvpc example and references to use Slurm V6 --- .../examples/hpc-slurm-legacy-sharedvpc.yaml | 62 ++++++++++--------- examples/README.md | 13 ++-- 2 files changed, 42 insertions(+), 33 deletions(-) diff --git a/community/examples/hpc-slurm-legacy-sharedvpc.yaml b/community/examples/hpc-slurm-legacy-sharedvpc.yaml index 9db8f115dd..d44d333140 100644 --- a/community/examples/hpc-slurm-legacy-sharedvpc.yaml +++ b/community/examples/hpc-slurm-legacy-sharedvpc.yaml @@ -55,46 +55,50 @@ deployment_groups: local_mount: /home connect_mode: PRIVATE_SERVICE_ACCESS - # This debug_partition will work out of the box without requesting additional GCP quota. 
+ - id: debug_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] + settings: + node_count_dynamic_max: 4 + machine_type: n2-standard-2 + enable_placement: false # the default is: true + - id: debug_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [debug_nodeset, homefs] settings: partition_name: debug - max_node_count: 4 - enable_placement: false - exclusive: false - machine_type: n2-standard-2 + exclusive: false # allows nodes to stay up after jobs are done + is_default: true - # This compute_partition is far more performant than debug_partition but may require requesting GCP quotas first. - - id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: - - network1 - - homefs + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] settings: - partition_name: compute - max_node_count: 20 + node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled - - id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - debug_partition # debug partition will be default as it is listed first - - compute_partition + - id: compute_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [compute_nodeset, homefs] settings: - login_node_count: 1 - shared_vpc_host_project: $(vars.host_project_id) + partition_name: compute - id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network1] + settings: + name_prefix: login + machine_type: n2-standard-4 + disable_login_public_ips: false + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network1 + - debug_partition + - compute_partition + - slurm_login - homefs - - slurm_controller settings: - shared_vpc_host_project: $(vars.host_project_id) + disable_controller_public_ips: false diff --git a/examples/README.md b/examples/README.md index be8eacbed6..0038bad968 100644 --- a/examples/README.md +++ b/examples/README.md @@ -25,6 +25,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [pfs-daos.yaml](#pfs-daosyaml-) ![community-badge] * [hpc-slurm-daos.yaml](#hpc-slurm-daosyaml-) ![community-badge] * [hpc-amd-slurm.yaml](#hpc-amd-slurmyaml-) ![community-badge] + * [hpc-slurm-legacy-sharedvpc.yaml](#hpc-slurm-legacy-sharedvpcyaml-) ![community-badge] * [client-google-cloud-storage.yaml](#client-google-cloud-storageyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-gromacs.yaml](#hpc-slurm-gromacsyaml--) ![community-badge] ![experimental-badge] * [omnia-cluster.yaml](#omnia-clusteryaml--) ![community-badge] ![experimental-badge] @@ -41,7 +42,6 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [hpc-slurm-chromedesktop.yaml](#hpc-slurm-chromedesktopyaml--) ![community-badge] ![experimental-badge] * [flux-cluster](#flux-clusteryaml--) ![community-badge] ![experimental-badge] * [tutorial-fluent.yaml](#tutorial-fluentyaml--) ![community-badge] ![experimental-badge] - * [hpc-slurm-legacy-sharedvpc.yaml](#hpc-slurm-legacy-sharedvpcyaml--) ![community-badge] ![deprecated-badge] * [Blueprint Schema](#blueprint-schema) * [Writing an HPC Blueprint](#writing-an-hpc-blueprint) * [Blueprint Boilerplate](#blueprint-boilerplate) @@ -977,12 +977,17 
@@ See [README](../community/examples/flux-framework/README.md)
 
 [flux-cluster.yaml]: ../community/examples/flux-framework/flux-cluster.yaml
 
-### [hpc-slurm-legacy-sharedvpc.yaml] ![community-badge] ![deprecated-badge]
+### [hpc-slurm-legacy-sharedvpc.yaml] ![community-badge]
 
 This blueprint demonstrates the use of the Slurm and Filestore modules in
-the service project of an existing Shared VPC. Before attempting to deploy the
+the service project of an existing Shared VPC. Before attempting to deploy the
 blueprint, one must first complete [initial setup for provisioning Filestore in
-a Shared VPC service project][fs-shared-vpc].
+a Shared VPC service project][fs-shared-vpc]. Depending on how the shared VPC
+was created, one may have to perform a few additional manual steps to configure
+the VPC, such as creating firewall rules that allow SSH access to the
+controller and login nodes. Also, since this blueprint doesn't use external
+IPs for compute nodes, one must [set up Cloud NAT][cloudnat] and
+[set up IAP][iap].
 
 [hpc-slurm-legacy-sharedvpc.yaml]: ../community/examples/hpc-slurm-legacy-sharedvpc.yaml
 [fs-shared-vpc]: https://cloud.google.com/filestore/docs/shared-vpc

From 967511c1ba401100c2cb6f0be3b6d91fb80940e5 Mon Sep 17 00:00:00 2001
From: Ivan Orlov
Date: Thu, 25 Jan 2024 13:10:11 -0800
Subject: [PATCH 092/151] Bump `cmd` test coverage (#2165)

---
 cmd/create.go             | 12 +++---------
 cmd/create_test.go        |  7 +++++--
 pkg/config/config.go      |  3 +--
 pkg/config/config_test.go | 10 +++++-----
 4 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/cmd/create.go b/cmd/create.go
index 299a777926..ccf5dc3124 100644
--- a/cmd/create.go
+++ b/cmd/create.go
@@ -115,7 +115,7 @@ func expandOrDie(path string) config.DeploymentConfig {
 		logging.Fatal("Failed to set the backend config at CLI: %v", err)
 	}
 	checkErr(setValidationLevel(&dc.Config, validationLevel))
-	checkErr(skipValidators(&dc))
+	skipValidators(&dc)
 
 	if dc.Config.GhpcVersion != "" {
 		logging.Info("ghpc_version setting is ignored.")
@@ -221,16 +221,10 @@ func setValidationLevel(bp *config.Blueprint, s string) error {
 	return nil
 }
 
-func skipValidators(dc *config.DeploymentConfig) error {
-	if validatorsToSkip == nil {
-		return nil
-	}
+func skipValidators(dc *config.DeploymentConfig) {
 	for _, v := range validatorsToSkip {
-		if err := dc.SkipValidator(v); err != nil {
-			return err
-		}
+		dc.SkipValidator(v)
 	}
-	return nil
 }
 
 func filterYaml(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) {
diff --git a/cmd/create_test.go b/cmd/create_test.go
index 25d6b6d11e..891c1680df 100644
--- a/cmd/create_test.go
+++ b/cmd/create_test.go
@@ -74,9 +74,12 @@ func (s *MySuite) TestSetCLIVariables(c *C) {
 	// Failure: Variable without '='
 	bp = config.Blueprint{}
 	inv := []string{"project_idcli_test_project_id"}
+	c.Check(setCLIVariables(&bp, inv), ErrorMatches, "invalid format: .*")
 
-	c.Assert(setCLIVariables(&bp, inv), ErrorMatches, "invalid format: .*")
-	c.Check(bp.Vars, DeepEquals, config.Dict{})
+	// Failure: Unmarshalable value
+	bp = config.Blueprint{}
+	inv = []string{"pyrite={gold"}
+	c.Check(setCLIVariables(&bp, inv), ErrorMatches, ".*unable to convert.*pyrite.*gold.*")
 }
 
 func (s *MySuite) TestSetBackendConfig(c *C) {
diff --git a/pkg/config/config.go b/pkg/config/config.go
index 0b733cc015..6e29e579ea 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -478,7 +478,7 @@ func validateBlueprint(bp Blueprint) error {
 
 // SkipValidator marks validator(s) as skipped,
 // if no
validator is present, adds one, marked as skipped. -func (dc *DeploymentConfig) SkipValidator(name string) error { +func (dc *DeploymentConfig) SkipValidator(name string) { if dc.Config.Validators == nil { dc.Config.Validators = []Validator{} } @@ -492,7 +492,6 @@ func (dc *DeploymentConfig) SkipValidator(name string) error { if !skipped { dc.Config.Validators = append(dc.Config.Validators, Validator{Validator: name, Skip: true}) } - return nil } // InputValueError signifies a problem with the blueprint name. diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index a04ff26a24..9cc729f9a3 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -730,14 +730,14 @@ func (s *zeroSuite) TestCheckBackends(c *C) { func (s *zeroSuite) TestSkipValidator(c *C) { { dc := DeploymentConfig{Config: Blueprint{Validators: nil}} - c.Check(dc.SkipValidator("zebra"), IsNil) + dc.SkipValidator("zebra") c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "zebra", Skip: true}}) } { dc := DeploymentConfig{Config: Blueprint{Validators: []Validator{ {Validator: "pony"}}}} - c.Check(dc.SkipValidator("zebra"), IsNil) + dc.SkipValidator("zebra") c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "pony"}, {Validator: "zebra", Skip: true}}) @@ -746,7 +746,7 @@ func (s *zeroSuite) TestSkipValidator(c *C) { dc := DeploymentConfig{Config: Blueprint{Validators: []Validator{ {Validator: "pony"}, {Validator: "zebra"}}}} - c.Check(dc.SkipValidator("zebra"), IsNil) + dc.SkipValidator("zebra") c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "pony"}, {Validator: "zebra", Skip: true}}) @@ -755,7 +755,7 @@ func (s *zeroSuite) TestSkipValidator(c *C) { dc := DeploymentConfig{Config: Blueprint{Validators: []Validator{ {Validator: "pony"}, {Validator: "zebra", Skip: true}}}} - c.Check(dc.SkipValidator("zebra"), IsNil) + dc.SkipValidator("zebra") c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "pony"}, {Validator: "zebra", Skip: true}}) @@ -765,7 +765,7 @@ func (s *zeroSuite) TestSkipValidator(c *C) { {Validator: "zebra"}, {Validator: "pony"}, {Validator: "zebra"}}}} - c.Check(dc.SkipValidator("zebra"), IsNil) + dc.SkipValidator("zebra") c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "zebra", Skip: true}, {Validator: "pony"}, From 270d5f3e7fea29ac9d62d3a3aa5dc8dd2e75b39d Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 25 Jan 2024 20:19:34 -0800 Subject: [PATCH 093/151] Update Slurm image 6.1 -> 6.3 (#2169) --- .../schedmd-slurm-gcp-v6-nodeset-tpu/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset/README.md | 2 +- .../source_image_logic.tf | 12 ++++++------ .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../source_image_logic.tf | 12 ++++++------ .../variables_controller_instance.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-login/README.md | 2 +- .../schedmd-slurm-gcp-v6-login/source_image_logic.tf | 12 ++++++------ .../schedmd-slurm-gcp-v6-login/variables.tf | 2 +- 11 files changed, 26 insertions(+), 26 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md index ac5dd7a4b3..b1353aee7e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -60,7 +60,7 @@ No resources. 
| [accelerator\_config](#input\_accelerator\_config) | Nodeset accelerator config, see https://cloud.google.com/tpu/docs/supported-tpu-configurations for details. |
object({
topology = string
version = string
})
|
{
"topology": "",
"version": ""
}
| no | | [data\_disks](#input\_data\_disks) | The data disks to include in the TPU node | `list(string)` | `[]` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | If set to false. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | -| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-1-tf- | `string` | `null` | no | +| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-3-tf- | `string` | `null` | no | | [name](#input\_name) | Name of the nodeset tpu. | `string` | `"ghpc"` | no | | [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of dynamic nodes allowed in this partition. | `number` | `1` | no | | [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf index a23e966c56..323ed8f655 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf @@ -96,7 +96,7 @@ variable "data_disks" { } variable "docker_image" { - description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-1-tf-" + description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-3-tf-" type = string default = null } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index f3fc27d3c1..d7f0ee21b0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -170,7 +170,7 @@ No modules. | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-1-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-3-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf index 8759a268cc..532749e7ba 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-1-debian-11", - "slurm-gcp-6-1-hpc-rocky-linux-8", - "slurm-gcp-6-1-ubuntu-2004-lts", - "slurm-gcp-6-1-ubuntu-2204-lts-arm64", - "slurm-gcp-6-1-hpc-centos-7-k80", - "slurm-gcp-6-1-hpc-centos-7" + "slurm-gcp-6-3-debian-11", + "slurm-gcp-6-3-hpc-rocky-linux-8", + "slurm-gcp-6-3-ubuntu-2004-lts", + "slurm-gcp-6-3-ubuntu-2204-lts-arm64", + "slurm-gcp-6-3-hpc-centos-7-k80", + "slurm-gcp-6-3-hpc-centos-7" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 9f0313696d..3a88644b08 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -76,7 +76,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-1-hpc-rocky-linux-8" + family = "slurm-gcp-6-3-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 33b371f149..b18cbffb98 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -185,7 +185,7 @@ limitations under the License. | [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [extra\_logging\_flags](#input\_extra\_logging\_flags) | The list of extra flags for the logging system to use. See the logging\_flags variable in scripts/util.py to get the list of supported log flags. | `map(bool)` | `{}` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-1-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-3-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf index 8759a268cc..532749e7ba 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-1-debian-11", - "slurm-gcp-6-1-hpc-rocky-linux-8", - "slurm-gcp-6-1-ubuntu-2004-lts", - "slurm-gcp-6-1-ubuntu-2204-lts-arm64", - "slurm-gcp-6-1-hpc-centos-7-k80", - "slurm-gcp-6-1-hpc-centos-7" + "slurm-gcp-6-3-debian-11", + "slurm-gcp-6-3-hpc-rocky-linux-8", + "slurm-gcp-6-3-ubuntu-2004-lts", + "slurm-gcp-6-3-ubuntu-2204-lts-arm64", + "slurm-gcp-6-3-hpc-centos-7-k80", + "slurm-gcp-6-3-hpc-centos-7" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf index 9a3f937557..df013e93c8 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf @@ -244,7 +244,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-1-hpc-rocky-linux-8" + family = "slurm-gcp-6-3-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index ceca9d9365..682a4c2a68 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -97,7 +97,7 @@ No modules. | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-1-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-3-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf index 8759a268cc..532749e7ba 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-1-debian-11", - "slurm-gcp-6-1-hpc-rocky-linux-8", - "slurm-gcp-6-1-ubuntu-2004-lts", - "slurm-gcp-6-1-ubuntu-2204-lts-arm64", - "slurm-gcp-6-1-hpc-centos-7-k80", - "slurm-gcp-6-1-hpc-centos-7" + "slurm-gcp-6-3-debian-11", + "slurm-gcp-6-3-hpc-rocky-linux-8", + "slurm-gcp-6-3-ubuntu-2004-lts", + "slurm-gcp-6-3-ubuntu-2204-lts-arm64", + "slurm-gcp-6-3-hpc-centos-7-k80", + "slurm-gcp-6-3-hpc-centos-7" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf index f628791750..2d8b59e2b1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf @@ -276,7 +276,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-1-hpc-rocky-linux-8" + family = "slurm-gcp-6-3-hpc-rocky-linux-8" project = "schedmd-slurm-public" } From 1eb9c89fea65bafee1c54ca2d0296665eaef951a Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Fri, 26 Jan 2024 06:07:53 +0000 Subject: [PATCH 094/151] Add login node in the spack openfoam tutorial example --- docs/tutorials/openfoam/spack-openfoam.md | 23 +++++++++++---------- docs/tutorials/openfoam/spack-openfoam.yaml | 15 +++++++++++--- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/docs/tutorials/openfoam/spack-openfoam.md b/docs/tutorials/openfoam/spack-openfoam.md index 2fc4d51387..fa3a29fd73 100644 --- a/docs/tutorials/openfoam/spack-openfoam.md +++ b/docs/tutorials/openfoam/spack-openfoam.md @@ -84,6 +84,7 @@ This file describes the cluster you will deploy. It defines: * sets up a Spack environment including downloading an example input deck * places a submission script on a shared drive * a Slurm cluster + * a Slurm login node * a Slurm controller * An auto-scaling Slurm partition @@ -144,21 +145,21 @@ the final output from the above command: Optionally while you wait, you can see your deployed VMs on Google Cloud Console. Open the link below in a new window. Look for -`spackopenf-controller`. If you don't +`spackopenf-controller` and `spackopenf-login-login-001`. If you don't see your VMs make sure you have the correct project selected (top left). ```text https://console.cloud.google.com/compute?project= ``` -## Connecting to the controller node +## Connecting to the login node -Once the startup script has completed, connect to the controller node. +Once the startup script has completed, connect to the login node. -Use the following command to ssh into the controller node from cloud shell: +Use the following command to ssh into the login node from cloud shell: ```bash -gcloud compute ssh spackopenf-controller --zone us-central1-c --project +gcloud compute ssh spackopenf-login-login-001 --zone us-central1-c --project ``` You may be prompted to set up SSH. 
If so follow the prompts and if asked for a @@ -182,15 +183,15 @@ following instructions: https://console.cloud.google.com/compute?project= ``` -1. Click on the `SSH` button associated with the `spackopenf-controller` +1. Click on the `SSH` button associated with the `spackopenf-login-login-001` instance. This will open a separate pop up window with a terminal into our newly - created Slurm controller VM. + created Slurm login VM. ## Run a Job on the Cluster - **The commands below should be run on the Slurm controller node.** + **The commands below should be run on the Slurm login node.** We will use the submission script (see line 122 of the blueprint) to submit a OpenFOAM job. @@ -238,7 +239,7 @@ about 5 minutes to run. Several files will have been generated in the `test_run/` folder you created. The `slurm-1.out` file has information on the run such as performance. You can -view this file by running the following command on the controller node: +view this file by running the following command on the login node: ```bash cat slurm-*.out @@ -259,9 +260,9 @@ https://console.cloud.google.com/monitoring/dashboards?project= **_NOTE:_** If you are accessing the controller node terminal via a separate pop-up +> **_NOTE:_** If you are accessing the login node terminal via a separate pop-up > then make sure to call `exit` in the pop-up window. ```bash diff --git a/docs/tutorials/openfoam/spack-openfoam.yaml b/docs/tutorials/openfoam/spack-openfoam.yaml index bd2ec7dc70..5b6635ff36 100644 --- a/docs/tutorials/openfoam/spack-openfoam.yaml +++ b/docs/tutorials/openfoam/spack-openfoam.yaml @@ -110,7 +110,7 @@ deployment_groups: spack install fi - - id: controller-setup + - id: login-setup source: modules/scripts/startup-script settings: runners: @@ -170,12 +170,21 @@ deployment_groups: partition_name: compute is_default: true + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network1] + settings: + name_prefix: login + machine_type: n2-standard-4 + disable_login_public_ips: false + - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network1 - compute_partition + - slurm_login settings: - controller_startup_script: $(controller-setup.startup_script) - controller_startup_scripts_timeout: 21600 + login_startup_script: $(login-setup.startup_script) + login_startup_scripts_timeout: 21600 disable_controller_public_ips: false From c21751b9c7e9af9dfe1fbbf1a1360fa89ead07c4 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 26 Jan 2024 10:57:37 -0800 Subject: [PATCH 095/151] Update Toolkit docs to point to GCP Slurm fork --- docs/vm-images.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/vm-images.md b/docs/vm-images.md index 5c95f9759e..8989c43953 100644 --- a/docs/vm-images.md +++ b/docs/vm-images.md @@ -315,8 +315,8 @@ These instructions apply to the following modules: [slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/v5 [slurm-gcp-packer]: https://github.com/SchedMD/slurm-gcp/tree/v5/packer -[slurm-gcp-images]: https://github.com/SchedMD/slurm-gcp/blob/v5/docs/images.md -[slurm-gcp-published-images]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/docs/images.md#published-image-family +[slurm-gcp-images]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md +[slurm-gcp-published-images]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family [gcloud-compute-images]: 
https://cloud.google.com/sdk/gcloud/reference/compute/images/create [vm-instance]: ../modules/compute/vm-instance From 4c50295b0028317f5bf69d8e8a4c07394a64a871 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 26 Jan 2024 11:01:55 -0800 Subject: [PATCH 096/151] Fix: added new variables to ml-slurm integration test --- tools/cloud-build/daily-tests/tests/ml-slurm.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/cloud-build/daily-tests/tests/ml-slurm.yml b/tools/cloud-build/daily-tests/tests/ml-slurm.yml index d003e45429..459ee4a565 100644 --- a/tools/cloud-build/daily-tests/tests/ml-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/ml-slurm.yml @@ -18,3 +18,5 @@ test_name: ml-slurm deployment_name: ml-slurm-{{ build }} workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/ml-slurm.yaml" +packer_group_name: packer +packer_module_id: custom-image From 0d2aed59258acc6b565528e24a91a449e673ca15 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Thu, 25 Jan 2024 22:04:48 +0000 Subject: [PATCH 097/151] Update slurm references --- .../schedmd-slurm-gcp-v6-controller/README.md | 22 +++++++++---------- .../controller.tf | 8 +++---- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../partition.tf | 8 +++---- .../slurm_files.tf | 2 +- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 33b371f149..dd18f5851f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -125,17 +125,17 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 3.0 | -| [cleanup\_compute\_nodes](#module\_cleanup\_compute\_nodes) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes | 6.3.1_20240118 | -| [cleanup\_resource\_policies](#module\_cleanup\_resource\_policies) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies | 6.3.1_20240118 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.3.1_20240118 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.1_20240118 | -| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.3.1_20240118 | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 6.3.1_20240118 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.1_20240118 | -| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.3.1_20240118 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.1_20240118 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | 
github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.3.1_20240118 | -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 6.3.1_20240118 | +| [cleanup\_compute\_nodes](#module\_cleanup\_compute\_nodes) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes | 6.3.2 | +| [cleanup\_resource\_policies](#module\_cleanup\_resource\_policies) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies | 6.3.2 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.3.2 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.2 | +| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.3.2 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 6.3.2 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.2 | +| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.3.2 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.2 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.3.2 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 6.3.2 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 5db82e74da..e1440f3c7c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -35,7 +35,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.1_20240118" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.2" count = local.have_template ? 0 : 1 project_id = var.project_id @@ -92,7 +92,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.3.1_20240118" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.3.2" access_config = !var.disable_controller_public_ips ? 
[local.access_config] : [] add_hostname_suffix = false @@ -148,7 +148,7 @@ resource "google_secret_manager_secret_iam_member" "cloudsql_secret_accessor" { # Destroy all compute nodes on `terraform destroy` module "cleanup_compute_nodes" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes?ref=6.3.1_20240118" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes?ref=6.3.2" count = var.enable_cleanup_compute ? 1 : 0 slurm_cluster_name = local.slurm_cluster_name @@ -164,7 +164,7 @@ module "cleanup_compute_nodes" { # Destroy all resource policies on `terraform destroy` module "cleanup_resource_policies" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies?ref=6.3.1_20240118" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies?ref=6.3.2" count = var.enable_cleanup_compute ? 1 : 0 slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 594e96184c..0d51129631 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.1_20240118" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.2" for_each = { for x in var.login_nodes : x.name_prefix => x @@ -59,7 +59,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=6.3.1_20240118" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=6.3.2" for_each = { for x in var.login_nodes : x.name_prefix => x } project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 0e0ab5a471..539205d08e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -21,7 +21,7 @@ locals { # NODESET module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.1_20240118" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.2" for_each = local.nodeset_map project_id = var.project_id @@ -60,7 +60,7 @@ module "slurm_nodeset_template" { } module "slurm_nodeset" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.3.1_20240118" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.3.2" for_each = local.nodeset_map instance_template_self_link = module.slurm_nodeset_template[each.key].self_link @@ -79,7 +79,7 @@ module "slurm_nodeset" { # NODESET TPU module "slurm_nodeset_tpu" { - source = 
"github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.3.1_20240118" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.3.2" for_each = local.nodeset_tpu_map project_id = var.project_id @@ -101,7 +101,7 @@ module "slurm_nodeset_tpu" { # PARTITION module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=6.3.1_20240118" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=6.3.2" for_each = local.partition_map partition_nodeset = [for x in each.value.partition_nodeset : module.slurm_nodeset[x].nodeset_name if try(module.slurm_nodeset[x], null) != null] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index abadd4d454..f20d884937 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -87,7 +87,7 @@ locals { } module "slurm_files" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.3.1_20240118" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.3.2" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name From 071d11df6c86327e0372c89d92f1b687f7e55743 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Fri, 26 Jan 2024 20:57:50 +0000 Subject: [PATCH 098/151] Update CloudSQL blueprint to v6 --- .../slurm-cloudsql-federation/metadata.yaml | 1 + .../controller.tf | 2 +- .../hpc-cluster-simple-nfs-sql.yaml | 26 +++++++++++-------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/community/modules/database/slurm-cloudsql-federation/metadata.yaml b/community/modules/database/slurm-cloudsql-federation/metadata.yaml index 0db2ea4503..fc0cae0859 100644 --- a/community/modules/database/slurm-cloudsql-federation/metadata.yaml +++ b/community/modules/database/slurm-cloudsql-federation/metadata.yaml @@ -18,3 +18,4 @@ spec: services: - bigqueryconnection.googleapis.com - sqladmin.googleapis.com + - servicenetworking.googleapis.com diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index e1440f3c7c..55337b4ec4 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -142,7 +142,7 @@ resource "google_secret_manager_secret_iam_member" "cloudsql_secret_accessor" { secret_id = google_secret_manager_secret.cloudsql[0].id role = "roles/secretmanager.secretAccessor" - member = "serviceAccount:${local.service_account[0].email}" + member = "serviceAccount:${local.service_account.email}" } diff --git a/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml b/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml index 9c3015c9a8..a6d7001fd5 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml @@ -29,45 +29,49 @@ deployment_groups: source: modules/network/vpc - id: homefs - source: ./community/modules/file-system/nfs-server + source: 
community/modules/file-system/nfs-server use: [network1] settings: labels: ghpc_role: storage-home - id: slurm-sql - source: ./community/modules/database/slurm-cloudsql-federation + source: community/modules/database/slurm-cloudsql-federation use: [network1] settings: sql_instance_name: slurm-sql8 tier: "db-f1-micro" + - id: compute-nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network1] + settings: + node_count_dynamic_max: 20 + machine_type: c2-standard-4 + - id: compute-partition - source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: - homefs - - network1 + - compute-nodeset settings: partition_name: compute - max_node_count: 20 - machine_type: c2-standard-4 - id: slurm-controller - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - homefs - compute-partition + - slurm-login - slurm-sql - network1 settings: - login_node_count: 1 - disable_compute_public_ips: true disable_controller_public_ips: true - id: slurm-login - source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login use: - - slurm-controller - network1 settings: + name_prefix: login disable_login_public_ips: true From dc56286e9288569201c5724e29eee62781b0b010 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 26 Jan 2024 23:06:16 +0000 Subject: [PATCH 099/151] Ensure Windows VMs start HTCondor only after successful secret download - this enables Managed Instance Group health checks to mark the node unhealthy for deletion --- .../templates/download-condor-config.ps1.tftpl | 5 +++++ .../htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl | 5 +++++ .../htcondor-install/templates/install-htcondor.ps1.tftpl | 5 +++++ 3 files changed, 15 insertions(+) diff --git a/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl b/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl index d31bdf2faa..c69d6022c6 100644 --- a/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl +++ b/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl @@ -18,3 +18,8 @@ if ($local_hash -cne $remote_hash) { gcloud storage cp ${config_object} $config_file Restart-Service condor } + +# ignored if service is already running; must be here to handle case where +# machine is rebooted, but configuration has previously been downloaded +# and service is disabled from automatic start +Start-Service condor diff --git a/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl b/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl index ec6773053c..838d268162 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl +++ b/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl @@ -1,3 +1,6 @@ +Set-StrictMode -Version latest +$ErrorActionPreference = 'Stop' + $config_dir = 'C:\Condor\config' if(!(test-path -PathType container -Path $config_dir)) { New-Item -ItemType Directory -Path $config_dir @@ -15,3 +18,5 @@ Set-Content -Path "$config_file" -Value "$config_string" # obtain IDTOKEN for authentication by StartD to Central Manager gcloud secrets versions access latest --secret ${xp_idtoken_secret_id} ` 
--out-file C:\condor\tokens.d\condor@${trust_domain} + +if ($LASTEXITCODE -ne 0) { throw "Could not download HTCondor IDTOKEN; exiting startup script" } diff --git a/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl b/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl index 54b6d35bba..79941524dd 100644 --- a/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl +++ b/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl @@ -29,12 +29,17 @@ $args=$args + ' INSTALLDIR="C:\Condor"' Start-Process "msiexec.exe" -Wait -ArgumentList "$args" Remove-Item "$htcondor_installer" +# do not start HTCondor on boot by default. Allow startup script to download +# configuration first and then start HTCondor +Set-Service -StartupType Manual condor + # remove settings from condor_config that we want to override in configuration step Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^CONDOR_HOST' -NotMatch) Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^INSTALL_USER' -NotMatch) Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^DAEMON_LIST' -NotMatch) Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^use SECURITY' -NotMatch) +# install Python so that custom ClassAd hooks can execute $python_installer = 'C:\python-installer.exe' Invoke-WebRequest -Uri "https://www.python.org/ftp/python/3.11.4/python-3.11.4-amd64.exe" -OutFile "$python_installer" Start-Process -FilePath "$python_installer" -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1 Include_test=0' From 711eeeb210cfc5baeff92d1997d88dc5704d37e7 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Fri, 26 Jan 2024 23:11:26 +0000 Subject: [PATCH 100/151] Updated legacy-sharedvpc reference naming to sharedvpc --- ...slurm-legacy-sharedvpc.yaml => hpc-slurm-sharedvpc.yaml} | 0 examples/README.md | 6 +++--- 2 files changed, 3 insertions(+), 3 deletions(-) rename community/examples/{hpc-slurm-legacy-sharedvpc.yaml => hpc-slurm-sharedvpc.yaml} (100%) diff --git a/community/examples/hpc-slurm-legacy-sharedvpc.yaml b/community/examples/hpc-slurm-sharedvpc.yaml similarity index 100% rename from community/examples/hpc-slurm-legacy-sharedvpc.yaml rename to community/examples/hpc-slurm-sharedvpc.yaml diff --git a/examples/README.md b/examples/README.md index 0038bad968..05da25a443 100644 --- a/examples/README.md +++ b/examples/README.md @@ -25,7 +25,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [pfs-daos.yaml](#pfs-daosyaml-) ![community-badge] * [hpc-slurm-daos.yaml](#hpc-slurm-daosyaml-) ![community-badge] * [hpc-amd-slurm.yaml](#hpc-amd-slurmyaml-) ![community-badge] - * [hpc-slurm-legacy-sharedvpc.yaml](#hpc-slurm-legacy-sharedvpcyaml-) ![community-badge] + * [hpc-slurm-sharedvpc.yaml](#hpc-slurm-sharedvpcyaml-) ![community-badge] * [client-google-cloud-storage.yaml](#client-google-cloud-storageyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-gromacs.yaml](#hpc-slurm-gromacsyaml--) ![community-badge] ![experimental-badge] * [omnia-cluster.yaml](#omnia-clusteryaml--) ![community-badge] ![experimental-badge] @@ -977,7 +977,7 @@ See [README](../community/examples/flux-framework/README.md) [flux-cluster.yaml]: 
../community/examples/flux-framework/flux-cluster.yaml
 
-### [hpc-slurm-legacy-sharedvpc.yaml] ![community-badge]
+### [hpc-slurm-sharedvpc.yaml] ![community-badge]
 
 This blueprint demonstrates the use of the Slurm and Filestore modules in the
 service project of an existing Shared VPC. Before attempting to deploy the
@@ -989,7 +989,7 @@ the controller and login nodes. Also since this blueprint doesn't use external
 IPs for compute nodes, one needs to [set up cloud nat][cloudnat] and
 [set up iap][iap].
 
-[hpc-slurm-legacy-sharedvpc.yaml]: ../community/examples/hpc-slurm-legacy-sharedvpc.yaml
+[hpc-slurm-sharedvpc.yaml]: ../community/examples/hpc-slurm-sharedvpc.yaml
 [fs-shared-vpc]: https://cloud.google.com/filestore/docs/shared-vpc
 
 ## Blueprint Schema

From 5b43060706d205da4fc63974f6711004d1bfc583 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 29 Jan 2024 11:13:52 +0000
Subject: [PATCH 101/151] Bump github.com/zclconf/go-cty from 1.14.1 to 1.14.2

Bumps [github.com/zclconf/go-cty](https://github.com/zclconf/go-cty) from 1.14.1 to 1.14.2.
- [Release notes](https://github.com/zclconf/go-cty/releases)
- [Changelog](https://github.com/zclconf/go-cty/blob/main/CHANGELOG.md)
- [Commits](https://github.com/zclconf/go-cty/compare/v1.14.1...v1.14.2)

---
updated-dependencies:
- dependency-name: github.com/zclconf/go-cty
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 go.mod | 2 +-
 go.sum | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/go.mod b/go.mod
index 8df4663595..410f74ac3f 100644
--- a/go.mod
+++ b/go.mod
@@ -14,7 +14,7 @@ require (
 	github.com/pkg/errors v0.9.1
 	github.com/spf13/afero v1.11.0
 	github.com/spf13/cobra v1.8.0
-	github.com/zclconf/go-cty v1.14.1
+	github.com/zclconf/go-cty v1.14.2
 	golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa
 	google.golang.org/genproto v0.0.0-20240102182953-50ed04b92917 // indirect
 	gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c
diff --git a/go.sum b/go.sum
index b561ae67b4..8a06adcda7 100644
--- a/go.sum
+++ b/go.sum
@@ -493,8 +493,8 @@ github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9dec
 github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
 github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
 github.com/zclconf/go-cty v1.2.0/go.mod h1:hOPWgoHbaTUnI5k4D2ld+GRpFJSCe6bCM7m1q/N4PQ8=
-github.com/zclconf/go-cty v1.14.1 h1:t9fyA35fwjjUMcmL5hLER+e/rEPqrbCK1/OSE4SI9KA=
-github.com/zclconf/go-cty v1.14.1/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE=
+github.com/zclconf/go-cty v1.14.2 h1:kTG7lqmBou0Zkx35r6HJHUQTvaRPr5bIAf3AoHS0izI=
+github.com/zclconf/go-cty v1.14.2/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE=
 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b h1:FosyBZYxY34Wul7O/MSKey3txpPYyCqVO5ZyceuQJEI=
 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b/go.mod h1:ZRKQfBXbGkpdV6QMzT3rU1kSTAnfu1dO8dPKjYprgj8=
 go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=

From c014b64ab0d0365a53b15f2cad6fac86493a1364 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 29 Jan 2024 11:14:16 +0000
Subject: [PATCH 102/151] Bump google.golang.org/api from 0.157.0 to 0.159.0

Bumps [google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.157.0 to 0.159.0. 
- [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.157.0...v0.159.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 14 +++++++------- go.sum | 28 ++++++++++++++-------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/go.mod b/go.mod index 8df4663595..6f17c57f8e 100644 --- a/go.mod +++ b/go.mod @@ -28,7 +28,7 @@ require ( github.com/hashicorp/terraform-exec v0.20.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b - google.golang.org/api v0.157.0 + google.golang.org/api v0.159.0 ) require ( @@ -37,18 +37,18 @@ require ( github.com/cyphar/filepath-securejoin v0.2.4 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/felixge/httpsnoop v1.0.4 // indirect - github.com/go-logr/logr v1.3.0 // indirect + github.com/go-logr/logr v1.4.1 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/googleapis/gax-go/v2 v2.12.0 // indirect github.com/hashicorp/terraform-json v0.19.0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/rogpeppe/go-internal v1.11.0 // indirect - go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.1 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.46.1 // indirect - go.opentelemetry.io/otel v1.21.0 // indirect - go.opentelemetry.io/otel/metric v1.21.0 // indirect - go.opentelemetry.io/otel/trace v1.21.0 // indirect + go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.47.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.47.0 // indirect + go.opentelemetry.io/otel v1.22.0 // indirect + go.opentelemetry.io/otel/metric v1.22.0 // indirect + go.opentelemetry.io/otel/trace v1.22.0 // indirect golang.org/x/mod v0.14.0 // indirect golang.org/x/sync v0.6.0 // indirect golang.org/x/time v0.5.0 // indirect diff --git a/go.sum b/go.sum index b561ae67b4..4758830494 100644 --- a/go.sum +++ b/go.sum @@ -269,8 +269,8 @@ github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9 github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.3.0 h1:2y3SDp0ZXuc6/cjLSZ+Q3ir+QB9T/iG5yYRXqsagWSY= -github.com/go-logr/logr v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= +github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-test/deep v1.0.3 h1:ZrJSEWsXzPOxaZnFteGEfooLba+ju3FYIbOrS+rQd68= @@ -506,17 +506,17 @@ go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= go.opencensus.io v0.23.0/go.mod 
h1:XItmlyltB5F7CS4xOC1DcqMoFqwtC6OG2xF7mCv7P7E= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.1 h1:SpGay3w+nEwMpfVnbqOLH5gY52/foP8RE8UzTZ1pdSE= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.1/go.mod h1:4UoMYEZOC0yN/sPGH76KPkkU7zgiEWYWL9vwmbnTJPE= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.46.1 h1:aFJWCqJMNjENlcleuuOkGAPH82y0yULBScfXcIEdS24= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.46.1/go.mod h1:sEGXWArGqc3tVa+ekntsN65DmVbVeW+7lTKTjZF3/Fo= -go.opentelemetry.io/otel v1.21.0 h1:hzLeKBZEL7Okw2mGzZ0cc4k/A7Fta0uoPgaJCr8fsFc= -go.opentelemetry.io/otel v1.21.0/go.mod h1:QZzNPQPm1zLX4gZK4cMi+71eaorMSGT3A4znnUvNNEo= -go.opentelemetry.io/otel/metric v1.21.0 h1:tlYWfeo+Bocx5kLEloTjbcDwBuELRrIFxwdQ36PlJu4= -go.opentelemetry.io/otel/metric v1.21.0/go.mod h1:o1p3CA8nNHW8j5yuQLdc1eeqEaPfzug24uvsyIEJRWM= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.47.0 h1:UNQQKPfTDe1J81ViolILjTKPr9WetKW6uei2hFgJmFs= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.47.0/go.mod h1:r9vWsPS/3AQItv3OSlEJ/E4mbrhUbbw18meOjArPtKQ= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.47.0 h1:sv9kVfal0MK0wBMCOGr+HeJm9v803BkJxGrk2au7j08= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.47.0/go.mod h1:SK2UL73Zy1quvRPonmOmRDiWk1KBV3LyIeeIxcEApWw= +go.opentelemetry.io/otel v1.22.0 h1:xS7Ku+7yTFvDfDraDIJVpw7XPyuHlB9MCiqqX5mcJ6Y= +go.opentelemetry.io/otel v1.22.0/go.mod h1:eoV4iAi3Ea8LkAEI9+GFT44O6T/D0GWAVFyZVCC6pMI= +go.opentelemetry.io/otel/metric v1.22.0 h1:lypMQnGyJYeuYPhOM/bgjbFM6WE44W1/T45er4d8Hhg= +go.opentelemetry.io/otel/metric v1.22.0/go.mod h1:evJGjVpZv0mQ5QBRJoBF64yMuOf4xCWdXjK8pzFvliY= go.opentelemetry.io/otel/sdk v1.19.0 h1:6USY6zH+L8uMH8L3t1enZPR3WFEmSTADlqldyHtJi3o= -go.opentelemetry.io/otel/trace v1.21.0 h1:WD9i5gzvoUPuXIXH24ZNBudiarZDKuekPqi/E8fpfLc= -go.opentelemetry.io/otel/trace v1.21.0/go.mod h1:LGbsEB0f9LGjN+OZaQQ26sohbOmiMR+BaslueVtS/qQ= +go.opentelemetry.io/otel/trace v1.22.0 h1:Hg6pPujv0XG9QaVbGOBVHunyuLcCC3jN7WEhPx83XD0= +go.opentelemetry.io/otel/trace v1.22.0/go.mod h1:RbbHXVqKES9QhzZq/fE5UnOSILqRt40a21sPw2He1xo= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -880,8 +880,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.157.0 h1:ORAeqmbrrozeyw5NjnMxh7peHO0UzV4wWYSwZeCUb20= -google.golang.org/api v0.157.0/go.mod h1:+z4v4ufbZ1WEpld6yMGHyggs+PmAHiaLNj5ytP3N01g= +google.golang.org/api v0.159.0 h1:fVTj+7HHiUYz4JEZCHHoRIeQX7h5FMzrA2RF/DzDdbs= +google.golang.org/api v0.159.0/go.mod h1:0mu0TpK33qnydLvWqbImq2b1eQ5FHRSDCBzAxX9ZHyw= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod 
h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=

From c562370773d4a572d48a287b6c50135a97a7d518 Mon Sep 17 00:00:00 2001
From: Carson Dunbar
Date: Mon, 29 Jan 2024 14:36:06 +0000
Subject: [PATCH 103/151] Modified validation message to be more clear

---
 .../scheduler/schedmd-slurm-gcp-v5-controller/variables.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf
index f251c20765..9f3c5810ed 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf
@@ -131,7 +131,7 @@ variable "login_startup_scripts_timeout" {
 
   validation {
     condition     = var.login_startup_scripts_timeout == 300
-    error_message = "Changes to login_startup_scripts_timeout (default: 300s) are not respected, this will be fixed in a later release"
+    error_message = "Changes to login_startup_scripts_timeout (default: 300s) are not respected, this is a known issue that will be fixed in a later release"
   }
 }
 
From 20a3014166a0e2f73447883cd54d109ab36762b6 Mon Sep 17 00:00:00 2001
From: Tom Downes
Date: Mon, 29 Jan 2024 10:56:45 -0600
Subject: [PATCH 104/151] Remove project_id from image building example

---
 community/examples/hpc-build-slurm-image.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml
index 6a82fe8d9f..9facb79a3f 100644
--- a/community/examples/hpc-build-slurm-image.yaml
+++ b/community/examples/hpc-build-slurm-image.yaml
@@ -15,7 +15,7 @@
 blueprint_name: hpc-build-slurm-image
 
 vars:
-  project_id: ns-playground-2023-01-19 ## Set GCP Project ID Here ##
+  project_id: ## Set GCP Project ID Here ##
   deployment_name: build-slurm-1
   region: us-central1
   zone: us-central1-a

From e9f6607249bbea69661a69ca90bdbc9e49e80b3b Mon Sep 17 00:00:00 2001
From: Tom Downes
Date: Mon, 29 Jan 2024 11:10:33 -0600
Subject: [PATCH 105/151] Reduce size of image builder to be compatible with initial projects

---
 community/examples/hpc-build-slurm-image.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml
index 9facb79a3f..b8a1cf8888 100644
--- a/community/examples/hpc-build-slurm-image.yaml
+++ b/community/examples/hpc-build-slurm-image.yaml
@@ -20,7 +20,7 @@ vars:
   region: us-central1
   zone: us-central1-a
 
-  image_build_machine_type: n2d-standard-32
+  image_build_machine_type: n2d-standard-16
   build_from_image_family: hpc-rocky-linux-8
   build_from_image_project: cloud-hpc-image-public
   built_image_family: my-custom-slurm

From 511e80cea8b8c6460ca931f8aed975ea090fd081 Mon Sep 17 00:00:00 2001
From: Harsh Thakkar
Date: Mon, 15 Jan 2024 23:21:33 +0000
Subject: [PATCH 106/151] Remove Slurm V4 modules and add note to use and reference V4 modules and examples

---
 .../SchedMD-slurm-on-gcp-partition/README.md | 98 ------
 .../metadata.yaml | 19 --
 .../SchedMD-slurm-on-gcp-partition/outputs.tf | 58 ----
 .../variables.tf | 184 -----------
 .../versions.tf | 19 --
 .../SchedMD-slurm-on-gcp-controller/README.md | 132 --------
 .../SchedMD-slurm-on-gcp-controller/main.tf | 90 ------
 .../metadata.yaml | 19 --
 .../outputs.tf | 20 --
 .../variables.tf | 294 ------------------
 .../versions.tf | 23 -- 
.../SchedMD-slurm-on-gcp-login-node/README.md | 119 ------- .../SchedMD-slurm-on-gcp-login-node/main.tf | 62 ---- .../metadata.yaml | 19 -- .../variables.tf | 186 ----------- .../versions.tf | 23 -- docs/gpu-support.md | 13 +- docs/slurm-troubleshooting.md | 8 +- modules/README.md | 17 +- modules/scripts/startup-script/README.md | 6 +- pkg/modulereader/metadata_legacy.go | 10 +- 21 files changed, 20 insertions(+), 1399 deletions(-) delete mode 100644 community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md delete mode 100644 community/modules/compute/SchedMD-slurm-on-gcp-partition/metadata.yaml delete mode 100644 community/modules/compute/SchedMD-slurm-on-gcp-partition/outputs.tf delete mode 100644 community/modules/compute/SchedMD-slurm-on-gcp-partition/variables.tf delete mode 100644 community/modules/compute/SchedMD-slurm-on-gcp-partition/versions.tf delete mode 100644 community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md delete mode 100644 community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf delete mode 100644 community/modules/scheduler/SchedMD-slurm-on-gcp-controller/metadata.yaml delete mode 100644 community/modules/scheduler/SchedMD-slurm-on-gcp-controller/outputs.tf delete mode 100644 community/modules/scheduler/SchedMD-slurm-on-gcp-controller/variables.tf delete mode 100644 community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf delete mode 100644 community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md delete mode 100644 community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/main.tf delete mode 100644 community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/metadata.yaml delete mode 100644 community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/variables.tf delete mode 100644 community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf diff --git a/community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md b/community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md deleted file mode 100644 index e27e937920..0000000000 --- a/community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md +++ /dev/null @@ -1,98 +0,0 @@ -## Description - -> **Warning**: this module is now deprecated. We recommend using the Slurm on GCP V5 -> [schedmd-slurm-gcp-v5-partition](../schedmd-slurm-gcp-v5-partition/README.md) and -> [schedmd-slurm-gcp-v5-node-group](../schedmd-slurm-gcp-v5-node-group/README.md) instead. - -This module creates a compute partition that be can used as input to -[SchedMD-slurm-on-gcp-controller](../../scheduler/SchedMD-slurm-on-gcp-controller/README.md). - -> **Warning**: updating a partition will not cause the slurm controller to -> update its configurations. In other words, it will not update an already -> deployed Slurm cluster. 
- -### Example - -The following code snippet creates a partition module with: - -* a max node count of 200 -* VM machine type of `c2-standard-30` -* partition name of "compute" -* connected to the `network1` module via `use` -* Mounted to homefs via `use` - -```yaml -- id: compute_partition - source: community/modules/compute/SchedMD-slurm-on-gcp-partition - use: [network1, homefs] - settings: - max_node_count: 200 - partition_name: compute - machine_type: c2-standard-30 -``` - -## GPU Support - -More information on GPU support in Slurm on GCP and other HPC Toolkit modules -can be found at [docs/gpu-support.md](../../../../docs/gpu-support.md) - -## Support -The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform -modules. For support with the underlying modules, see the instructions in the -[slurm-gcp README][slurm-gcp-readme]. - -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform - - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 0.14.0 | - -## Providers - -No providers. - -## Modules - -No modules. - -## Resources - -No resources. - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | -| [compute\_disk\_size\_gb](#input\_compute\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes | `number` | `20` | no | -| [compute\_disk\_type](#input\_compute\_disk\_type) | Type of boot disk to create for the partition compute nodes | `string` | `"pd-standard"` | no | -| [cpu\_platform](#input\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | -| [enable\_placement](#input\_enable\_placement) | Enable compact placement policies for jobs requiring low latency networking. | `bool` | `true` | no | -| [exclusive](#input\_exclusive) | Exclusive job access to nodes | `bool` | `true` | no | -| [gpu\_count](#input\_gpu\_count) | Number of GPUs attached to the partition compute instances | `number` | `0` | no | -| [gpu\_type](#input\_gpu\_type) | Type of GPUs attached to the partition compute instances | `string` | `null` | no | -| [image\_hyperthreads](#input\_image\_hyperthreads) | Enable hyperthreading | `bool` | `false` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used by the compute VMs in this partition.
Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.
Custom images must comply with Slurm on GCP requirements. | `map(string)` |
{
"family": "schedmd-slurm-21-08-8-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | -| [instance\_template](#input\_instance\_template) | Instance template to use to create partition instances | `string` | `null` | no | -| [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | -| [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes | `string` | `"c2-standard-60"` | no | -| [max\_node\_count](#input\_max\_node\_count) | Maximum number of nodes allowed in this partition | `number` | `50` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on the partition compute nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [partition\_name](#input\_partition\_name) | The name of the slurm partition | `string` | n/a | yes | -| [preemptible\_bursting](#input\_preemptible\_bursting) | Should use preemptibles to burst | `string` | `false` | no | -| [regional\_capacity](#input\_regional\_capacity) | If True, then create instances in the region that has available capacity. Specify the region in the zone field. | `bool` | `false` | no | -| [regional\_policy](#input\_regional\_policy) | locationPolicy definition for regional bulkInsert() | `any` | `{}` | no | -| [static\_node\_count](#input\_static\_node\_count) | Number of nodes to be statically created | `number` | `0` | no | -| [subnetwork\_name](#input\_subnetwork\_name) | The name of the pre-defined VPC subnet you want the nodes to attach to based on Region. | `string` | n/a | yes | -| [zone](#input\_zone) | Compute Platform zone where the notebook server will be located | `string` | n/a | yes | - -## Outputs - -| Name | Description | -|------|-------------| -| [partition](#output\_partition) | The partition structure containing all the set variables | - diff --git a/community/modules/compute/SchedMD-slurm-on-gcp-partition/metadata.yaml b/community/modules/compute/SchedMD-slurm-on-gcp-partition/metadata.yaml deleted file mode 100644 index 4c2f23a8d7..0000000000 --- a/community/modules/compute/SchedMD-slurm-on-gcp-partition/metadata.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2023 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -spec: - requirements: - services: - - compute.googleapis.com diff --git a/community/modules/compute/SchedMD-slurm-on-gcp-partition/outputs.tf b/community/modules/compute/SchedMD-slurm-on-gcp-partition/outputs.tf deleted file mode 100644 index 4fc43627e3..0000000000 --- a/community/modules/compute/SchedMD-slurm-on-gcp-partition/outputs.tf +++ /dev/null @@ -1,58 +0,0 @@ -# -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -locals { - # This label allows for billing report tracking based on module. - labels = merge(var.labels, { ghpc_module = "schedmd-slurm-on-gcp-partition", ghpc_role = "compute" }) -} - -locals { - instance_name = lookup(var.instance_image, "name", null) - instance_family = lookup(var.instance_image, "family", null) - instance_image = ( - local.instance_name != null ? 
- "projects/${var.instance_image["project"]}/global/images/${local.instance_name}" : - "projects/${var.instance_image["project"]}/global/images/family/${local.instance_family}" - ) -} - - -output "partition" { - description = "The partition structure containing all the set variables" - value = { - name : var.partition_name - machine_type : var.machine_type - static_node_count : var.static_node_count - max_node_count : var.max_node_count - zone : var.zone - image : local.instance_image - image_hyperthreads : var.image_hyperthreads - compute_disk_type : var.compute_disk_type - compute_disk_size_gb : var.compute_disk_size_gb - compute_labels : local.labels - cpu_platform : var.cpu_platform - gpu_count : var.gpu_count - gpu_type : var.gpu_type - network_storage : var.network_storage - preemptible_bursting : var.preemptible_bursting - vpc_subnet : var.subnetwork_name - exclusive : var.exclusive - enable_placement : var.enable_placement - regional_capacity : var.regional_capacity - regional_policy : var.regional_policy - bandwidth_tier : var.bandwidth_tier - instance_template : var.instance_template - } -} diff --git a/community/modules/compute/SchedMD-slurm-on-gcp-partition/variables.tf b/community/modules/compute/SchedMD-slurm-on-gcp-partition/variables.tf deleted file mode 100644 index d94ee8f86a..0000000000 --- a/community/modules/compute/SchedMD-slurm-on-gcp-partition/variables.tf +++ /dev/null @@ -1,184 +0,0 @@ -# -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -variable "partition_name" { - description = "The name of the slurm partition" - type = string -} - -variable "machine_type" { - description = "Compute Platform machine type to use for this partition compute nodes" - type = string - default = "c2-standard-60" -} - -variable "static_node_count" { - description = "Number of nodes to be statically created" - type = number - default = 0 -} - -variable "max_node_count" { - description = "Maximum number of nodes allowed in this partition" - type = number - default = 50 -} - -variable "zone" { - description = "Compute Platform zone where the notebook server will be located" - type = string -} - -variable "instance_image" { - description = <<-EOD - Defines the image that will be used by the compute VMs in this partition. - Expected Fields: - name: The name of the image. Mutually exclusive with family. - family: The image family to use. Mutually exclusive with name. - project: The project where the image is hosted. - Custom images must comply with Slurm on GCP requirements. - EOD - type = map(string) - default = { - project = "schedmd-slurm-public" - family = "schedmd-slurm-21-08-8-hpc-centos-7" - } - - validation { - condition = length(var.instance_image) == 0 || ( - can(var.instance_image["family"]) || can(var.instance_image["name"])) == can(var.instance_image["project"]) - error_message = "The \"project\" is required if \"family\" or \"name\" are provided in var.instance_image." 
- } - validation { - condition = length(var.instance_image) == 0 || can(var.instance_image["family"]) != can(var.instance_image["name"]) - error_message = "Exactly one of \"family\" and \"name\" must be provided in var.instance_image." - } -} - -variable "image_hyperthreads" { - description = "Enable hyperthreading" - type = bool - default = false -} - -variable "compute_disk_type" { - description = "Type of boot disk to create for the partition compute nodes" - type = string - default = "pd-standard" -} - -variable "compute_disk_size_gb" { - description = "Size of boot disk to create for the partition compute nodes" - type = number - default = 20 -} - -variable "labels" { - description = "Labels to add to partition compute instances. Key-value pairs." - type = map(string) - default = {} -} - -variable "cpu_platform" { - description = "The name of the minimum CPU platform that you want the instance to use." - type = string - default = null -} - -variable "gpu_count" { - description = "Number of GPUs attached to the partition compute instances" - type = number - default = 0 -} - -variable "gpu_type" { - description = "Type of GPUs attached to the partition compute instances" - type = string - default = null -} - -variable "network_storage" { - description = "An array of network attached storage mounts to be configured on the partition compute nodes." - type = list(object({ - server_ip = string, - remote_mount = string, - local_mount = string, - fs_type = string, - mount_options = string, - client_install_runner = map(string) - mount_runner = map(string) - })) - default = [] -} - -variable "preemptible_bursting" { - description = "Should use preemptibles to burst" - type = string - default = false -} - -variable "subnetwork_name" { - description = "The name of the pre-defined VPC subnet you want the nodes to attach to based on Region." - type = string -} - -variable "exclusive" { - description = "Exclusive job access to nodes" - type = bool - default = true -} - -variable "enable_placement" { - description = "Enable compact placement policies for jobs requiring low latency networking." - type = bool - default = true -} - -variable "regional_capacity" { - description = "If True, then create instances in the region that has available capacity. Specify the region in the zone field." - type = bool - default = false -} - -variable "regional_policy" { - description = "locationPolicy definition for regional bulkInsert()" - type = any - default = {} -} - -variable "bandwidth_tier" { - description = < **Warning**: this module is now deprecated. We recommend using the Slurm on GCP V5 -> [schedmd-slurm-gcp-v5-controller](../schedmd-slurm-gcp-v5-controller/README.md) instead. - -This module creates a slurm controller node via the SchedMD/slurm-gcp -[controller] module. - -More information about Slurm On GCP can be found at the -[project's GitHub page][slurm-on-gcp] and in the -[Slurm on Google Cloud User Guide][slurm-ug]. - -The [user guide][slurm-ug] provides detailed instructions on customizing and -enhancing the Slurm on GCP cluster as well as recommendations on configuring the -controller for optimal performance at different scales. - -[controller]: https://github.com/SchedMD/slurm-gcp/tree/v4.2.0/tf/modules/controller -[slurm-ug]: https://goo.gle/slurm-gcp-user-guide. 
- -### Example - -```yaml -- id: slurm_controller - source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - use: - - network1 - - homefs - - compute_partition - settings: - login_node_count: 1 -``` - -This creates a controller node connected to the primary subnetwork with 1 login -node (defined elsewhere). The controller will also have the `homefs` file system -mounted via the `use` field and manage one partition, also declared in the `use` -field. - -## GPU Support - -More information on GPU support in Slurm on GCP and other HPC Toolkit modules -can be found at [docs/gpu-support.md](../../../../docs/gpu-support.md) - -## Support -The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform -modules. For support with the underlying modules, see the instructions in the -[slurm-gcp README][slurm-gcp-readme]. - -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform - -## License - - -Copyright 2022 Google LLC - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 0.14.0 | - -## Providers - -No providers. - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [slurm\_cluster\_compute\_node](#module\_slurm\_cluster\_compute\_node) | github.com/SchedMD/slurm-gcp//tf/modules/compute/ | v4.2.1 | -| [slurm\_cluster\_controller](#module\_slurm\_cluster\_controller) | github.com/SchedMD/slurm-gcp//tf/modules/controller/ | v4.2.1 | - -## Resources - -No resources. - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [boot\_disk\_size](#input\_boot\_disk\_size) | Size of boot disk to create for the cluster controller node | `number` | `50` | no | -| [boot\_disk\_type](#input\_boot\_disk\_type) | Type of boot disk to create for the cluster controller node.
Choose from: pd-ssd, pd-standard, pd-balanced, pd-extreme.
pd-ssd is recommended if the controller is hosting the SlurmDB and NFS share.
If SlurmDB and NFS share are not running on the controller, pd-standard is
recommended. See "Controller configuration recommendations" in the Slurm on
Google Cloud User Guide for more information:
https://goo.gle/slurm-gcp-user-guide | `string` | `"pd-ssd"` | no | -| [cloudsql](#input\_cloudsql) | Define an existing CloudSQL instance to use instead of instance-local MySQL |
object({
server_ip = string,
user = string,
password = string,
db_name = string
})
| `null` | no | -| [cluster\_name](#input\_cluster\_name) | Name of the cluster | `string` | `null` | no | -| [compute\_node\_scopes](#input\_compute\_node\_scopes) | Scopes to apply to compute nodes. | `list(string)` |
[
"https://www.googleapis.com/auth/monitoring.write",
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/devstorage.read_only"
]
| no | -| [compute\_node\_service\_account](#input\_compute\_node\_service\_account) | Service Account for compute nodes. | `string` | `null` | no | -| [compute\_startup\_script](#input\_compute\_startup\_script) | Custom startup script to run on the compute nodes | `string` | `null` | no | -| [controller\_instance\_template](#input\_controller\_instance\_template) | Instance template to use to create controller instance | `string` | `null` | no | -| [controller\_machine\_type](#input\_controller\_machine\_type) | Compute Platform machine type to use in controller node creation. `c2-standard-4`
is recommended for clusters up to 50 nodes, for larger clusters see
"Controller configuration recommendations" in the Slurm on Google Cloud User
Guide: https://goo.gle/slurm-gcp-user-guide | `string` | `"c2-standard-4"` | no | -| [controller\_scopes](#input\_controller\_scopes) | Scopes to apply to the controller | `list(string)` |
[
"https://www.googleapis.com/auth/cloud-platform",
"https://www.googleapis.com/auth/devstorage.read_only"
]
| no | -| [controller\_secondary\_disk](#input\_controller\_secondary\_disk) | Create secondary disk mounted to controller node | `bool` | `false` | no | -| [controller\_secondary\_disk\_size](#input\_controller\_secondary\_disk\_size) | Size of disk for the secondary disk | `number` | `100` | no | -| [controller\_secondary\_disk\_type](#input\_controller\_secondary\_disk\_type) | Disk type (pd-ssd or pd-standard) for secondary disk | `string` | `"pd-ssd"` | no | -| [controller\_service\_account](#input\_controller\_service\_account) | Service Account for the controller | `string` | `null` | no | -| [controller\_startup\_script](#input\_controller\_startup\_script) | Custom startup script to run on the controller | `string` | `null` | no | -| [deployment\_name](#input\_deployment\_name) | Name of the deployment | `string` | n/a | yes | -| [disable\_compute\_public\_ips](#input\_disable\_compute\_public\_ips) | If set to true, create Cloud NAT gateway and enable IAP FW rules | `bool` | `true` | no | -| [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | If set to true, create Cloud NAT gateway and enable IAP FW rules | `bool` | `false` | no | -| [instance\_image](#input\_instance\_image) | Slurm image to use for the controller instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.
Custom images must comply with Slurm on GCP requirements. | `map(string)` |
{
"family": "schedmd-slurm-21-08-8-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | -| [intel\_select\_solution](#input\_intel\_select\_solution) | Configure the cluster to meet the performance requirement of the Intel Select Solution | `string` | `null` | no | -| [jwt\_key](#input\_jwt\_key) | Specific libjwt key to use | `any` | `null` | no | -| [labels](#input\_labels) | Labels to add to controller instance. Key-value pairs. | `map(string)` | `{}` | no | -| [login\_node\_count](#input\_login\_node\_count) | Number of login nodes in the cluster | `number` | `0` | no | -| [munge\_key](#input\_munge\_key) | Specific munge key to use | `any` | `null` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [partition](#input\_partition) | An array of configurations for specifying multiple machine types residing in their own Slurm partitions. |
list(object({
name = string,
machine_type = string,
max_node_count = number,
zone = string,
image = string,
image_hyperthreads = bool,
compute_disk_type = string,
compute_disk_size_gb = number,
compute_labels = any,
cpu_platform = string,
gpu_type = string,
gpu_count = number,
network_storage = list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string
})),
preemptible_bursting = string,
vpc_subnet = string,
exclusive = bool,
enable_placement = bool,
regional_capacity = bool,
regional_policy = any,
instance_template = string,
bandwidth_tier = string,
static_node_count = number
}))
| n/a | yes | -| [project\_id](#input\_project\_id) | Compute Platform project that will host the Slurm cluster | `string` | n/a | yes | -| [region](#input\_region) | Compute Platform region where the Slurm cluster will be located | `string` | n/a | yes | -| [shared\_vpc\_host\_project](#input\_shared\_vpc\_host\_project) | Host project of shared VPC | `string` | `null` | no | -| [startup\_script](#input\_startup\_script) | Custom startup script to run on compute nodes and controller.
`controller_startup_script` for the controller and `compute_startup_script` for compute nodes take precedence if specified.
This variable allows Slurm to [use](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules#use-optional) the [startup\_script](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/scripts/startup-script) module. | `string` | `null` | no | -| [subnetwork\_name](#input\_subnetwork\_name) | The name of the pre-defined VPC subnet you want the nodes to attach to based on Region. | `string` | `null` | no | -| [suspend\_time](#input\_suspend\_time) | Idle time (in sec) to wait before nodes go away | `number` | `300` | no | -| [zone](#input\_zone) | Compute Platform zone where the servers will be located | `string` | n/a | yes | - -## Outputs - -| Name | Description | -|------|-------------| -| [controller\_name](#output\_controller\_name) | Name of the controller node | - diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf deleted file mode 100644 index 76e74fca34..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -locals { - # This label allows for billing report tracking based on module. - labels = merge(var.labels, { ghpc_module = "schedmd-slurm-on-gcp-controller", ghpc_role = "scheduler" }) -} - -locals { - controller_startup_script = var.controller_startup_script != null ? var.controller_startup_script : var.startup_script - compute_startup_script = var.compute_startup_script != null ? var.compute_startup_script : var.startup_script - cluster_name = var.cluster_name != null ? var.cluster_name : "slurm-${var.deployment_name}" - - instance_name = lookup(var.instance_image, "name", null) - instance_family = lookup(var.instance_image, "family", null) - instance_image = ( - local.instance_name != null ? 
- "projects/${var.instance_image["project"]}/global/images/${local.instance_name}" : - "projects/${var.instance_image["project"]}/global/images/family/${local.instance_family}" - ) -} - -module "slurm_cluster_controller" { - source = "github.com/SchedMD/slurm-gcp//tf/modules/controller/?ref=v4.2.1" - boot_disk_size = var.boot_disk_size - boot_disk_type = var.boot_disk_type - image = local.instance_image - instance_template = var.controller_instance_template - cluster_name = local.cluster_name - compute_node_scopes = var.compute_node_scopes - compute_node_service_account = var.compute_node_service_account - disable_compute_public_ips = var.disable_compute_public_ips - disable_controller_public_ips = var.disable_controller_public_ips - labels = local.labels - login_network_storage = var.network_storage - login_node_count = var.login_node_count - machine_type = var.controller_machine_type - munge_key = var.munge_key - jwt_key = var.jwt_key - network_storage = var.network_storage - partitions = var.partition - controller_startup_script = local.controller_startup_script - compute_startup_script = local.compute_startup_script - project = var.project_id - region = var.region - secondary_disk = var.controller_secondary_disk - secondary_disk_size = var.controller_secondary_disk_size - secondary_disk_type = var.controller_secondary_disk_type - shared_vpc_host_project = var.shared_vpc_host_project - scopes = var.controller_scopes - service_account = var.controller_service_account - subnetwork_name = var.subnetwork_name - suspend_time = var.suspend_time - zone = var.zone - intel_select_solution = var.intel_select_solution - cloudsql = var.cloudsql -} - -module "slurm_cluster_compute_node" { - source = "github.com/SchedMD/slurm-gcp//tf/modules/compute/?ref=v4.2.1" - project = var.project_id - cluster_name = local.cluster_name - region = var.region - zone = var.zone - controller_name = module.slurm_cluster_controller.controller_node_name - controller_secondary_disk = var.controller_secondary_disk - disable_compute_public_ips = var.disable_compute_public_ips - network_storage = var.network_storage - partitions = var.partition - compute_startup_script = local.compute_startup_script - scopes = var.compute_node_scopes - service_account = var.compute_node_service_account - shared_vpc_host_project = var.shared_vpc_host_project - subnetwork_name = var.subnetwork_name - intel_select_solution = var.intel_select_solution - munge_key = var.munge_key -} diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/metadata.yaml b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/metadata.yaml deleted file mode 100644 index 4c2f23a8d7..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/metadata.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2023 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -spec: - requirements: - services: - - compute.googleapis.com diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/outputs.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/outputs.tf deleted file mode 100644 index 81be162e5b..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/outputs.tf +++ /dev/null @@ -1,20 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "controller_name" { - description = "Name of the controller node" - value = module.slurm_cluster_controller.controller_node_name -} diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/variables.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/variables.tf deleted file mode 100644 index c5ce7900f3..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/variables.tf +++ /dev/null @@ -1,294 +0,0 @@ -# -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -variable "boot_disk_size" { - description = "Size of boot disk to create for the cluster controller node" - type = number - default = 50 -} - -variable "boot_disk_type" { - description = <<-EOT - Type of boot disk to create for the cluster controller node. - Choose from: pd-ssd, pd-standard, pd-balanced, pd-extreme. - pd-ssd is recommended if the controller is hosting the SlurmDB and NFS share. - If SlurmDB and NFS share are not running on the controller, pd-standard is - recommended. See "Controller configuration recommendations" in the Slurm on - Google Cloud User Guide for more information: - https://goo.gle/slurm-gcp-user-guide - EOT - type = string - default = "pd-ssd" -} - -variable "instance_image" { - description = <<-EOD - Slurm image to use for the controller instance. - - Expected Fields: - name: The name of the image. Mutually exclusive with family. - family: The image family to use. Mutually exclusive with name. - project: The project where the image is hosted. - Custom images must comply with Slurm on GCP requirements. - EOD - type = map(string) - default = { - project = "schedmd-slurm-public" - family = "schedmd-slurm-21-08-8-hpc-centos-7" - } - - validation { - condition = length(var.instance_image) == 0 || ( - can(var.instance_image["family"]) || can(var.instance_image["name"])) == can(var.instance_image["project"]) - error_message = "The \"project\" is required if \"family\" or \"name\" are provided in var.instance_image." 
- } - validation { - condition = length(var.instance_image) == 0 || can(var.instance_image["family"]) != can(var.instance_image["name"]) - error_message = "Exactly one of \"family\" and \"name\" must be provided in var.instance_image." - } -} - -variable "controller_instance_template" { - description = "Instance template to use to create controller instance" - type = string - default = null -} - -variable "cluster_name" { - description = "Name of the cluster" - type = string - default = null -} - -variable "deployment_name" { - description = "Name of the deployment" - type = string -} - -variable "compute_node_scopes" { - description = "Scopes to apply to compute nodes." - type = list(string) - default = [ - "https://www.googleapis.com/auth/monitoring.write", - "https://www.googleapis.com/auth/logging.write", - "https://www.googleapis.com/auth/devstorage.read_only", - ] -} - -variable "compute_node_service_account" { - description = "Service Account for compute nodes." - type = string - default = null -} - -variable "disable_controller_public_ips" { - description = "If set to true, create Cloud NAT gateway and enable IAP FW rules" - type = bool - default = false -} - -variable "disable_compute_public_ips" { - description = "If set to true, create Cloud NAT gateway and enable IAP FW rules" - type = bool - default = true -} - -variable "labels" { - description = "Labels to add to controller instance. Key-value pairs." - type = map(string) - default = {} -} - -variable "login_node_count" { - description = "Number of login nodes in the cluster" - type = number - default = 0 -} - -variable "controller_machine_type" { - description = <<-EOT - Compute Platform machine type to use in controller node creation. `c2-standard-4` - is recommended for clusters up to 50 nodes, for larger clusters see - "Controller configuration recommendations" in the Slurm on Google Cloud User - Guide: https://goo.gle/slurm-gcp-user-guide - EOT - type = string - default = "c2-standard-4" -} - -variable "munge_key" { - description = "Specific munge key to use" - type = any - default = null -} - -variable "jwt_key" { - description = "Specific libjwt key to use" - type = any - default = null -} - -variable "network_storage" { - description = "An array of network attached storage mounts to be configured on all instances." - type = list(object({ - server_ip = string, - remote_mount = string, - local_mount = string, - fs_type = string, - mount_options = string, - client_install_runner = map(string) - mount_runner = map(string) - })) - default = [] -} - -variable "partition" { - description = "An array of configurations for specifying multiple machine types residing in their own Slurm partitions." 
- type = list(object({ - name = string, - machine_type = string, - max_node_count = number, - zone = string, - image = string, - image_hyperthreads = bool, - compute_disk_type = string, - compute_disk_size_gb = number, - compute_labels = any, - cpu_platform = string, - gpu_type = string, - gpu_count = number, - network_storage = list(object({ - server_ip = string, - remote_mount = string, - local_mount = string, - fs_type = string, - mount_options = string - })), - preemptible_bursting = string, - vpc_subnet = string, - exclusive = bool, - enable_placement = bool, - regional_capacity = bool, - regional_policy = any, - instance_template = string, - bandwidth_tier = string, - static_node_count = number - })) -} - -variable "controller_startup_script" { - description = "Custom startup script to run on the controller" - type = string - default = null -} - -variable "compute_startup_script" { - description = "Custom startup script to run on the compute nodes" - type = string - default = null -} - -variable "startup_script" { - description = < **Warning**: this module is now deprecated. We recommend using the Slurm on GCP V5 -> [schedmd-slurm-gcp-v5-login](../schedmd-slurm-gcp-v5-login/README.md) instead. - -This module creates a login node for a Slurm cluster based on the -[Slurm on GCP][slurm-on-gcp] terraform [login module][login-module]. The login -node is used in conjunction with the -[Slurm controller](../SchedMD-slurm-on-gcp-controller). - -> **_Warning:_**: Slurm handles startup scripts differently from virtual -> machines. This will not work in conjunction with the -> [startup_script](../../../scripts/startup-script/README.md) module. - -[login-module]: https://github.com/SchedMD/slurm-gcp/tree/master/tf/modules/login - -### Example - -```yaml -- id: slurm_login - source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - use: - - network1 - - homefs - - slurm_controller - settings: - login_machine_type: n2-standard-4 -``` - -This creates a Slurm login node which is: - -* connected to the primary subnet of network1 via `use` -* mounted to the homefs filesystem via `use` -* associated with the `slurm_controller` module as the slurm controller via - `use` -* of VM machine type `n2-standard-4` - -## GPU Support - -More information on GPU support in Slurm on GCP and other HPC Toolkit modules -can be found at [docs/gpu-support.md](../../../../docs/gpu-support.md) - -## Support -The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform -modules. For support with the underlying modules, see the instructions in the -[slurm-gcp README][slurm-gcp-readme]. - -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform - -## License - - -Copyright 2022 Google LLC - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 0.14.0 | - -## Providers - -No providers. 
- -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [slurm\_cluster\_login\_node](#module\_slurm\_cluster\_login\_node) | github.com/SchedMD/slurm-gcp//tf/modules/login/ | v4.2.1 | - -## Resources - -No resources. - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [boot\_disk\_size](#input\_boot\_disk\_size) | Size of boot disk to create for the cluster login node | `number` | `20` | no | -| [boot\_disk\_type](#input\_boot\_disk\_type) | Type of boot disk to create for the cluster login node | `string` | `"pd-standard"` | no | -| [cluster\_name](#input\_cluster\_name) | Name of the cluster | `string` | `null` | no | -| [controller\_name](#input\_controller\_name) | FQDN or IP address of the controller node | `string` | n/a | yes | -| [controller\_secondary\_disk](#input\_controller\_secondary\_disk) | Create secondary disk mounted to controller node | `bool` | `false` | no | -| [deployment\_name](#input\_deployment\_name) | Name of the deployment | `string` | n/a | yes | -| [disable\_login\_public\_ips](#input\_disable\_login\_public\_ips) | If set to true, create Cloud NAT gateway and enable IAP FW rules | `bool` | `false` | no | -| [instance\_image](#input\_instance\_image) | Disk OS image with Slurm preinstalled to use for login node.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.
Custom images must comply with Slurm on GCP requirements. | `map(string)` |
{
"family": "schedmd-slurm-21-08-8-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | -| [labels](#input\_labels) | Labels to add to login instances. Key-value pairs. | `map(string)` | `{}` | no | -| [login\_instance\_template](#input\_login\_instance\_template) | Instance template to use to create controller instance | `string` | `null` | no | -| [login\_machine\_type](#input\_login\_machine\_type) | Machine type to use for login node instances. | `string` | `"n2-standard-2"` | no | -| [login\_node\_count](#input\_login\_node\_count) | Number of login nodes in the cluster | `number` | `1` | no | -| [login\_scopes](#input\_login\_scopes) | Scopes to apply to login nodes. | `list(string)` |
[
"https://www.googleapis.com/auth/monitoring.write",
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/devstorage.read_only"
]
| no | -| [login\_service\_account](#input\_login\_service\_account) | Service Account for compute nodes. | `string` | `null` | no | -| [login\_startup\_script](#input\_login\_startup\_script) | Custom startup script to run on the login node | `string` | `null` | no | -| [munge\_key](#input\_munge\_key) | Specific munge key to use | `any` | `null` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [region](#input\_region) | Compute Platform region where the Slurm cluster will be located | `string` | n/a | yes | -| [shared\_vpc\_host\_project](#input\_shared\_vpc\_host\_project) | Host project of shared VPC | `string` | `null` | no | -| [startup\_script](#input\_startup\_script) | Custom startup script to run on the login node.
Will be ignored if `login_startup_script` is specified.
This variable allows Slurm to [use](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules#use-optional) the [startup\_script](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/modules/scripts/startup-script) module. | `string` | `null` | no | -| [subnet\_depend](#input\_subnet\_depend) | Used as a dependency between the network and instances | `string` | `""` | no | -| [subnetwork\_name](#input\_subnetwork\_name) | The name of the pre-defined VPC subnet you want the nodes to attach to based on Region. | `string` | `null` | no | -| [zone](#input\_zone) | Compute Platform zone where the notebook server will be located | `string` | n/a | yes | - -## Outputs - -No outputs. - diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/main.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/main.tf deleted file mode 100644 index 076225cbe9..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/main.tf +++ /dev/null @@ -1,62 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -locals { - # This label allows for billing report tracking based on module. - labels = merge(var.labels, { ghpc_module = "schedmd-slurm-on-gcp-login-node", ghpc_role = "scheduler" }) -} - -locals { - login_startup_script = var.login_startup_script != null ? var.login_startup_script : var.startup_script - - instance_name = lookup(var.instance_image, "name", null) - instance_family = lookup(var.instance_image, "family", null) - instance_image = ( - local.instance_name != null ? - "projects/${var.instance_image["project"]}/global/images/${local.instance_name}" : - "projects/${var.instance_image["project"]}/global/images/family/${local.instance_family}" - ) -} - -module "slurm_cluster_login_node" { - source = "github.com/SchedMD/slurm-gcp//tf/modules/login/?ref=v4.2.1" - boot_disk_size = var.boot_disk_size - boot_disk_type = var.boot_disk_type - image = local.instance_image - instance_template = var.login_instance_template - cluster_name = ( - var.cluster_name != null - ? 
var.cluster_name - : "slurm-${var.deployment_name}" - ) - controller_name = var.controller_name - controller_secondary_disk = var.controller_secondary_disk - disable_login_public_ips = var.disable_login_public_ips - labels = local.labels - login_network_storage = var.network_storage - machine_type = var.login_machine_type - munge_key = var.munge_key - network_storage = var.network_storage - node_count = var.login_node_count - region = var.region - scopes = var.login_scopes - service_account = var.login_service_account - shared_vpc_host_project = var.shared_vpc_host_project - subnet_depend = var.subnet_depend - subnetwork_name = var.subnetwork_name - zone = var.zone - login_startup_script = local.login_startup_script -} diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/metadata.yaml b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/metadata.yaml deleted file mode 100644 index 4c2f23a8d7..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/metadata.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2023 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -spec: - requirements: - services: - - compute.googleapis.com diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/variables.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/variables.tf deleted file mode 100644 index 331ce1ab28..0000000000 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/variables.tf +++ /dev/null @@ -1,186 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -variable "boot_disk_size" { - description = "Size of boot disk to create for the cluster login node" - type = number - default = 20 -} - -variable "boot_disk_type" { - description = "Type of boot disk to create for the cluster login node" - type = string - default = "pd-standard" -} - -variable "instance_image" { - description = <<-EOD - Disk OS image with Slurm preinstalled to use for login node. - - Expected Fields: - name: The name of the image. Mutually exclusive with family. - family: The image family to use. Mutually exclusive with name. - project: The project where the image is hosted. - Custom images must comply with Slurm on GCP requirements. 
- EOD - type = map(string) - default = { - project = "schedmd-slurm-public" - family = "schedmd-slurm-21-08-8-hpc-centos-7" - } - - validation { - condition = length(var.instance_image) == 0 || ( - can(var.instance_image["family"]) || can(var.instance_image["name"])) == can(var.instance_image["project"]) - error_message = "The \"project\" is required if \"family\" or \"name\" are provided in var.instance_image." - } - validation { - condition = length(var.instance_image) == 0 || can(var.instance_image["family"]) != can(var.instance_image["name"]) - error_message = "Exactly one of \"family\" and \"name\" must be provided in var.instance_image." - } -} - -variable "login_instance_template" { - description = "Instance template to use to create controller instance" - type = string - default = null -} - -variable "cluster_name" { - description = "Name of the cluster" - type = string - default = null -} - -variable "controller_name" { - description = "FQDN or IP address of the controller node" - type = string -} - -variable "controller_secondary_disk" { - description = "Create secondary disk mounted to controller node" - type = bool - default = false -} - -variable "deployment_name" { - description = "Name of the deployment" - type = string -} - -variable "disable_login_public_ips" { - description = "If set to true, create Cloud NAT gateway and enable IAP FW rules" - type = bool - default = false -} - -variable "labels" { - description = "Labels to add to login instances. Key-value pairs." - type = map(string) - default = {} -} - -variable "login_machine_type" { - description = "Machine type to use for login node instances." - type = string - default = "n2-standard-2" -} - -variable "munge_key" { - description = "Specific munge key to use" - type = any - default = null -} - -variable "network_storage" { - description = " An array of network attached storage mounts to be configured on all instances." - type = list(object({ - server_ip = string, - remote_mount = string, - local_mount = string, - fs_type = string, - mount_options = string, - client_install_runner = map(string) - mount_runner = map(string) - })) - default = [] -} - -variable "login_node_count" { - description = "Number of login nodes in the cluster" - type = number - default = 1 -} - -variable "region" { - description = "Compute Platform region where the Slurm cluster will be located" - type = string -} - -variable "login_scopes" { - description = "Scopes to apply to login nodes." - type = list(string) - default = [ - "https://www.googleapis.com/auth/monitoring.write", - "https://www.googleapis.com/auth/logging.write", - "https://www.googleapis.com/auth/devstorage.read_only", - ] -} - -variable "login_service_account" { - description = "Service Account for compute nodes." - type = string - default = null -} - -variable "shared_vpc_host_project" { - description = "Host project of shared VPC" - type = string - default = null -} - -variable "subnet_depend" { - description = "Used as a dependency between the network and instances" - type = string - default = "" -} - -variable "subnetwork_name" { - description = "The name of the pre-defined VPC subnet you want the nodes to attach to based on Region." 
- type = string - default = null -} - -variable "zone" { - description = "Compute Platform zone where the notebook server will be located" - type = string -} - -variable "login_startup_script" { - description = "Custom startup script to run on the login node" - type = string - default = null -} - -variable "startup_script" { - description = < **_NOTE:_** Slurm V4 is deprecated. If you want to use V4 modules, please use the +[ghpc-v1.27.0](https://github.com/GoogleCloudPlatform/hpc-toolkit/releases/tag/v1.27.0) +source code and build the ghpc binary from it. This source code also contains +deprecated examples using V4 modules for your reference. + ## Module Fields ### ID (Required) diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index e88bd6301f..36730850b9 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -104,12 +104,12 @@ and therefore must have access to GCS. > `https://www.googleapis.com/auth/devstorage.read_only`. > > This is set as a default scope in the [vm-instance], -> [SchedMD-slurm-on-gcp-login-node] and [SchedMD-slurm-on-gcp-controller] +> [schedMD-slurm-on-gcp-login-node] and [schedMD-slurm-on-gcp-controller] > modules [vm-instance]: ../../compute/vm-instance/README.md -[SchedMD-slurm-on-gcp-login-node]: ../../../community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md -[SchedMD-slurm-on-gcp-controller]: ../../../community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md +[schedMD-slurm-on-gcp-login-node]: ../../../community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +[schedMD-slurm-on-gcp-controller]: ../../../community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md ### Tracking startup script execution diff --git a/pkg/modulereader/metadata_legacy.go b/pkg/modulereader/metadata_legacy.go index 08894b802e..2571d262fd 100644 --- a/pkg/modulereader/metadata_legacy.go +++ b/pkg/modulereader/metadata_legacy.go @@ -41,9 +41,6 @@ func defaultAPIList(source string) []string { // https://console.cloud.google.com/apis/dashboard and // https://console.cloud.google.com/apis/library staticAPIMap := map[string][]string{ - "community/modules/compute/SchedMD-slurm-on-gcp-partition": { - "compute.googleapis.com", - }, "community/modules/compute/htcondor-execute-point": { "compute.googleapis.com", "storage.googleapis.com", @@ -85,11 +82,10 @@ func defaultAPIList(source string) []string { "community/modules/project/service-enablement": { "serviceusage.googleapis.com", }, - "community/modules/scheduler/SchedMD-slurm-on-gcp-controller": { - "compute.googleapis.com", - }, - "community/modules/scheduler/SchedMD-slurm-on-gcp-login-node": { + "community/modules/scheduler/schedmd-slurm-gcp-v6-controller": { "compute.googleapis.com", + "iam.googleapis.com", + "storage.googleapis.com", }, "community/modules/compute/gke-node-pool": { "container.googleapis.com", From a2b76d1fd90c96fce4e61f5b92d2b6d8266835cd Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 29 Jan 2024 13:19:34 -0600 Subject: [PATCH 107/151] Patch Slurm integration test Retry initial node count until sinfo command is successful --- .../daily-tests/ansible_playbooks/slurm-integration-test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml index e26ef26c35..69d04de793 100644 --- 
a/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml @@ -199,6 +199,9 @@ executable: /bin/bash changed_when: False register: initial_node_count + until: initial_node_count.rc == 0 + retries: 60 + delay: 15 - name: Run Integration tests for HPC toolkit ansible.builtin.include_tasks: "{{ test }}" From fc8360ff72de9c290192d7fb9c49089654495235 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 30 Jan 2024 13:16:09 -0600 Subject: [PATCH 108/151] Update Chrome Remote Desktop to Debian 12 by default --- .../remote-desktop/chrome-remote-desktop/README.md | 2 +- .../scripts/configure-grid-drivers.yml | 8 ++++++++ .../remote-desktop/chrome-remote-desktop/variables.tf | 4 ++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/community/modules/remote-desktop/chrome-remote-desktop/README.md b/community/modules/remote-desktop/chrome-remote-desktop/README.md index c1b3fb2bf7..aba3ea3250 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/README.md +++ b/community/modules/remote-desktop/chrome-remote-desktop/README.md @@ -85,7 +85,7 @@ No resources. | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. Requires virtual workstation accelerator if Nvidia Grid Drivers are required |
list(object({
type = string,
count = number
}))
|
[
{
"count": 1,
"type": "nvidia-tesla-t4-vws"
}
]
| no | | [install\_nvidia\_driver](#input\_install\_nvidia\_driver) | Installs the nvidia driver (true/false). For details, see https://cloud.google.com/compute/docs/gpus/install-drivers-gpu | `bool` | n/a | yes | | [instance\_count](#input\_instance\_count) | Number of instances | `number` | `1` | no | -| [instance\_image](#input\_instance\_image) | Image used to build chrome remote desktop node. The default image is from
family= \"debian-11\" and project = \"debian-cloud\". An alternative image is
from family = \"ubuntu-2204-lts\" and project = \"ubuntu-os-cloud\".

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"family": "debian-11",
"project": "debian-cloud"
}
| no | +| [instance\_image](#input\_instance\_image) | Image used to build chrome remote desktop node. The default image is from
family= \"debian-12\" and project = \"debian-cloud\". An alternative image is
from family = \"ubuntu-2204-lts\" and project = \"ubuntu-os-cloud\".

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"family": "debian-12",
"project": "debian-cloud"
}
| no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Machine type to use for the instance creation. Must be N1 family if GPU is used. | `string` | `"n1-standard-8"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | diff --git a/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-grid-drivers.yml b/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-grid-drivers.yml index e67f8f65d0..93efbfdb7e 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-grid-drivers.yml +++ b/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-grid-drivers.yml @@ -26,6 +26,14 @@ - gdm3 grid_fn: NVIDIA-Linux-x86_64-510.85.02-grid.run grid_ver: vGPU14.2 + bookworm: + packages: + - build-essential + - gdebi-core + - mesa-utils + - gdm3 + grid_fn: NVIDIA-Linux-x86_64-535.154.05-grid.run + grid_ver: vGPU16.3 jammy: packages: - build-essential diff --git a/community/modules/remote-desktop/chrome-remote-desktop/variables.tf b/community/modules/remote-desktop/chrome-remote-desktop/variables.tf index 276470a575..ba1fa5ea52 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/variables.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/variables.tf @@ -58,7 +58,7 @@ variable "network_storage" { variable "instance_image" { description = <<-EOD Image used to build chrome remote desktop node. The default image is from - family= \"debian-11\" and project = \"debian-cloud\". An alternative image is + family= \"debian-12\" and project = \"debian-cloud\". An alternative image is from family = \"ubuntu-2204-lts\" and project = \"ubuntu-os-cloud\". Expected Fields: @@ -69,7 +69,7 @@ variable "instance_image" { type = map(string) default = { project = "debian-cloud" - family = "debian-11" + family = "debian-12" } } From 1e69df492e0bfa2e388a1fc81c01d37f81c1c466 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 30 Jan 2024 19:30:28 +0000 Subject: [PATCH 109/151] Make TPU non-preemptible in blueprint and add retries in JAX verification integration test --- community/examples/hpc-slurm6-tpu.yaml | 6 +++--- .../daily-tests/ansible_playbooks/test-slurm-v6-tpu.yml | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/community/examples/hpc-slurm6-tpu.yaml b/community/examples/hpc-slurm6-tpu.yaml index 4a5dd3c3c3..f322825066 100644 --- a/community/examples/hpc-slurm6-tpu.yaml +++ b/community/examples/hpc-slurm6-tpu.yaml @@ -36,10 +36,10 @@ deployment_groups: node_type: v2-8 tf_version: 2.10.0 disable_public_ips: false - # To specify if TPU nodes are preemptible. The nodes will be shut down if - # it requires additional resources. + # Preemptible TPUs cost much less than non-preemptible TPUs. + # The Cloud TPU service might preempt (shut down) these TPUs at any time. # https://cloud.google.com/tpu/docs/preemptible - preemptible: true + preemptible: false # Specify whether to preserve TPU on suspend. # If set to true, suspended VM will be stopped. # If set to false, suspended VM will be deleted. 
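The change above trades cost for reliability: preemptible TPUs cost much less, but the Cloud TPU service can reclaim them at any time, so the shipped example now defaults to non-preemptible capacity. A blueprint that prefers the lower price can flip the setting back. Below is a minimal sketch of that opposite choice, assuming a nodeset module id, source path, and network reference that are not shown in this hunk; `preserve_tpu` is likewise an assumed companion field inferred from the surrounding comments:

```yaml
# Hypothetical nodeset entry; the settings mirror the hunk above.
- id: tpu_nodeset      # assumed module id
  source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu  # assumed source path
  use: [network]       # assumed network module id
  settings:
    node_type: v2-8
    tf_version: 2.10.0
    disable_public_ips: false
    # Accept possible preemption in exchange for the lower TPU price:
    preemptible: true
    # Assumed field: delete (rather than stop) suspended TPU VMs.
    preserve_tpu: false
```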
diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-slurm-v6-tpu.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-slurm-v6-tpu.yml index 6e845bd9de..4b0084e628 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-slurm-v6-tpu.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-slurm-v6-tpu.yml @@ -44,6 +44,8 @@ - name: Run JAX verification register: jax_status failed_when: jax_status.rc != 0 + retries: 3 + delay: 100 ansible.builtin.command: | srun -N 1 -p tpu bash -c ' pip install --upgrade 'jax[tpu]>0.3.0' -f https://storage.googleapis.com/jax-releases/libtpu_releases.html From ed94f893e274644a47d15a0beb6756803a209418 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 30 Jan 2024 16:40:48 -0600 Subject: [PATCH 110/151] Update startup-script module to latest release --- community/modules/compute/htcondor-execute-point/README.md | 2 +- community/modules/compute/htcondor-execute-point/main.tf | 2 +- community/modules/compute/pbspro-execution/README.md | 2 +- community/modules/compute/pbspro-execution/main.tf | 2 +- .../modules/remote-desktop/chrome-remote-desktop/README.md | 2 +- community/modules/remote-desktop/chrome-remote-desktop/main.tf | 2 +- community/modules/scheduler/htcondor-access-point/README.md | 2 +- community/modules/scheduler/htcondor-access-point/main.tf | 2 +- community/modules/scheduler/htcondor-central-manager/README.md | 2 +- community/modules/scheduler/htcondor-central-manager/main.tf | 2 +- community/modules/scheduler/pbspro-client/README.md | 2 +- community/modules/scheduler/pbspro-client/main.tf | 2 +- community/modules/scheduler/pbspro-server/README.md | 2 +- community/modules/scheduler/pbspro-server/main.tf | 2 +- community/modules/scripts/ramble-execute/README.md | 2 +- community/modules/scripts/ramble-execute/main.tf | 2 +- community/modules/scripts/ramble-setup/README.md | 2 +- community/modules/scripts/ramble-setup/main.tf | 2 +- community/modules/scripts/spack-execute/README.md | 2 +- community/modules/scripts/spack-execute/main.tf | 2 +- community/modules/scripts/spack-setup/README.md | 2 +- community/modules/scripts/spack-setup/main.tf | 2 +- modules/compute/vm-instance/README.md | 2 +- modules/compute/vm-instance/startup_from_network_storage.tf | 2 +- modules/scheduler/batch-job-template/README.md | 2 +- .../batch-job-template/startup_from_network_storage.tf | 2 +- modules/scheduler/batch-login-node/README.md | 2 +- modules/scheduler/batch-login-node/main.tf | 2 +- 28 files changed, 28 insertions(+), 28 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 098abc2c4f..41219ca73f 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -212,7 +212,7 @@ limitations under the License. 
|------|--------|---------| | [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 8.0 | | [mig](#module\_mig) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index c6abbbf8b8..4a13c3dced 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -125,7 +125,7 @@ resource "google_storage_bucket_object" "execute_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" project_id = var.project_id region = var.region diff --git a/community/modules/compute/pbspro-execution/README.md b/community/modules/compute/pbspro-execution/README.md index 0972b241bd..758428e0d2 100644 --- a/community/modules/compute/pbspro-execution/README.md +++ b/community/modules/compute/pbspro-execution/README.md @@ -74,7 +74,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | | [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | bb47067 | | [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.22.1 | diff --git a/community/modules/compute/pbspro-execution/main.tf b/community/modules/compute/pbspro-execution/main.tf index 18323b5ef5..92c269cf33 100644 --- a/community/modules/compute/pbspro-execution/main.tf +++ b/community/modules/compute/pbspro-execution/main.tf @@ -53,7 +53,7 @@ module "pbs_install" { } module "execution_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id diff --git a/community/modules/remote-desktop/chrome-remote-desktop/README.md b/community/modules/remote-desktop/chrome-remote-desktop/README.md index c1b3fb2bf7..13441e14f9 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/README.md +++ b/community/modules/remote-desktop/chrome-remote-desktop/README.md @@ -63,7 +63,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | | [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | bb47067 | ## Resources diff --git a/community/modules/remote-desktop/chrome-remote-desktop/main.tf b/community/modules/remote-desktop/chrome-remote-desktop/main.tf index 98d8c83568..e90820bfc8 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/main.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/main.tf @@ -55,7 +55,7 @@ locals { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index 05bec0c2ba..4d7fe60f1c 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -122,7 +122,7 @@ limitations under the License. |------|--------|---------| | [access\_point\_instance\_template](#module\_access\_point\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 84d7959 | | [htcondor\_ap](#module\_htcondor\_ap) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 94da6e7399..142a16eb35 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -143,7 +143,7 @@ resource "google_storage_bucket_object" "ap_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index 6fe5256f15..0912b0b1c5 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -108,7 +108,7 @@ limitations under the License. 
|------|--------|---------| | [central\_manager\_instance\_template](#module\_central\_manager\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 84d7959 | | [htcondor\_cm](#module\_htcondor\_cm) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index b4c03acf77..a47cb0ed49 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -110,7 +110,7 @@ resource "google_storage_bucket_object" "cm_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/pbspro-client/README.md b/community/modules/scheduler/pbspro-client/README.md index 92c684d4d6..1d40e411bd 100644 --- a/community/modules/scheduler/pbspro-client/README.md +++ b/community/modules/scheduler/pbspro-client/README.md @@ -74,7 +74,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | | [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | bb47067 | | [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.22.1 | diff --git a/community/modules/scheduler/pbspro-client/main.tf b/community/modules/scheduler/pbspro-client/main.tf index 82e335936c..5801bc15c0 100644 --- a/community/modules/scheduler/pbspro-client/main.tf +++ b/community/modules/scheduler/pbspro-client/main.tf @@ -43,7 +43,7 @@ module "pbs_install" { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id diff --git a/community/modules/scheduler/pbspro-server/README.md b/community/modules/scheduler/pbspro-server/README.md index 64911acfe4..9e1b047655 100644 --- a/community/modules/scheduler/pbspro-server/README.md +++ b/community/modules/scheduler/pbspro-server/README.md @@ -72,7 +72,7 @@ No providers. 
| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.22.1 | | [pbs\_qmgr](#module\_pbs\_qmgr) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr | v1.22.1 | | [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | bb47067 | -| [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/pbspro-server/main.tf b/community/modules/scheduler/pbspro-server/main.tf index 622a924dff..250e1ac9fc 100644 --- a/community/modules/scheduler/pbspro-server/main.tf +++ b/community/modules/scheduler/pbspro-server/main.tf @@ -55,7 +55,7 @@ module "pbs_qmgr" { } module "server_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id diff --git a/community/modules/scripts/ramble-execute/README.md b/community/modules/scripts/ramble-execute/README.md index 2af5b2218d..f0a50ccb2b 100644 --- a/community/modules/scripts/ramble-execute/README.md +++ b/community/modules/scripts/ramble-execute/README.md @@ -77,7 +77,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/community/modules/scripts/ramble-execute/main.tf b/community/modules/scripts/ramble-execute/main.tf index 99af3cfa66..0958ebbb3c 100644 --- a/community/modules/scripts/ramble-execute/main.tf +++ b/community/modules/scripts/ramble-execute/main.tf @@ -55,7 +55,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/ramble-setup/README.md b/community/modules/scripts/ramble-setup/README.md index f8d52c336c..61ead2de0d 100644 --- a/community/modules/scripts/ramble-setup/README.md +++ b/community/modules/scripts/ramble-setup/README.md @@ -86,7 +86,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index 16a0fcf29d..5be88a4442 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -94,7 +94,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/spack-execute/README.md b/community/modules/scripts/spack-execute/README.md index 99a375d29a..5e789b2808 100644 --- a/community/modules/scripts/spack-execute/README.md +++ b/community/modules/scripts/spack-execute/README.md @@ -104,7 +104,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/community/modules/scripts/spack-execute/main.tf b/community/modules/scripts/spack-execute/main.tf index d432041841..e3706b674a 100644 --- a/community/modules/scripts/spack-execute/main.tf +++ b/community/modules/scripts/spack-execute/main.tf @@ -54,7 +54,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md index 55314173fe..ec2bd4a38c 100644 --- a/community/modules/scripts/spack-setup/README.md +++ b/community/modules/scripts/spack-setup/README.md @@ -336,7 +336,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf index af10cfc53e..69b303f34f 100644 --- a/community/modules/scripts/spack-setup/main.tf +++ b/community/modules/scripts/spack-setup/main.tf @@ -100,7 +100,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 4728f60438..75558be86f 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -185,7 +185,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/modules/compute/vm-instance/startup_from_network_storage.tf b/modules/compute/vm-instance/startup_from_network_storage.tf index 506722d115..1febd98c9d 100644 --- a/modules/compute/vm-instance/startup_from_network_storage.tf +++ b/modules/compute/vm-instance/startup_from_network_storage.tf @@ -55,7 +55,7 @@ locals { } module "netstorage_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index f24b294c9c..8d343212c5 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -135,7 +135,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| | [instance\_template](#module\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 8.0 | -| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/modules/scheduler/batch-job-template/startup_from_network_storage.tf b/modules/scheduler/batch-job-template/startup_from_network_storage.tf index 506722d115..1febd98c9d 100644 --- a/modules/scheduler/batch-job-template/startup_from_network_storage.tf +++ b/modules/scheduler/batch-job-template/startup_from_network_storage.tf @@ -55,7 +55,7 @@ locals { } module "netstorage_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/modules/scheduler/batch-login-node/README.md b/modules/scheduler/batch-login-node/README.md index 43469c7257..86c14f93ef 100644 --- a/modules/scheduler/batch-login-node/README.md +++ b/modules/scheduler/batch-login-node/README.md @@ -89,7 +89,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [login\_startup\_script](#module\_login\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 50644b2 | +| [login\_startup\_script](#module\_login\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.27.0&depth=1 | ## Resources diff --git a/modules/scheduler/batch-login-node/main.tf b/modules/scheduler/batch-login-node/main.tf index f5f013cb9a..9f2cba6181 100644 --- a/modules/scheduler/batch-login-node/main.tf +++ b/modules/scheduler/batch-login-node/main.tf @@ -94,7 +94,7 @@ locals { } module "login_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=50644b2" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.27.0&depth=1" labels = local.labels project_id = var.project_id deployment_name = var.deployment_name From b68f3b4fb72a1e474b4abeaf15541f4168641a40 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 30 Jan 2024 15:04:56 -0800 Subject: [PATCH 111/151] Bump `pkg/modulereader` test coverage 80% -> 87% (#2161) --- pkg/modulereader/hcl_utils_test.go | 47 ++++++++++++++++++++++++++++++ pkg/modulereader/resreader_test.go | 5 ++++ 2 files changed, 52 insertions(+) diff --git a/pkg/modulereader/hcl_utils_test.go b/pkg/modulereader/hcl_utils_test.go index 94a0123847..13f038a067 100644 --- a/pkg/modulereader/hcl_utils_test.go +++ b/pkg/modulereader/hcl_utils_test.go @@ -16,7 +16,9 @@ package modulereader import ( "os" + "testing" + "github.com/zclconf/go-cty/cty" . 
"gopkg.in/check.v1" ) @@ -50,3 +52,48 @@ func (s *zeroSuite) TestReadHclAtttributes(c *C) { _, err = ReadHclAttributes(fn.Name()) c.Assert(err, NotNil) } + +func TestReplaceTokens(t *testing.T) { + type test struct { + ty string + err bool + want cty.Type + } + tests := []test{ + {"", false, cty.DynamicPseudoType}, + + {"string", false, cty.String}, + + {"list", false, cty.List(cty.DynamicPseudoType)}, + {"list(string)", false, cty.List(cty.String)}, + {"list(any)", false, cty.List(cty.DynamicPseudoType)}, + + {"map", false, cty.Map(cty.DynamicPseudoType)}, + {"map(string)", false, cty.Map(cty.String)}, + {"map(any)", false, cty.Map(cty.DynamicPseudoType)}, + + {`object({sweet=string})`, false, + cty.Object(map[string]cty.Type{"sweet": cty.String})}, + {`object({sweet=optional(string)})`, false, + cty.ObjectWithOptionalAttrs(map[string]cty.Type{"sweet": cty.String}, []string{"sweet"})}, + {`object({sweet=optional(string, "caramel")})`, false, + cty.ObjectWithOptionalAttrs(map[string]cty.Type{"sweet": cty.String}, []string{"sweet"})}, + + {"for", true, cty.NilType}, + } + for _, tc := range tests { + t.Run(tc.ty, func(t *testing.T) { + got, err := GetCtyType(tc.ty) + if tc.err != (err != nil) { + t.Errorf("got unexpected error: %s", err) + } + if err != nil { + return + } + + if !got.Equals(tc.want) { + t.Errorf("\nwant: %#v\ngot: %#v", tc.want, got) + } + }) + } +} diff --git a/pkg/modulereader/resreader_test.go b/pkg/modulereader/resreader_test.go index 2db972b52f..8397e71515 100644 --- a/pkg/modulereader/resreader_test.go +++ b/pkg/modulereader/resreader_test.go @@ -265,3 +265,8 @@ func (s *zeroSuite) TestUnmarshalOutputInfo(c *C) { y = "{ name: foo, description: bar, sensitive: contingent }" c.Check(yaml.Unmarshal([]byte(y), &oinfo), NotNil) } + +func (s *zeroSuite) TestLegacyMetadata(c *C) { + mi := legacyMetadata("community/modules/compute/SchedMD-slurm-on-gcp-partition") + c.Check(mi.Spec.Requirements.Services, DeepEquals, []string{"compute.googleapis.com"}) +} From af5fd401aa9a8581a7c2988d5966583b068c9da7 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 31 Jan 2024 16:16:52 -0800 Subject: [PATCH 112/151] Fix test broken by remove module. 
(#2186) --- pkg/modulereader/resreader_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/modulereader/resreader_test.go b/pkg/modulereader/resreader_test.go index 8397e71515..6dd6e9e12a 100644 --- a/pkg/modulereader/resreader_test.go +++ b/pkg/modulereader/resreader_test.go @@ -266,7 +266,7 @@ func (s *zeroSuite) TestUnmarshalOutputInfo(c *C) { c.Check(yaml.Unmarshal([]byte(y), &oinfo), NotNil) } -func (s *zeroSuite) TestLegacyMetadata(c *C) { - mi := legacyMetadata("community/modules/compute/SchedMD-slurm-on-gcp-partition") - c.Check(mi.Spec.Requirements.Services, DeepEquals, []string{"compute.googleapis.com"}) +func (s *zeroSuite) TestLegacyMetadata(c *C) { // dummy test for the sake of coverage + mi := legacyMetadata("modules/arbuz/velikan") + c.Check(mi.Spec.Requirements.Services, DeepEquals, []string{}) } From b0054f2d866184f70efd6a005daf2f9f932e7bd3 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Thu, 1 Feb 2024 18:56:56 +0000 Subject: [PATCH 113/151] Update TPU v6 blueprint to use new VPC module --- community/examples/hpc-slurm6-tpu.yaml | 3 +-- .../schedmd-slurm-gcp-v6-controller/README.md | 22 +++++++++---------- .../controller.tf | 8 +++---- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../partition.tf | 8 +++---- .../slurm_files.tf | 2 +- .../daily-tests/tests/slurm-v6-tpu.yml | 2 +- 7 files changed, 24 insertions(+), 25 deletions(-) diff --git a/community/examples/hpc-slurm6-tpu.yaml b/community/examples/hpc-slurm6-tpu.yaml index f322825066..ed3ccd449e 100644 --- a/community/examples/hpc-slurm6-tpu.yaml +++ b/community/examples/hpc-slurm6-tpu.yaml @@ -26,7 +26,7 @@ deployment_groups: - group: primary modules: - id: network - source: modules/network/pre-existing-vpc + source: modules/network/vpc - id: tpu_nodeset source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu @@ -35,7 +35,6 @@ name: v2x8 node_type: v2-8 tf_version: 2.10.0 - disable_public_ips: false # Preemptible TPUs cost much less than non-preemptible TPUs. # The Cloud TPU service might preempt (shut down) these TPUs at any time. # https://cloud.google.com/tpu/docs/preemptible diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 2c980163e9..fd7033b56f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -125,17 +125,17 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 3.0 | -| [cleanup\_compute\_nodes](#module\_cleanup\_compute\_nodes) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes | 6.3.2 | -| [cleanup\_resource\_policies](#module\_cleanup\_resource\_policies) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies | 6.3.2 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.3.2 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.2 | -| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.3.2 | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 6.3.2 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.2 | -| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.3.2 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.2 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.3.2 | -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 6.3.2 | +| [cleanup\_compute\_nodes](#module\_cleanup\_compute\_nodes) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes | 6.3.3 | +| [cleanup\_resource\_policies](#module\_cleanup\_resource\_policies) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies | 6.3.3 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.3.3 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.3 | +| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.3.3 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 6.3.3 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.3 | +| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.3.3 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.3 | 
+| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.3.3 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 6.3.3 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 55337b4ec4..fd57b25818 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -35,7 +35,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.3" count = local.have_template ? 0 : 1 project_id = var.project_id @@ -92,7 +92,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.3.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.3.3" access_config = !var.disable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false @@ -148,7 +148,7 @@ resource "google_secret_manager_secret_iam_member" "cloudsql_secret_accessor" { # Destroy all compute nodes on `terraform destroy` module "cleanup_compute_nodes" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes?ref=6.3.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes?ref=6.3.3" count = var.enable_cleanup_compute ? 1 : 0 slurm_cluster_name = local.slurm_cluster_name @@ -164,7 +164,7 @@ module "cleanup_compute_nodes" { # Destroy all resource policies on `terraform destroy` module "cleanup_resource_policies" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies?ref=6.3.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies?ref=6.3.3" count = var.enable_cleanup_compute ? 
1 : 0 slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 0d51129631..6936843c90 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.3" for_each = { for x in var.login_nodes : x.name_prefix => x @@ -59,7 +59,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=6.3.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=6.3.3" for_each = { for x in var.login_nodes : x.name_prefix => x } project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 539205d08e..7946b162d0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -21,7 +21,7 @@ locals { # NODESET module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.3" for_each = local.nodeset_map project_id = var.project_id @@ -60,7 +60,7 @@ module "slurm_nodeset_template" { } module "slurm_nodeset" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.3.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.3.3" for_each = local.nodeset_map instance_template_self_link = module.slurm_nodeset_template[each.key].self_link @@ -79,7 +79,7 @@ module "slurm_nodeset" { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.3.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.3.3" for_each = local.nodeset_tpu_map project_id = var.project_id @@ -101,7 +101,7 @@ module "slurm_nodeset_tpu" { # PARTITION module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=6.3.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=6.3.3" for_each = local.partition_map partition_nodeset = [for x in each.value.partition_nodeset : module.slurm_nodeset[x].nodeset_name if try(module.slurm_nodeset[x], null) != null] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index f20d884937..7e7dabc605 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -87,7 +87,7 @@ locals { } module "slurm_files" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.3.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.3.3" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml index 82f8427ff1..15e0c4d61e 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml @@ -27,7 +27,7 @@ cli_deployment_vars: zone: us-central1-b workspace: /workspace blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm6-tpu.yaml" -network: "default" +network: "{{ deployment_name }}-net" max_nodes: 5 # Note: Pattern matching in gcloud only supports 1 wildcard, a*-login-* won't work. login_node: "{{ slurm_cluster_name }}-login-*" From a22653b29ee25f32a9e6820fca4cd716ddc527f7 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 1 Feb 2024 20:43:54 -0800 Subject: [PATCH 114/151] Improve test coverage of `pkg/modulewriter` (#2188) --- pkg/modulewriter/modulewriter.go | 6 +--- pkg/modulewriter/modulewriter_test.go | 51 ++++++++++++--------------- pkg/modulewriter/packerwriter.go | 9 +---- pkg/modulewriter/tfwriter.go | 18 +++------- 4 files changed, 30 insertions(+), 54 deletions(-) diff --git a/pkg/modulewriter/modulewriter.go b/pkg/modulewriter/modulewriter.go index b662fd32b2..6a1f310607 100644 --- a/pkg/modulewriter/modulewriter.go +++ b/pkg/modulewriter/modulewriter.go @@ -333,10 +333,7 @@ func prepArtifactsDir(artifactsDir string) error { defer f.Close() _, err = f.WriteString(artifactsWarning) - if err != nil { - return err - } - return nil + return err } func writeExpandedBlueprint(depDir string, dc config.DeploymentConfig) error { @@ -366,7 +363,6 @@ func writeDestroyInstructions(w io.Writer, dc config.DeploymentConfig, deploymen } if grp.Kind() == config.PackerKind { packerManifests = append(packerManifests, filepath.Join(grpPath, string(grp.Modules[0].ID), "packer-manifest.json")) - } } diff --git a/pkg/modulewriter/modulewriter_test.go b/pkg/modulewriter/modulewriter_test.go index 9e9ce55b78..115a1eaa6c 100644 --- a/pkg/modulewriter/modulewriter_test.go +++ b/pkg/modulewriter/modulewriter_test.go @@ -460,44 +460,39 @@ func (s *MySuite) TestWriteDeploymentGroup_PackerWriter(c *C) { deploymentio := deploymentio.GetDeploymentioLocal() testWriter := PackerWriter{} - // No Packer modules - deploymentName := "deployment_TestWriteModuleLevel_PackerWriter" - deploymentDir := filepath.Join(s.testDir, deploymentName) - if err := deploymentio.CreateDirectory(deploymentDir); err != nil { - c.Fatal(err) - } - groupDir := filepath.Join(deploymentDir, "packerGroup") - if err := deploymentio.CreateDirectory(groupDir); err != nil { - c.Fatal(err) - } - moduleDir := filepath.Join(groupDir, "testPackerModule") - if err := deploymentio.CreateDirectory(moduleDir); err != nil { - c.Fatal(err) - } + otherMod := config.Module{ID: "tortoise"} - testPackerModule := config.Module{ + mod := config.Module{ Kind: config.PackerKind, - ID: "testPackerModule", - } - testDeploymentGroup := config.DeploymentGroup{ - Name: "packerGroup", - Modules: []config.Module{testPackerModule}, + ID: "prince", + Settings: config.NewDict(map[string]cty.Value{ + "zebra": cty.StringVal("checker"), // const + "salmon": 
config.GlobalRef("golf").AsExpression().AsValue(), // var + "bear": config.Reference{Module: otherMod.ID, Name: "rome"}.AsExpression().AsValue(), // IGC + }), } - testDC := config.DeploymentConfig{ + dc := config.DeploymentConfig{ Config: config.Blueprint{ + Vars: config.NewDict(map[string]cty.Value{ + "golf": cty.NumberIntVal(17), + }), DeploymentGroups: []config.DeploymentGroup{ - testDeploymentGroup, + {Name: "bread", Modules: []config.Module{otherMod}}, + {Name: "green", Modules: []config.Module{mod}}, }, }, } - f, err := os.CreateTemp("", "tmpf") - if err != nil { - c.Fatal() + + dir := c.MkDir() + moduleDir := filepath.Join(dir, string(mod.ID)) + if err := deploymentio.CreateDirectory(moduleDir); err != nil { + c.Fatal(err) } - defer os.Remove(f.Name()) - testWriter.writeDeploymentGroup(testDC, 0, groupDir, f) - _, err = os.Stat(filepath.Join(moduleDir, packerAutoVarFilename)) + instructions := new(strings.Builder) + + c.Assert(testWriter.writeDeploymentGroup(dc, 1, dir, instructions), IsNil) + _, err := os.Stat(filepath.Join(moduleDir, packerAutoVarFilename)) c.Assert(err, IsNil) } diff --git a/pkg/modulewriter/packerwriter.go b/pkg/modulewriter/packerwriter.go index d45fcdd767..81bd1985ef 100644 --- a/pkg/modulewriter/packerwriter.go +++ b/pkg/modulewriter/packerwriter.go @@ -61,21 +61,14 @@ func (w PackerWriter) writeDeploymentGroup( instructionsFile io.Writer, ) error { depGroup := dc.Config.DeploymentGroups[grpIdx] - igcInputs := map[string]bool{} for _, mod := range depGroup.Modules { pure := config.Dict{} for setting, v := range mod.Settings.Items() { - igcRefs := config.FindIntergroupReferences(v, mod, dc.Config) - if len(igcRefs) == 0 { + if len(config.FindIntergroupReferences(v, mod, dc.Config)) == 0 { pure.Set(setting, v) } - for _, r := range igcRefs { - n := config.AutomaticOutputName(r.Name, r.Module) - igcInputs[n] = true - } } - av, err := pure.Eval(dc.Config) if err != nil { return err diff --git a/pkg/modulewriter/tfwriter.go b/pkg/modulewriter/tfwriter.go index dc1a9975a2..22da6c2114 100644 --- a/pkg/modulewriter/tfwriter.go +++ b/pkg/modulewriter/tfwriter.go @@ -349,24 +349,16 @@ func orderKeys[T any](settings map[string]T) []string { } func getUsedDeploymentVars(group config.DeploymentGroup, bp config.Blueprint) map[string]cty.Value { - // labels must always be written as a variable as it is implicitly added - groupInputs := map[string]bool{ - "labels": true, + res := map[string]cty.Value{ + // labels must always be written as a variable as it is implicitly added + "labels": bp.Vars.Get("labels"), } - for _, mod := range group.Modules { for _, v := range config.GetUsedDeploymentVars(mod.Settings.AsObject()) { - groupInputs[v] = true + res[v] = bp.Vars.Get(v) } } - - filteredVars := make(map[string]cty.Value) - for key, val := range bp.Vars.Items() { - if groupInputs[key] { - filteredVars[key] = val - } - } - return filteredVars + return res } func substituteIgcReferences(mods []config.Module, igcRefs map[config.Reference]modulereader.VarInfo) ([]config.Module, error) { From 2ca0d09f07082cf6198685baa309ba720ade91c9 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Thu, 1 Feb 2024 19:52:06 +0000 Subject: [PATCH 115/151] Updating spack and ramble buckets to use 6 digits of hex --- community/modules/scripts/ramble-setup/main.tf | 2 +- community/modules/scripts/spack-setup/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index 
5be88a4442..65afeafb81 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -72,7 +72,7 @@ locals { "destination" = "install_ramble.yml" } - bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}"), 0, 4) + bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}"), 0, 8) bucket_name = "ramble-scripts-${local.bucket_md5}" runners = [local.install_ramble_deps_runner, local.install_ramble_runner, local.python_reqs_runner] diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf index 69b303f34f..2245718d75 100644 --- a/community/modules/scripts/spack-setup/main.tf +++ b/community/modules/scripts/spack-setup/main.tf @@ -79,7 +79,7 @@ locals { "destination" = "install_spack.yml" } - bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}.${local.script_content}"), 0, 4) + bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}.${local.script_content}"), 0, 8) bucket_name = "spack-scripts-${local.bucket_md5}" runners = [local.install_spack_deps_runner, local.install_spack_runner] From 2fa46ec8572a9965b12d2b8e9eb6d0ac17efb0ae Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 2 Feb 2024 09:24:57 -0800 Subject: [PATCH 116/151] Improve output of `tools/enforce_coverage.pl` (#2191) * Output package that failed; * Set thresholds `pkg/logging: 0; pkg/inspect: 60` --- tools/enforce_coverage.pl | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/tools/enforce_coverage.pl b/tools/enforce_coverage.pl index b4c5c99c21..77e59c2f5b 100755 --- a/tools/enforce_coverage.pl +++ b/tools/enforce_coverage.pl @@ -16,27 +16,29 @@ use strict; use warnings; -# TODO: raise ./cmd min coverage to 80% after tests are written -my $min = 80; -my $cmdmin = 40; -my $shellmin = 0; -my $validatorsmin = 25; -my $failed_coverage = 0; - +my @failed; while (<>){ print $_; - if ( $_ =~ /hpc-toolkit\/cmd.*coverage: (\d+\.\d)%/) { - $failed_coverage++ if ($1 < $cmdmin); - } elsif ( $_ =~ /hpc-toolkit\/pkg\/shell.*coverage: (\d+\.\d)%/) { - $failed_coverage++ if ($1 < $shellmin); - } elsif ( $_ =~ /hpc-toolkit\/pkg\/validators.*coverage: (\d+\.\d)%/) { - $failed_coverage++ if ($1 < $validatorsmin); - } elsif ( $_ =~ /coverage: (\d+\.\d)%/ ) { - $failed_coverage++ if ($1 < $min); + + my @thresholds = qw( + cmd 40 + pkg/shell 0 + pkg/logging 0 + pkg/validators 25 + pkg/inspect 60 + pkg 80 + ); + + while (@thresholds) { + my ($path, $threshold) = splice(@thresholds, 0, 2); + if ( $_ =~ /hpc-toolkit\/$path.*coverage: (\d+\.\d)%/) { + chomp, push @failed, "$_ <= $threshold%\n" if ($1 < $threshold); + last; + } } } -if ($failed_coverage > 0) { - print STDERR "Coverage must be above $cmdmin% for ./cmd and $min% for other packages, $failed_coverage packages were below that.\n"; +if (@failed) { + print STDERR "\nFAILED:\n@failed"; exit 1 } From e9302d3e77b228b08b25f9049f8f0ef2a4a330f9 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Fri, 2 Feb 2024 18:39:32 +0000 Subject: [PATCH 117/151] Remove v4 reference from network storage document --- docs/network_storage.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/network_storage.md b/docs/network_storage.md index eaea94de93..ebec65ef8a 100644 --- a/docs/network_storage.md +++ b/docs/network_storage.md @@ -98,19 +98,19 @@ The following is an example setting up a filestore using startup script: The following matrix shows the best method by 
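A note on the rewritten `tools/enforce_coverage.pl` above: the (path, threshold) pairs are checked in order and the first matching path wins (the loop exits via `last`), so the more specific `pkg/...` entries must stay ahead of the bare `pkg` fallback. The script passes `go test -cover` output through from stdin and collects the packages that fall below their threshold; an invocation along these lines (illustrative only, since the repository may wrap it in a make target) exercises it: `go test -cover ./... | ./tools/enforce_coverage.pl`.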
which each type of network storage device should be mounted to each mount capable module. -  | Slurm V4 | Slurm V5 | Batch | vm-instance | Packer (client install) | HTCondor\* | PBS Pro\* --- | -- | -- | -- | -- | -- | -- | -- -filestore | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE -nfs-server | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE -cloud-storage-bucket (GCS) | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE -DDN EXAScaler lustre | via USE | via USE | via USE | via USE | Needs Testing | via USE | via USE -Intel DAOS** | via STARTUP | Needs Testing | Needs Testing | Needs Testing | Needs Testing | Needs Testing | Needs Testing -  |   |   |   |   |   |   |   -filestore (pre-existing) | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE -nfs-server (pre-existing) | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE -DDN EXAScaler lustre (pre-existing) | via USE | via USE | via USE | via USE | Needs Testing | via USE | via USE -Intel DAOS (pre-existing) | Planned Development | Planned Development | Planned Development | Planned Development | Planned Development | Planned Development | Planned Development -GCS FUSE (pre-existing) | via USE | via USE | via USE | via USE | via STARTUP | via USE | Needs Testing +  | Slurm V5 | Batch | vm-instance | Packer (client install) | HTCondor\* | PBS Pro\* +-- | -- | -- | -- | -- | -- | -- +filestore | via USE | via USE | via USE | via STARTUP | via USE | via USE +nfs-server | via USE | via USE | via USE | via STARTUP | via USE | via USE +cloud-storage-bucket (GCS)| via USE | via USE | via USE | via STARTUP | via USE | via USE +DDN EXAScaler lustre | via USE | via USE | via USE | Needs Testing | via USE | via USE +Intel DAOS** | Needs Testing | Needs Testing | Needs Testing | Needs Testing | Needs Testing | Needs Testing +  |   |   |   |   |   |   +filestore (pre-existing) | via USE | via USE | via USE | via STARTUP | via USE | via USE +nfs-server (pre-existing) | via USE | via USE | via USE | via STARTUP | via USE | via USE +DDN EXAScaler lustre (pre-existing) | via USE | via USE | via USE | Needs Testing | via USE | via USE +Intel DAOS (pre-existing) | Planned Development | Planned Development | Planned Development | Planned Development | Planned Development | Planned Development +GCS FUSE (pre-existing) | via USE | via USE | via USE | via STARTUP | via USE | Needs Testing - **via USE:** Client installation and mounting occur automatically when connected with the use field. See From 278b2d3486e42bd6af69776bbbbcb99ccbc604f8 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 2 Feb 2024 12:59:25 -0800 Subject: [PATCH 118/151] * Add function `Dict.Keys` to differentiate places that don't care about values. 
(#2194) --- pkg/config/config.go | 2 +- pkg/config/dict.go | 5 +++++ pkg/config/expand.go | 3 +-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index 6e29e579ea..978ff08c60 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -335,7 +335,7 @@ func (bp Blueprint) ListUnusedVariables() []string { } unused := []string{} - for k := range bp.origVars.Items() { + for _, k := range bp.origVars.Keys() { if _, ok := used[k]; !ok { unused = append(unused, k) } diff --git a/pkg/config/dict.go b/pkg/config/dict.go index 48dac765d1..3979d79455 100644 --- a/pkg/config/dict.go +++ b/pkg/config/dict.go @@ -18,6 +18,7 @@ import ( "fmt" "github.com/zclconf/go-cty/cty" + "golang.org/x/exp/maps" ) // Dict maps string key to cty.Value. @@ -76,6 +77,10 @@ func (d *Dict) Items() map[string]cty.Value { return m } +func (d *Dict) Keys() []string { + return maps.Keys(d.m) +} + // AsObject returns Dict as cty.ObjectVal func (d *Dict) AsObject() cty.Value { return cty.ObjectVal(d.Items()) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index c69578fad4..a142df3a00 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -375,8 +375,7 @@ func validateModuleSettingReference(bp Blueprint, mod Module, r Reference) error if r.GlobalVar { if !bp.Vars.Has(r.Name) { err := fmt.Errorf("module %#v references unknown global variable %#v", mod.ID, r.Name) - vars := maps.Keys(bp.Vars.Items()) - return hintSpelling(r.Name, vars, err) + return hintSpelling(r.Name, bp.Vars.Keys(), err) } return nil } From 19a82703bf2209eb4d83fd9a47c522008a8d0802 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 2 Feb 2024 15:24:48 -0800 Subject: [PATCH 119/151] Add shorthand `Reference.AsValue()` (#2195) --- pkg/config/config_test.go | 14 +++++++------- pkg/config/dict_test.go | 2 +- pkg/config/expand.go | 9 +++------ pkg/config/expand_test.go | 8 ++++---- pkg/config/expression.go | 4 ++++ pkg/config/expression_test.go | 4 ++-- pkg/modulewriter/modulewriter_test.go | 6 +++--- pkg/validators/validators.go | 6 +++--- pkg/validators/validators_test.go | 6 +++--- 9 files changed, 30 insertions(+), 29 deletions(-) diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 9cc729f9a3..f03ec3b238 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -225,7 +225,7 @@ func (s *MySuite) getMultiGroupDeploymentConfig() DeploymentConfig { Kind: TerraformKind, Source: testModuleSource0, Settings: NewDict(map[string]cty.Value{ - altProjectIDSetting: GlobalRef("project_id").AsExpression().AsValue(), + altProjectIDSetting: GlobalRef("project_id").AsValue(), }), Outputs: []modulereader.OutputInfo{ {Name: matchingIntergroupName}, @@ -239,7 +239,7 @@ func (s *MySuite) getMultiGroupDeploymentConfig() DeploymentConfig { Source: testModuleSource1, Settings: NewDict(map[string]cty.Value{ matchingIntragroupName1: cty.StringVal("explicit-intra-value"), - matchingIntragroupName2: ModuleRef(mod0.ID, matchingIntragroupName2).AsExpression().AsValue(), + matchingIntragroupName2: ModuleRef(mod0.ID, matchingIntragroupName2).AsValue(), }), Use: ModuleIDs{mod0.ID}, } @@ -354,12 +354,12 @@ func (s *zeroSuite) TestListUnusedVariables(c *C) { }), DeploymentGroups: []DeploymentGroup{{Modules: []Module{{ Settings: NewDict(map[string]cty.Value{ - "circus": GlobalRef("pony").AsExpression().AsValue(), + "circus": GlobalRef("pony").AsValue(), }), }}}}, Validators: []Validator{{ Inputs: NewDict(map[string]cty.Value{ - "savannah": 
GlobalRef("zebra").AsExpression().AsValue(), + "savannah": GlobalRef("zebra").AsValue(), })}}} bp.origVars = NewDict(bp.Vars.Items()) @@ -711,7 +711,7 @@ func (s *zeroSuite) TestCheckBackends(c *C) { { // FAIL. Variable in defaults configuration b := TerraformBackend{Type: "gcs"} - b.Configuration.Set("bucket", GlobalRef("trenta").AsExpression().AsValue()) + b.Configuration.Set("bucket", GlobalRef("trenta").AsValue()) c.Check(check(b), NotNil) } @@ -721,7 +721,7 @@ func (s *zeroSuite) TestCheckBackends(c *C) { Set("bucket", cty.StringVal("trenta")). Set("complex", cty.ObjectVal(map[string]cty.Value{ "alpha": cty.StringVal("a"), - "beta": GlobalRef("boba").AsExpression().AsValue(), + "beta": GlobalRef("boba").AsValue(), })) c.Check(check(b), NotNil) } @@ -856,7 +856,7 @@ func (s *zeroSuite) TestValidateModuleSettingReference(c *C) { func (s *zeroSuite) TestValidateModuleSettingReferences(c *C) { m := Module{ID: "m"} - m.Settings.Set("white", GlobalRef("zebra").AsExpression().AsValue()) + m.Settings.Set("white", GlobalRef("zebra").AsValue()) bp := Blueprint{} p := Root.Groups.At(0).Modules.At(0) diff --git a/pkg/config/dict_test.go b/pkg/config/dict_test.go index 9626eea62e..2c6cdcde58 100644 --- a/pkg/config/dict_test.go +++ b/pkg/config/dict_test.go @@ -89,7 +89,7 @@ func TestEval(t *testing.T) { } d := NewDict(map[string]cty.Value{ "abyss": cty.ObjectVal(map[string]cty.Value{ - "white": GlobalRef("zebra").AsExpression().AsValue(), + "white": GlobalRef("zebra").AsValue(), "green": cty.StringVal("grass"), })}) want := NewDict(map[string]cty.Value{ diff --git a/pkg/config/expand.go b/pkg/config/expand.go index a142df3a00..b1748eaed8 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -217,9 +217,7 @@ func useModule(mod *Module, use Module) { continue } - v := AsProductOfModuleUse( - ModuleRef(use.ID, setting).AsExpression().AsValue(), - use.ID) + v := AsProductOfModuleUse(ModuleRef(use.ID, setting).AsValue(), use.ID) if !isList { mod.Settings.Set(setting, v) @@ -279,7 +277,7 @@ func combineModuleLabels(mod *Module, dc DeploymentConfig) { return // no op } - ref := GlobalRef(labels).AsExpression().AsValue() + ref := GlobalRef(labels).AsValue() set := mod.Settings.Get(labels) if !set.IsNull() { @@ -314,8 +312,7 @@ func (bp Blueprint) applyGlobalVarsInModule(mod *Module) { // If it's not set, is there a global we can use? 
if bp.Vars.Has(input.Name) { - ref := GlobalRef(input.Name) - mod.Settings.Set(input.Name, ref.AsExpression().AsValue()) + mod.Settings.Set(input.Name, GlobalRef(input.Name).AsValue()) continue } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 2805fe7750..94c44f9cb8 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -78,7 +78,7 @@ func (s *zeroSuite) TestUseModule(c *C) { Source: "usedSource", } varInfoNumber := modulereader.VarInfo{Name: "val1", Type: cty.Number} - ref := ModuleRef("UsedModule", "val1").AsExpression().AsValue() + ref := ModuleRef("UsedModule", "val1").AsValue() { // Pass: No Inputs, No Outputs mod := Module{ID: "lime", Source: "modSource"} @@ -237,7 +237,7 @@ func (s *MySuite) TestApplyUseModules(c *C) { m := &dc.Config.DeploymentGroups[1].Modules[0] c.Assert(m.Settings, DeepEquals, Dict{}) c.Assert(dc.applyUseModules(), IsNil) - ref := ModuleRef("TestModule0", "test_inter_0").AsExpression().AsValue() + ref := ModuleRef("TestModule0", "test_inter_0").AsValue() c.Assert(m.Settings.Items(), DeepEquals, map[string]cty.Value{ "test_inter_0": AsProductOfModuleUse(ref, "TestModule0")}) } @@ -296,7 +296,7 @@ func (s *zeroSuite) TestCombineLabels(c *C) { "ghpc_deployment": cty.StringVal("golden"), })) - labelsRef := GlobalRef("labels").AsExpression().AsValue() + labelsRef := GlobalRef("labels").AsValue() lime := dc.Config.DeploymentGroups[0] // Labels are set @@ -335,7 +335,7 @@ func (s *MySuite) TestApplyGlobalVariables(c *C) { c.Assert( mod.Settings.Get("gold"), DeepEquals, - GlobalRef("gold").AsExpression().AsValue()) + GlobalRef("gold").AsValue()) } func (s *zeroSuite) TestValidateModuleReference(c *C) { diff --git a/pkg/config/expression.go b/pkg/config/expression.go index d62539d7ac..f5aff598a0 100644 --- a/pkg/config/expression.go +++ b/pkg/config/expression.go @@ -56,6 +56,10 @@ func (r Reference) AsExpression() Expression { return MustParseExpression(fmt.Sprintf("module.%s.%s", r.Module, r.Name)) } +func (r Reference) AsValue() cty.Value { + return r.AsExpression().AsValue() +} + // Takes traversal in "blueprint namespace" (e.g. `vars.zone` or `homefs.mount`) // and transforms it to "terraform namespace" (e.g. `var.zone` or `module.homefs.mount`). 
func bpTraversalToTerraform(t hcl.Traversal) (hcl.Traversal, error) { diff --git a/pkg/config/expression_test.go b/pkg/config/expression_test.go index 39b0219b68..f7b488aa4c 100644 --- a/pkg/config/expression_test.go +++ b/pkg/config/expression_test.go @@ -190,7 +190,7 @@ func TestFlattenFunctionCallExpression(t *testing.T) { })} expr := FunctionCallExpression("flatten", cty.TupleVal([]cty.Value{ cty.TupleVal([]cty.Value{cty.NumberIntVal(1), cty.NumberIntVal(2)}), - GlobalRef("three").AsExpression().AsValue(), + GlobalRef("three").AsValue(), })) want := cty.TupleVal([]cty.Value{ @@ -218,7 +218,7 @@ func TestMergeFunctionCallExpression(t *testing.T) { "one": cty.NumberIntVal(1), "two": cty.NumberIntVal(3), }), - GlobalRef("fix").AsExpression().AsValue(), + GlobalRef("fix").AsValue(), ) want := cty.ObjectVal(map[string]cty.Value{ diff --git a/pkg/modulewriter/modulewriter_test.go b/pkg/modulewriter/modulewriter_test.go index 115a1eaa6c..af4031c75d 100644 --- a/pkg/modulewriter/modulewriter_test.go +++ b/pkg/modulewriter/modulewriter_test.go @@ -466,9 +466,9 @@ func (s *MySuite) TestWriteDeploymentGroup_PackerWriter(c *C) { Kind: config.PackerKind, ID: "prince", Settings: config.NewDict(map[string]cty.Value{ - "zebra": cty.StringVal("checker"), // const - "salmon": config.GlobalRef("golf").AsExpression().AsValue(), // var - "bear": config.Reference{Module: otherMod.ID, Name: "rome"}.AsExpression().AsValue(), // IGC + "zebra": cty.StringVal("checker"), // const + "salmon": config.GlobalRef("golf").AsValue(), // var + "bear": config.Reference{Module: otherMod.ID, Name: "rome"}.AsValue(), // IGC }), } diff --git a/pkg/validators/validators.go b/pkg/validators/validators.go index 57c39365cb..0b56f76660 100644 --- a/pkg/validators/validators.go +++ b/pkg/validators/validators.go @@ -150,13 +150,13 @@ func inputsAsStrings(inputs config.Dict) (map[string]string, error) { // inspect the blueprint for global variables that exist and add an appropriate validators. 
func defaults(bp config.Blueprint) []config.Validator { projectIDExists := bp.Vars.Has("project_id") - projectRef := config.GlobalRef("project_id").AsExpression().AsValue() + projectRef := config.GlobalRef("project_id").AsValue() regionExists := bp.Vars.Has("region") - regionRef := config.GlobalRef("region").AsExpression().AsValue() + regionRef := config.GlobalRef("region").AsValue() zoneExists := bp.Vars.Has("zone") - zoneRef := config.GlobalRef("zone").AsExpression().AsValue() + zoneRef := config.GlobalRef("zone").AsValue() defaults := []config.Validator{ {Validator: testModuleNotUsedName}, diff --git a/pkg/validators/validators_test.go b/pkg/validators/validators_test.go index 16d46ca519..cbceb71f04 100644 --- a/pkg/validators/validators_test.go +++ b/pkg/validators/validators_test.go @@ -75,9 +75,9 @@ func (s *MySuite) TestDefaultValidators(c *C) { unusedVars := config.Validator{Validator: "test_deployment_variable_not_used"} apisEnabled := config.Validator{Validator: "test_apis_enabled"} - projectRef := config.GlobalRef("project_id").AsExpression().AsValue() - regionRef := config.GlobalRef("region").AsExpression().AsValue() - zoneRef := config.GlobalRef("zone").AsExpression().AsValue() + projectRef := config.GlobalRef("project_id").AsValue() + regionRef := config.GlobalRef("region").AsValue() + zoneRef := config.GlobalRef("zone").AsValue() projectExists := config.Validator{ Validator: testProjectExistsName, From 95ffab35f3de3ee38a26771d32add393e69f66eb Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 2 Feb 2024 17:54:10 -0600 Subject: [PATCH 120/151] Deprecate Dell Omnia module and example blueprint --- community/examples/omnia-cluster.yaml | 2 + examples/README.md | 6 +- modules/README.md | 7 ++- .../cloud-build/daily-tests/builds/omnia.yaml | 55 ------------------- tools/cloud-build/daily-tests/tests/omnia.yml | 26 --------- tools/validate_configs/test_configs/README.md | 5 -- 6 files changed, 10 insertions(+), 91 deletions(-) delete mode 100644 tools/cloud-build/daily-tests/builds/omnia.yaml delete mode 100644 tools/cloud-build/daily-tests/tests/omnia.yml diff --git a/community/examples/omnia-cluster.yaml b/community/examples/omnia-cluster.yaml index 3437955c58..171871f60b 100644 --- a/community/examples/omnia-cluster.yaml +++ b/community/examples/omnia-cluster.yaml @@ -14,6 +14,8 @@ --- +# WARNING: this example has been deprecated as of v1.28.0 of the HPC Toolkit + blueprint_name: omnia-cluster vars: diff --git a/examples/README.md b/examples/README.md index 05da25a443..97cd4d8000 100644 --- a/examples/README.md +++ b/examples/README.md @@ -28,7 +28,6 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [hpc-slurm-sharedvpc.yaml](#hpc-slurm-sharedvpcyaml-) ![community-badge] * [client-google-cloud-storage.yaml](#client-google-cloud-storageyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-gromacs.yaml](#hpc-slurm-gromacsyaml--) ![community-badge] ![experimental-badge] - * [omnia-cluster.yaml](#omnia-clusteryaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-local-ssd.yaml](#hpc-slurm-local-ssdyaml--) ![community-badge] ![experimental-badge] * [hpc-gke.yaml](#hpc-gkeyaml--) ![community-badge] ![experimental-badge] * [ml-gke](#ml-gkeyaml--) ![community-badge] ![experimental-badge] @@ -42,6 +41,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [hpc-slurm-chromedesktop.yaml](#hpc-slurm-chromedesktopyaml--) ![community-badge] ![experimental-badge] * [flux-cluster](#flux-clusteryaml--) ![community-badge] ![experimental-badge] * 
[tutorial-fluent.yaml](#tutorial-fluentyaml--) ![community-badge] ![experimental-badge] + * [omnia-cluster.yaml](#omnia-clusteryaml---) ![community-badge] ![experimental-badge] ![deprecated-badge] * [Blueprint Schema](#blueprint-schema) * [Writing an HPC Blueprint](#writing-an-hpc-blueprint) * [Blueprint Boilerplate](#blueprint-boilerplate) @@ -787,7 +787,9 @@ node scaling study of the Lignocellulose benchmark for Gromacs. [hpc-slurm-ramble-gromacs.yaml]: ../community/examples/hpc-slurm-ramble-gromacs.yaml -### [omnia-cluster.yaml] ![community-badge] ![experimental-badge] +### [omnia-cluster.yaml] ![community-badge] ![experimental-badge] ![deprecated-badge] + +_This blueprint has been deprecated and will be removed on August 1, 2024._ Creates a simple [Dell Omnia][omnia-github] provisioned cluster with an omnia-manager node that acts as the slurm manager and 2 omnia-compute nodes on diff --git a/modules/README.md b/modules/README.md index 1d1233ad81..12ddaa4557 100644 --- a/modules/README.md +++ b/modules/README.md @@ -225,9 +225,10 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca a startup script to install HTCondor and exports a list of required APIs * **[kubernetes-operations]** ![community-badge] ![experimental-badge] : Performs pre-defined operations on Kubernetes resources. -* **[omnia-install]** ![community-badge] ![experimental-badge] : Installs Slurm - via [Dell Omnia](https://github.com/dellhpc/omnia) onto a cluster of VMs - instances. +* **[omnia-install]** ![community-badge] ![experimental-badge] ![deprecated-badge] : + Installs Slurm via [Dell Omnia](https://github.com/dellhpc/omnia) onto a + cluster of VM instances. _This module has been deprecated and will be removed + on August 1, 2024_. * **[pbspro-preinstall]** ![community-badge] ![experimental-badge] : Creates a Cloud Storage bucket with PBS Pro RPM packages for use by PBS clusters. * **[pbspro-install]** ![community-badge] ![experimental-badge] : Creates a diff --git a/tools/cloud-build/daily-tests/builds/omnia.yaml b/tools/cloud-build/daily-tests/builds/omnia.yaml deleted file mode 100644 index ac112d3daa..0000000000 --- a/tools/cloud-build/daily-tests/builds/omnia.yaml +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -timeout: 14400s # 4hr -steps: -## Test simple golang build -- id: build_ghpc - waitFor: ["-"] - name: "golang:bullseye" - entrypoint: /bin/bash - args: - - -c - - | - cd /workspace - make -- id: fetch_builder - waitFor: ["-"] - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - -c - - echo "done fetching builder" - -## Test Omnia Example -- id: omnia - waitFor: ["fetch_builder", "build_ghpc"] - name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - env: - - "ANSIBLE_HOST_KEY_CHECKING=false" - - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - args: - - -c - - | - set -x -e - BUILD_ID_FULL=$BUILD_ID - BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - OMNIA_EXAMPLE=community/examples/omnia-cluster.yaml - - ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/omnia.yml" diff --git a/tools/cloud-build/daily-tests/tests/omnia.yml b/tools/cloud-build/daily-tests/tests/omnia.yml deleted file mode 100644 index dfb3d2039e..0000000000 --- a/tools/cloud-build/daily-tests/tests/omnia.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -test_name: omnia-cluster -deployment_name: "omnia-{{ build }}" -zone: us-west3-c -workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/omnia-cluster.yaml" -network: "default" -remote_node: "*omnia-manager-0" -post_deploy_tests: [] -cli_deployment_vars: - machine_type: "c2-standard-4" diff --git a/tools/validate_configs/test_configs/README.md b/tools/validate_configs/test_configs/README.md index cabaa06f23..7b18fad427 100644 --- a/tools/validate_configs/test_configs/README.md +++ b/tools/validate_configs/test_configs/README.md @@ -29,11 +29,6 @@ head-node and 2 compute vms SLURM partitions and primarily default settings. The blueprint also creates a new VPC network, a filestore instance mounted to `/home` and a workstation VM. -**omnia-cluster-simple.yaml**: Creates a SLURM cluster using -[DellHPC Omnia](https://github.com/dellhpc/omnia). The cluster is comprised of -one manager node and eight compute nodes that share a `/home` mounted filestore -instance. The pre-existing default VPC network is used. - **instance_with_startup.yaml**: Creates a simple cluster with one vm-instance and filestore using the startup-script module to setup and mount the filestore instance. 
From aed7395bd4969408cdb284c814da69528abbe9e5 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 2 Feb 2024 17:59:16 -0800 Subject: [PATCH 121/151] Change mode of maintenance.py so that it can be executed as the description suggests --- tools/maintenance/maintenance.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tools/maintenance/maintenance.py diff --git a/tools/maintenance/maintenance.py b/tools/maintenance/maintenance.py old mode 100644 new mode 100755 From 7620511dacbe477c738ebd70fbce1485320bbfdf Mon Sep 17 00:00:00 2001 From: Aaron Golden Date: Sat, 3 Feb 2024 00:32:23 +0000 Subject: [PATCH 122/151] Change batch-job-base template from JSON to YAML Batch supports YAML job configurations now so we can use YAML everywhere instead of JSON, which will hopefully make some of the conditional syntax in the templates easier to manage. --- modules/scheduler/batch-job-template/main.tf | 4 +- .../templates/batch-job-base.json.tftpl | 49 ------------------- .../templates/batch-job-base.yaml.tftpl | 45 +++++++++++++++++ 3 files changed, 47 insertions(+), 51 deletions(-) delete mode 100644 modules/scheduler/batch-job-template/templates/batch-job-base.json.tftpl create mode 100644 modules/scheduler/batch-job-template/templates/batch-job-base.yaml.tftpl diff --git a/modules/scheduler/batch-job-template/main.tf b/modules/scheduler/batch-job-template/main.tf index 50d93462ae..f4e562b9f8 100644 --- a/modules/scheduler/batch-job-template/main.tf +++ b/modules/scheduler/batch-job-template/main.tf @@ -25,7 +25,7 @@ locals { tasks_per_node = var.task_count_per_node != null ? var.task_count_per_node : (var.mpi_mode ? 1 : null) job_template_contents = templatefile( - "${path.module}/templates/batch-job-base.json.tftpl", + "${path.module}/templates/batch-job-base.yaml.tftpl", { synchronized = var.mpi_mode runnable = var.runnable @@ -40,7 +40,7 @@ ) job_id = var.job_id != null ? var.job_id : var.deployment_name - job_filename = var.job_filename != null ? var.job_filename : "cloud-batch-${local.job_id}.json" + job_filename = var.job_filename != null ? var.job_filename : "cloud-batch-${local.job_id}.yaml" job_template_output_path = "${path.root}/${local.job_filename}" subnetwork_name = var.subnetwork != null ? 
var.subnetwork.name : "default" diff --git a/modules/scheduler/batch-job-template/templates/batch-job-base.json.tftpl b/modules/scheduler/batch-job-template/templates/batch-job-base.json.tftpl deleted file mode 100644 index cb30e5abbf..0000000000 --- a/modules/scheduler/batch-job-template/templates/batch-job-base.json.tftpl +++ /dev/null @@ -1,49 +0,0 @@ -{ - "taskGroups": [{ - "taskSpec": { - "runnables": [%{ if synchronized } - { - "barrier": {} - },%{ endif } - { - "script": { - "text": ${jsonencode(runnable)} - } - }%{ if synchronized }, - { - "barrier": {} - }%{ endif } - ], - "volumes":[ - %{~ for index, vol in nfs_volumes ~} - { - "nfs":{ - "server":"${vol.server_ip}", - "remote_path": "${vol.remote_mount}" - }, - %{~ if vol.mount_options != "" && vol.mount_options != null ~} - "mount_options": "${vol.mount_options}", - %{~ endif ~} - "mount_path": "${vol.local_mount}" - }%{~ if index != (length(nfs_volumes) -1) },%{ endif } - %{~ endfor ~} - ] - }, - "taskCount":${task_count},%{ if tasks_per_node != null } - "taskCountPerNode": ${tasks_per_node},%{ endif } - "requireHostsFile": ${require_hosts_file}, - "permissiveSsh": ${permissive_ssh} - }]%{ if instance_template != null }, - "allocationPolicy": { - "instances": [{ - "instanceTemplate": "${instance_template}" - }] - }%{ endif }%{ if log_policy == "CLOUD_LOGGING" }, - "logsPolicy": { - "destination": "CLOUD_LOGGING" - }%{ endif }%{ if log_policy == "PATH" }, - "logsPolicy": { - "destination": "PATH", - "logsPath": ## Add logging path here - }%{ endif } -} diff --git a/modules/scheduler/batch-job-template/templates/batch-job-base.yaml.tftpl b/modules/scheduler/batch-job-template/templates/batch-job-base.yaml.tftpl new file mode 100644 index 0000000000..52ed886992 --- /dev/null +++ b/modules/scheduler/batch-job-template/templates/batch-job-base.yaml.tftpl @@ -0,0 +1,45 @@ +taskGroups: + - taskSpec: + runnables: + %{~ if synchronized ~} + - barrier: + name: "wait-for-node-startup" + %{~ endif ~} + - script: + text: ${jsonencode(runnable)} + %{~ if synchronized ~} + - barrier: + name: "wait-for-workload-to-complete" + %{~ endif ~} + %{~ if length(nfs_volumes) > 0 ~} + volumes: + %{~ for index, vol in nfs_volumes ~} + - nfs: + server: "${vol.server_ip}" + remotePath: "${vol.remote_mount}" + %{~ if vol.mount_options != "" && vol.mount_options != null ~} + mountOptions: "${vol.mount_options}" + %{~ endif ~} + mountPath: "${vol.local_mount}" + %{~ endfor ~} + %{~ endif ~} + taskCount: ${task_count} + %{~ if tasks_per_node != null ~} + taskCountPerNode: ${tasks_per_node} + %{~ endif ~} + requireHostsFile: ${require_hosts_file} + permissiveSsh: ${permissive_ssh} +%{~ if instance_template != null } +allocationPolicy: + instances: + - instanceTemplate: "${instance_template}" +%{~ endif } +%{~ if log_policy == "CLOUD_LOGGING" } +logsPolicy: + destination: "CLOUD_LOGGING" +%{ endif } +%{~ if log_policy == "PATH" } +logsPolicy: + destination: "PATH" + logsPath: ## Add logging path here +%{ endif } From 9ab5770084c3378f1d0297d1d9b3297fd53bc2b8 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 3 Feb 2024 12:10:38 -0800 Subject: [PATCH 123/151] Move topological ordering of vars into separate function. 
(#2190) **Motivation:** To be reused in other places --- pkg/config/config.go | 54 ++++++++++++++++++++++++++------------- pkg/config/config_test.go | 15 +++++++++++ 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index 978ff08c60..2f34c51848 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -671,45 +671,63 @@ func checkPackerGroups(groups []DeploymentGroup) error { return errs.OrNil() } -func (bp *Blueprint) evalVars() (Dict, error) { - // 0 - unvisited - // 1 - on stack - // 2 - done - used := map[string]int{} - res := Dict{} +func varsTopologicalOrder(vars Dict) ([]string, error) { + // 0, 1, 2 - unvisited, on stack, exited + used := map[string]int{} // default is 0 - unvisited + res := []string{} - // walk vars in reverse topological order, and evaluate them + // walk vars in reverse topological order var dfs func(string) error dfs = func(n string) error { used[n] = 1 // put on stack - v := bp.Vars.Get(n) + v := vars.Get(n) for ref, rp := range valueReferences(v) { - p := Root.Vars.Dot(n).Cty(rp) + // TODO: instead of ref.Name render as a full reference + repr, p := ref.Name, Root.Vars.Dot(n).Cty(rp) + if !ref.GlobalVar { - return BpError{p, fmt.Errorf("non-global variable %q referenced in expression", ref.Name)} + return BpError{p, fmt.Errorf("non-global variable %q referenced in expression", repr)} } + if used[ref.Name] == 1 { - return BpError{p, fmt.Errorf("cyclic dependency detected: %q -> %q", n, ref.Name)} + return BpError{p, fmt.Errorf("cyclic dependency detected: %q -> %q", n, repr)} } + if used[ref.Name] == 0 { if err := dfs(ref.Name); err != nil { return err } } } - - used[n] = 2 // remove from stack and evaluate - ev, err := evalValue(v, Blueprint{Vars: res}) - res.Set(n, ev) - return err + used[n] = 2 // remove from stack and add to result + res = append(res, n) + return nil } - for n := range bp.Vars.Items() { + for n := range vars.Items() { if used[n] == 0 { // unvisited if err := dfs(n); err != nil { - return Dict{}, err + return nil, err } } } return res, nil } + +func (bp *Blueprint) evalVars() (Dict, error) { + order, err := varsTopologicalOrder(bp.Vars) + if err != nil { + return Dict{}, err + } + + res := Dict{} + for _, n := range order { + v := bp.Vars.Get(n) + ev, err := evalValue(v, Blueprint{Vars: res}) + if err != nil { + return Dict{}, BpError{Root.Vars.Dot(n), err} + } + res.Set(n, ev) + } + return res, nil +} diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index f03ec3b238..8e4902050d 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -940,4 +940,19 @@ func (s *zeroSuite) TestEvalVars(c *C) { c.Error(err, " should be BpError") } } + + { // Non-computable + vars := NewDict(map[string]cty.Value{ + "uro": MustParseExpression("DoesHalt(var.bo)").AsValue(), + "bo": cty.StringVal("01_10"), + }) + _, err := (&Blueprint{Vars: vars}).evalVars() + var berr BpError + if errors.As(err, &berr) { + c.Check(berr.Error(), Matches, ".*no function.*DoesHalt.*") + c.Check(berr.Path.String(), Equals, "vars.uro") + } else { + c.Error(err, " should be BpError") + } + } } From 658e0f173c3c942693daed5977d1f76af2921e9c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Feb 2024 11:16:00 +0000 Subject: [PATCH 124/151] Bump google.golang.org/api from 0.159.0 to 0.161.0 Bumps [google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.159.0 to 0.161.0. 
- [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.159.0...v0.161.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index c5177ae140..3cb98ce4eb 100644 --- a/go.mod +++ b/go.mod @@ -28,7 +28,7 @@ require ( github.com/hashicorp/terraform-exec v0.20.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b - google.golang.org/api v0.159.0 + google.golang.org/api v0.161.0 ) require ( diff --git a/go.sum b/go.sum index a4b640b803..256e294865 100644 --- a/go.sum +++ b/go.sum @@ -880,8 +880,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.159.0 h1:fVTj+7HHiUYz4JEZCHHoRIeQX7h5FMzrA2RF/DzDdbs= -google.golang.org/api v0.159.0/go.mod h1:0mu0TpK33qnydLvWqbImq2b1eQ5FHRSDCBzAxX9ZHyw= +google.golang.org/api v0.161.0 h1:oYzk/bs26WN10AV7iU7MVJVXBH8oCPS2hHyBiEeFoSU= +google.golang.org/api v0.161.0/go.mod h1:0mu0TpK33qnydLvWqbImq2b1eQ5FHRSDCBzAxX9ZHyw= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= From 8ee645c78eea870a93364d3f4e6d936f5063491c Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 5 Feb 2024 07:37:53 -0600 Subject: [PATCH 125/151] Update Slurm-GCP release to 5.10.2 --- community/examples/AMD/hpc-amd-slurm.yaml | 2 +- .../examples/hpc-slurm-chromedesktop.yaml | 5 ++-- community/examples/hpc-slurm-local-ssd.yaml | 1 + .../examples/hpc-slurm-ramble-gromacs.yaml | 1 + community/examples/hpc-slurm-ubuntu2004.yaml | 5 ++-- community/examples/htc-slurm.yaml | 3 +- .../examples/tutorial-starccm-slurm.yaml | 1 + .../schedmd-slurm-gcp-v5-node-group/README.md | 7 +++-- .../schedmd-slurm-gcp-v5-node-group/main.tf | 1 + .../source_image_logic.tf | 12 ++++---- .../variables.tf | 16 ++++++++-- .../README.md | 6 ++-- .../main.tf | 2 +- .../variables.tf | 2 +- .../schedmd-slurm-gcp-v5-partition/README.md | 8 ++--- .../schedmd-slurm-gcp-v5-partition/main.tf | 2 +- .../variables.tf | 19 ++++++------ .../schedmd-slurm-gcp-v6-nodeset/README.md | 4 +-- .../schedmd-slurm-gcp-v6-partition/README.md | 4 +-- .../schedmd-slurm-gcp-v5-controller/README.md | 30 +++++++++---------- .../schedmd-slurm-gcp-v5-controller/main.tf | 4 +-- .../source_image_logic.tf | 12 ++++---- .../variables.tf | 9 +++--- .../schedmd-slurm-gcp-v5-hybrid/README.md | 14 ++++----- .../schedmd-slurm-gcp-v5-hybrid/main.tf | 2 +- .../schedmd-slurm-gcp-v5-login/README.md | 16 +++++----- .../schedmd-slurm-gcp-v5-login/main.tf | 4 +-- .../source_image_logic.tf | 12 ++++---- .../schedmd-slurm-gcp-v5-login/variables.tf | 4 +-- .../schedmd-slurm-gcp-v6-controller/README.md | 12 ++++---- .../schedmd-slurm-gcp-v6-login/README.md | 10 +++---- 
docs/gpu-support.md | 2 +- ...demo-with-cloud-controller-instructions.md | 2 +- .../deploy-instructions.md | 4 +-- .../on-prem-instructions.md | 20 ++++++------- docs/image-building.md | 6 ++-- docs/vm-images.md | 4 +-- examples/README.md | 8 ++--- examples/cae/cae-slurm.yaml | 7 +++-- examples/hpc-enterprise-slurm.yaml | 9 ++---- examples/image-builder.yaml | 5 ++-- examples/ml-slurm.yaml | 5 ++-- modules/README.md | 4 +-- .../daily-tests/blueprints/lustre-slurm.yaml | 4 +-- .../daily-tests/tests/slurm-v5-debian.yml | 2 +- .../daily-tests/tests/slurm-v5-rocky8.yml | 2 +- .../slurm-filestore.yaml | 10 +++---- .../os_compatibility_tests/slurm-lustre.yaml | 8 ++--- .../os_compatibility_tests/slurm-startup.yaml | 10 +++---- .../test_configs/node-groups.yaml | 5 ++-- .../test_configs/slurm-static-test.yaml | 10 +++---- 51 files changed, 190 insertions(+), 167 deletions(-) diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index 4f68b4de41..3a52e74d42 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -171,7 +171,7 @@ deployment_groups: # these images must match the images used by Slurm modules below because # we are building OpenMPI with PMI support in libraries contained in # Slurm installation - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public - id: low_cost_node_group diff --git a/community/examples/hpc-slurm-chromedesktop.yaml b/community/examples/hpc-slurm-chromedesktop.yaml index 8e6b816cb2..0e1a9c6e36 100644 --- a/community/examples/hpc-slurm-chromedesktop.yaml +++ b/community/examples/hpc-slurm-chromedesktop.yaml @@ -17,15 +17,16 @@ blueprint_name: slurm-crd vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: slurm-crd-01 region: us-central1 zone: us-central1-c instance_image_crd: - family: slurm-gcp-5-9-debian-11 + family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image: - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public # Documentation for each of the modules used below can be found at diff --git a/community/examples/hpc-slurm-local-ssd.yaml b/community/examples/hpc-slurm-local-ssd.yaml index c8b18d1f8f..e3ebcacc56 100644 --- a/community/examples/hpc-slurm-local-ssd.yaml +++ b/community/examples/hpc-slurm-local-ssd.yaml @@ -17,6 +17,7 @@ blueprint_name: hpc-slurm-local-ssd vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: hpc-localssd region: us-central1 diff --git a/community/examples/hpc-slurm-ramble-gromacs.yaml b/community/examples/hpc-slurm-ramble-gromacs.yaml index 15e6577c95..7efb91079d 100644 --- a/community/examples/hpc-slurm-ramble-gromacs.yaml +++ b/community/examples/hpc-slurm-ramble-gromacs.yaml @@ -17,6 +17,7 @@ blueprint_name: hpc-slurm-ramble-gromacs vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: hpc-slurm-ramble-gromacs region: us-central1 diff --git a/community/examples/hpc-slurm-ubuntu2004.yaml b/community/examples/hpc-slurm-ubuntu2004.yaml index 261376e816..ae2deeb205 100644 --- a/community/examples/hpc-slurm-ubuntu2004.yaml +++ b/community/examples/hpc-slurm-ubuntu2004.yaml @@ -17,14 +17,15 @@ blueprint_name: hpc-slurm-ubuntu2004 vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: slurm-gcp-v5 region: us-west4 zone: us-west4-c instance_image: # Please refer to the following link for the latest 
images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - family: slurm-gcp-5-9-ubuntu-2004-lts + # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + family: slurm-gcp-5-10-ubuntu-2004-lts project: schedmd-slurm-public instance_image_custom: true diff --git a/community/examples/htc-slurm.yaml b/community/examples/htc-slurm.yaml index 554448b115..53a1afc833 100644 --- a/community/examples/htc-slurm.yaml +++ b/community/examples/htc-slurm.yaml @@ -17,12 +17,13 @@ # This blueprint provisions a cluster using the Slurm scheduler configured to # efficiently run many short duration, loosely-coupled (non-MPI) jobs. See also: -# https://github.com/SchedMD/slurm-gcp/blob/master/docs/htc.md +# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md # https://slurm.schedmd.com/high_throughput.html blueprint_name: htc-slurm vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: htc-slurm region: us-west4 diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml index db18855352..e450f59d21 100644 --- a/community/examples/tutorial-starccm-slurm.yaml +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -17,6 +17,7 @@ blueprint_name: starccm-on-slurm vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: starccm-slurm region: us-central1 diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md index 7d9387f3aa..d37f6d947a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md @@ -72,8 +72,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License @@ -136,11 +136,12 @@ No modules. | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-9-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-10-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | +| [maintenance\_interval](#input\_maintenance\_interval) | Specifies the frequency of planned maintenance events. Must be unset (null) or "PERIODIC". | `string` | `null` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | | [name](#input\_name) | Name of the node group. | `string` | `"ghpc"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf index a382a4232a..825f3c0a4a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf @@ -58,6 +58,7 @@ locals { gpu = one(local.guest_accelerator) labels = local.labels machine_type = var.machine_type + maintenance_interval = var.maintenance_interval metadata = var.metadata min_cpu_platform = var.min_cpu_platform on_host_maintenance = var.on_host_maintenance diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf index 3acb583f3b..ddcb1ff6ee 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-5-9-debian-11", - "slurm-gcp-5-9-hpc-rocky-linux-8", - "slurm-gcp-5-9-ubuntu-2004-lts", - "slurm-gcp-5-9-ubuntu-2204-lts-arm64", - "slurm-gcp-5-9-hpc-centos-7-k80", - "slurm-gcp-5-9-hpc-centos-7" + "slurm-gcp-5-10-debian-11", + "slurm-gcp-5-10-hpc-rocky-linux-8", + "slurm-gcp-5-10-ubuntu-2004-lts", + "slurm-gcp-5-10-ubuntu-2204-lts-arm64", + "slurm-gcp-5-10-hpc-centos-7-k80", + "slurm-gcp-5-10-hpc-centos-7" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf index 2a38a2e64b..bbf3848b43 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "project_id" { description = "Project in which the HPC deployment will be created." @@ -96,7 +96,7 @@ variable "instance_image" { type = map(string) default = { project = "schedmd-slurm-public" - family = "slurm-gcp-5-9-hpc-centos-7" + family = "slurm-gcp-5-10-hpc-centos-7" } validation { @@ -413,6 +413,18 @@ variable "additional_networks" { })) } +variable "maintenance_interval" { + description = "Specifies the frequency of planned maintenance events. Must be unset (null) or \"PERIODIC\"." 
+ default = null + type = string + nullable = true + + validation { + condition = var.maintenance_interval == null || var.maintenance_interval == "PERIODIC" + error_message = "var.maintenance_interval must be unset (null) or set to \"PERIODIC\"" + } +} + variable "disable_public_ips" { description = "If set to false. The node group VMs will have a random public IP assigned to it. Ignored if access_config is set." type = bool diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md index de0dbdb267..51e49f42d6 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md @@ -35,8 +35,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License @@ -69,7 +69,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.9.1 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.10.2 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf index 5bf9b93c91..6483eb2e0c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf @@ -29,7 +29,7 @@ locals { } module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.10.2" slurm_cluster_name = local.slurm_cluster_name enable_job_exclusive = var.exclusive diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf index 55d82b07d1..137023ee26 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "deployment_name" { description = "Name of the deployment." 
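Since the new `maintenance_interval` input validates to either `null` (unset) or the string `"PERIODIC"`, opting in from a blueprint is a one-line setting on the node group. A hedged sketch, assuming a hypothetical module id; this snippet is illustrative, not taken from the patch:

```yaml
  # Hypothetical blueprint fragment exercising the new input on the v5 node group.
  - id: compute_node_group
    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
    settings:
      # Only "PERIODIC" (or leaving the field unset) passes the validation
      # added in variables.tf above; any other string fails at plan time.
      maintenance_interval: PERIODIC
```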
diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index ca57874c31..ba8af335f8 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -110,8 +110,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License @@ -146,7 +146,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.9.1 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.10.2 | ## Resources @@ -164,7 +164,7 @@ limitations under the License. | [exclusive](#input\_exclusive) | Exclusive job access to nodes. | `bool` | `true` | no | | [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.
If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on the partition compute nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [node\_groups](#input\_node\_groups) | A list of node groups associated with this partition. See
schedmd-slurm-gcp-v5-node-group for more information on defining a node
group in a blueprint. |
list(object({
node_count_static = number
node_count_dynamic_max = number
group_name = string
node_conf = map(string)
access_config = list(object({
nat_ip = string
network_tier = string
}))
additional_disks = list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
additional_networks = list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
bandwidth_tier = string
can_ip_forward = bool
disable_smt = bool
disk_auto_delete = bool
disk_labels = map(string)
disk_size_gb = number
disk_type = string
enable_confidential_vm = bool
enable_oslogin = bool
enable_shielded_vm = bool
enable_spot_vm = bool
gpu = object({
count = number
type = string
})
instance_template = string
labels = map(string)
machine_type = string
metadata = map(string)
min_cpu_platform = string
on_host_maintenance = string
preemptible = bool
reservation_name = string
service_account = object({
email = string
scopes = list(string)
})
shielded_instance_config = object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
spot_instance_config = object({
termination_action = string
})
source_image_family = string
source_image_project = string
source_image = string
tags = list(string)
}))
| `[]` | no | +| [node\_groups](#input\_node\_groups) | A list of node groups associated with this partition. See
schedmd-slurm-gcp-v5-node-group for more information on defining a node
group in a blueprint. |
list(object({
node_count_static = number
node_count_dynamic_max = number
group_name = string
node_conf = map(string)
access_config = list(object({
nat_ip = string
network_tier = string
}))
additional_disks = list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
additional_networks = list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
bandwidth_tier = string
can_ip_forward = bool
disable_smt = bool
disk_auto_delete = bool
disk_labels = map(string)
disk_size_gb = number
disk_type = string
enable_confidential_vm = bool
enable_oslogin = bool
enable_shielded_vm = bool
enable_spot_vm = bool
gpu = object({
count = number
type = string
})
instance_template = string
labels = map(string)
machine_type = string
maintenance_interval = string
metadata = map(string)
min_cpu_platform = string
on_host_maintenance = string
preemptible = bool
reservation_name = string
service_account = object({
email = string
scopes = list(string)
})
shielded_instance_config = object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
spot_instance_config = object({
termination_action = string
})
source_image_family = string
source_image_project = string
source_image = string
tags = list(string)
}))
| `[]` | no | | [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no | | [partition\_name](#input\_partition\_name) | The name of the slurm partition. | `string` | n/a | yes | | [partition\_startup\_scripts\_timeout](#input\_partition\_startup\_scripts\_timeout) | The timeout (seconds) applied to the partition startup script. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf index 643e4f3ac1..80f6b7a6eb 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf @@ -38,7 +38,7 @@ data "google_compute_zones" "available" { } module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.10.2" slurm_cluster_name = local.slurm_cluster_name partition_nodes = var.node_groups diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf index 698dbd5c60..7c06a1edb5 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "deployment_name" { description = "Name of the deployment." @@ -240,14 +240,15 @@ variable "node_groups" { count = number type = string }) - instance_template = string - labels = map(string) - machine_type = string - metadata = map(string) - min_cpu_platform = string - on_host_maintenance = string - preemptible = bool - reservation_name = string + instance_template = string + labels = map(string) + machine_type = string + maintenance_interval = string + metadata = map(string) + min_cpu_platform = string + on_host_maintenance = string + preemptible = bool + reservation_name = string service_account = object({ email = string scopes = list(string) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index d7f0ee21b0..ee0ab788ae 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -122,8 +122,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## Requirements diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md index 523bf0d997..00731800cf 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md @@ -56,8 +56,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. 
-[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## Requirements diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 213c12975e..34e96043f6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -17,14 +17,14 @@ controller for optimal performance at different scales. > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.9.1/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.10.2/scripts/requirements.txt > ``` -[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm\_controller\_instance]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. -[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/scripts/requirements.txt [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions [enable\_reconfigure]: #input\_enable\_reconfigure @@ -94,12 +94,12 @@ This option has some additional requirements: development environment deploying the cluster. One can use following commands: ```bash - pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.9.1/scripts/requirements.txt + pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.10.2/scripts/requirements.txt ``` For more information, see the [description][optdeps] of this module. -[optdeps]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster#optional +[optdeps]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster#optional ## Custom Images @@ -178,8 +178,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License @@ -215,8 +215,8 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.9.1 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.9.1 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.10.2 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.10.2 | ## Resources @@ -248,8 +248,8 @@ limitations under the License. | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-ssd"` | no | | [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enable loading of cluster job usage into big query. | `bool` | `false` | no | -| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/SchedMD/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | -| [enable\_cleanup\_subscriptions](#input\_enable\_cleanup\_subscriptions) | Enables automatic cleanup of pub/sub subscriptions managed by this module, when
cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/SchedMD/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may temporarily impact var.enable\_reconfigure behavior. | `bool` | `false` | no | +| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module when the cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | +| [enable\_cleanup\_subscriptions](#input\_enable\_cleanup\_subscriptions) | Enables automatic cleanup of pub/sub subscriptions managed by this module, when
the cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may temporarily impact var.enable\_reconfigure behavior. | `bool` | `false` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | @@ -259,7 +259,7 @@ limitations under the License. | [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-9-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-10-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | @@ -271,7 +271,7 @@ limitations under the License. | [network\_self\_link](#input\_network\_self\_link) | Network to deploy to. Either network\_self\_link or subnetwork\_self\_link must be specified. | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | -| [partition](#input\_partition) | Cluster partitions as a list. |
list(object({
compute_list = list(string)
partition = object({
enable_job_exclusive = bool
enable_placement_groups = bool
network_storage = list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
partition_conf = map(string)
partition_feature = string
partition_name = string
partition_nodes = map(object({
access_config = list(object({
network_tier = string
}))
bandwidth_tier = string
node_count_dynamic_max = number
node_count_static = number
enable_spot_vm = bool
group_name = string
instance_template = string
node_conf = map(string)
reservation_name = string
spot_instance_config = object({
termination_action = string
})
}))
partition_startup_scripts_timeout = number
subnetwork = string
zone_policy_allow = list(string)
zone_policy_deny = list(string)
zone_target_shape = string
})
}))
| `[]` | no | +| [partition](#input\_partition) | Cluster partitions as a list. |
list(object({
compute_list = list(string)
partition = object({
enable_job_exclusive = bool
enable_placement_groups = bool
network_storage = list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
partition_conf = map(string)
partition_feature = string
partition_name = string
partition_nodes = map(object({
access_config = list(object({
network_tier = string
}))
bandwidth_tier = string
node_count_dynamic_max = number
node_count_static = number
enable_spot_vm = bool
group_name = string
instance_template = string
maintenance_interval = string
node_conf = map(string)
reservation_name = string
spot_instance_config = object({
termination_action = string
})
}))
partition_startup_scripts_timeout = number
subnetwork = string
zone_policy_allow = list(string)
zone_policy_deny = list(string)
zone_target_shape = string
})
}))
| `[]` | no | | [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | | [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = string
}))
| `[]` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index 834b00240b..fbbc0c0b5c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -55,7 +55,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.10.2" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name @@ -92,7 +92,7 @@ module "slurm_controller_instance" { } module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.10.2" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf index 3acb583f3b..ddcb1ff6ee 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-5-9-debian-11", - "slurm-gcp-5-9-hpc-rocky-linux-8", - "slurm-gcp-5-9-ubuntu-2004-lts", - "slurm-gcp-5-9-ubuntu-2204-lts-arm64", - "slurm-gcp-5-9-hpc-centos-7-k80", - "slurm-gcp-5-9-hpc-centos-7" + "slurm-gcp-5-10-debian-11", + "slurm-gcp-5-10-hpc-rocky-linux-8", + "slurm-gcp-5-10-ubuntu-2004-lts", + "slurm-gcp-5-10-ubuntu-2204-lts-arm64", + "slurm-gcp-5-10-hpc-centos-7-k80", + "slurm-gcp-5-10-hpc-centos-7" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index 9f3c5810ed..27fac71324 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "access_config" { description = "Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet." @@ -214,7 +214,7 @@ variable "enable_cleanup_compute" { placement groups) managed by this module, when cluster is destroyed. NOTE: Requires Python and pip packages listed at the following link: - https://github.com/SchedMD/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt + https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt *WARNING*: Toggling this may impact the running workload. Deployed compute nodes may be destroyed and their jobs will be requeued. @@ -229,7 +229,7 @@ variable "enable_cleanup_subscriptions" { cluster is destroyed. 
NOTE: Requires Python and pip packages listed at the following link: - https://github.com/SchedMD/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt + https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt *WARNING*: Toggling this may temporarily impact var.enable_reconfigure behavior. EOD @@ -413,6 +413,7 @@ variable "partition" { enable_spot_vm = bool group_name = string instance_template = string + maintenance_interval = string node_conf = map(string) reservation_name = string spot_instance_config = object({ @@ -552,7 +553,7 @@ variable "instance_image" { type = map(string) default = { project = "schedmd-slurm-public" - family = "slurm-gcp-5-9-hpc-centos-7" + family = "slurm-gcp-5-10-hpc-centos-7" } validation { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index e50e1baddf..8a897be889 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -38,7 +38,7 @@ manually. This will require addition configuration and verification of permissions. For more information see the [hybrid.md] documentation on [slurm-gcp]. -[slurm-controller-hybrid]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_controller_hybrid +[slurm-controller-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_controller_hybrid > **_NOTE:_** The hybrid module requires the following dependencies to be > installed on the system deploying the module: @@ -58,15 +58,15 @@ permissions. For more information see the [hybrid.md] documentation on [pyyaml]: https://pypi.org/project/PyYAML/ [google-api-python-client]: https://pypi.org/project/google-api-python-client/ [google-cloud-pubsub]: https://pypi.org/project/google-cloud-pubsub/ -[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/scripts/requirements.txt ### Manual Configuration This module *does not* complete the installation of hybrid partitions on your slurm cluster. After deploying, you must follow the steps listed out in the [hybrid.md] documentation under [manual steps]. -[hybrid.md]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/docs/hybrid.md -[manual steps]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/docs/hybrid.md#manual-configurations +[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/docs/hybrid.md +[manual steps]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/docs/hybrid.md#manual-configurations ### Example Usage The hybrid module can be added to a blueprint as follows: @@ -146,10 +146,10 @@ strongly advise only using versions 21 or 22 when using this module. Attempting to use this module with any version older than 21 may lead to unexpected results. 
-[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 [pre-existing-network-storage]: ../../../../modules/file-system/pre-existing-network-storage/ [schedmd-slurm-gcp-v5-partition]: ../../compute/schedmd-slurm-gcp-v5-partition/ -[packer templates]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/packer +[packer templates]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/packer ## License @@ -181,7 +181,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.9.1 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.10.2 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf index a3d30a1f24..411cec0dd0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf @@ -28,7 +28,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.10.2" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 09979f2320..f4d39e56a0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). -[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm\_login\_instance]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -46,8 +46,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2#slurm-on-google-cloud-platform ## License @@ -82,8 +82,8 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.9.1 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.9.1 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.10.2 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.10.2 | ## Resources @@ -113,7 +113,7 @@ limitations under the License. | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm login node VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-9-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm login node VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-10-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf index 214019af31..9888a764d6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf @@ -50,7 +50,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.10.2" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward @@ -88,7 +88,7 @@ module "slurm_login_template" { } module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.9.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.10.2" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf index 3acb583f3b..ddcb1ff6ee 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf @@ -18,12 +18,12 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-5-9-debian-11", - "slurm-gcp-5-9-hpc-rocky-linux-8", - "slurm-gcp-5-9-ubuntu-2004-lts", - "slurm-gcp-5-9-ubuntu-2204-lts-arm64", - "slurm-gcp-5-9-hpc-centos-7-k80", - "slurm-gcp-5-9-hpc-centos-7" + "slurm-gcp-5-10-debian-11", + "slurm-gcp-5-10-hpc-rocky-linux-8", + "slurm-gcp-5-10-ubuntu-2004-lts", + "slurm-gcp-5-10-ubuntu-2204-lts-arm64", + "slurm-gcp-5-10-hpc-centos-7-k80", + "slurm-gcp-5-10-hpc-centos-7" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf index 0b43011968..709df950be 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 variable "project_id" { type = string @@ -296,7 +296,7 @@ variable "instance_image" { type = map(string) default = { project = "schedmd-slurm-public" - family = "slurm-gcp-5-9-hpc-centos-7" + family = "slurm-gcp-5-10-hpc-centos-7" } validation { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index fd7033b56f..230e5661af 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -11,11 +11,11 @@ The [user guide][slurm-ug] provides detailed instructions on customizing and enhancing the Slurm on GCP cluster as well as recommendations 
on configuring the controller for optimal performance at different scales. -[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0 -[slurm\_controller\_instance]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. -[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/6.2.0/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/6.2.0/scripts/requirements.txt [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions [enable\_reconfigure]: #input\_enable\_reconfigure @@ -87,8 +87,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform ## License diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index 682a4c2a68..8fad372646 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). -[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0 -[slurm\_login\_instance]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -52,8 +52,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. 
-[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0 -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0 +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0#slurm-on-google-cloud-platform ## Requirements diff --git a/docs/gpu-support.md b/docs/gpu-support.md index bf600f96dc..c1aa4989a3 100644 --- a/docs/gpu-support.md +++ b/docs/gpu-support.md @@ -132,7 +132,7 @@ information, see the SchedMD documentation: * [srun Documentation](https://slurm.schedmd.com/srun.html) * [sbatch Documentation](https://slurm.schedmd.com/sbatch.html) -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp [cloud_parameters]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/schedmd-slurm-gcp-v5-controller#input_cloud_parameters ## Further Reading diff --git a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md index 0fa3b5595f..5bd753d2e7 100644 --- a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md +++ b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md @@ -22,7 +22,7 @@ for use with an on-premise slurm-cluster. > further testing is done, documentation on applying the hybrid module to > on-premise slurm clusters will be added and expanded. -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 ## Definitions diff --git a/docs/hybrid-slurm-cluster/deploy-instructions.md b/docs/hybrid-slurm-cluster/deploy-instructions.md index 1b3f60a354..ada2606dea 100644 --- a/docs/hybrid-slurm-cluster/deploy-instructions.md +++ b/docs/hybrid-slurm-cluster/deploy-instructions.md @@ -264,8 +264,8 @@ sudo systemctl restart slurmctld If the restart did not succeed, the logs at `/var/log/slurm/slurmctld.log` should point you in the right direction. -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm-gcp-hybrid]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/docs/hybrid.md +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm-gcp-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/docs/hybrid.md [demo-with-cloud-controller-instructions.md]: ./demo-with-cloud-controller-instructions.md ## Validate the Hybrid Cluster diff --git a/docs/hybrid-slurm-cluster/on-prem-instructions.md b/docs/hybrid-slurm-cluster/on-prem-instructions.md index 1ab5f94d4b..037019e887 100644 --- a/docs/hybrid-slurm-cluster/on-prem-instructions.md +++ b/docs/hybrid-slurm-cluster/on-prem-instructions.md @@ -39,9 +39,9 @@ detail, as well as how to customize many of these assumptions to fit your needs. deployments in their [hybrid.md] documentation. 
[hybridmodule]: ../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm\_controller\_hybrid]: https://github.com/SchedMD/slurm-gcp/tree/master/terraform/slurm_cluster/modules/slurm_controller_hybrid -[hybrid.md]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/docs/hybrid.md +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm\_controller\_hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/master/terraform/slurm_cluster/modules/slurm_controller_hybrid +[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/docs/hybrid.md ### NFS Mounts @@ -224,7 +224,7 @@ image created with slurm 21.08.8: node_count_dynamic_max: 20 instance_image: project: $(vars.project_id) - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 - id: compute-partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition @@ -235,12 +235,12 @@ image created with slurm 21.08.8: partition_name: compute ``` -[slurmgcppacker]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/packer -[example.pkrvars.hcl]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1/packer/example.pkrvars.hcl -[slurmversion]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/packer/variables.pkr.hcl#L97 -[`service_account_scopes`]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/packer/variables.pkr.hcl#L166 -[`munge_user`]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/ansible/roles/munge/defaults/main.yml#L17 -[`slurm_user`]: https://github.com/SchedMD/slurm-gcp/blob/5.9.1/ansible/roles/slurm/defaults/main.yml#L31 +[slurmgcppacker]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/packer +[example.pkrvars.hcl]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2/packer/example.pkrvars.hcl +[slurmversion]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/packer/variables.pkr.hcl#L97 +[`service_account_scopes`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/packer/variables.pkr.hcl#L166 +[`munge_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/ansible/roles/munge/defaults/main.yml#L17 +[`slurm_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.10.2/ansible/roles/slurm/defaults/main.yml#L31 ## On Premise Setup diff --git a/docs/image-building.md b/docs/image-building.md index 46ce0064b1..e8ff335f8d 100644 --- a/docs/image-building.md +++ b/docs/image-building.md @@ -15,7 +15,7 @@ operating system and your HPC applications. A typical custom image workflow is: [images]: https://cloud.google.com/compute/docs/images [standard-os]: https://cloud.google.com/compute/docs/images/os-details -[slurm-images]: https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#public-image +[slurm-images]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#public-image ## Examples @@ -154,7 +154,7 @@ a subdirectory. > to Ansible playbooks by a relative path (`../ansible`) that will not be > downloaded. 
-[schedmd-packer]: https://github.com/SchedMD/slurm-gcp/tree/master/packer#readme
+[schedmd-packer]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/master/packer#readme
 
 For example, to address the issue noted above:
 
@@ -168,7 +168,7 @@ deployment_groups:
 - group: packer
   modules:
   - id: custom-image
-    source: github.com/SchedMD/slurm-gcp//packer?ref=5.9.1&depth=1
+    source: github.com/GoogleCloudPlatform/slurm-gcp//packer?ref=5.10.2&depth=1
     kind: packer
     settings:
       use_iap: true
diff --git a/docs/vm-images.md b/docs/vm-images.md
index 8989c43953..89f2d87d05 100644
--- a/docs/vm-images.md
+++ b/docs/vm-images.md
@@ -313,8 +313,8 @@ These instructions apply to the following modules:
 * [schedmd-slurm-gcp-v5-login]
 * [schedmd-slurm-gcp-v5-node-group]
 
-[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/v5
-[slurm-gcp-packer]: https://github.com/SchedMD/slurm-gcp/tree/v5/packer
+[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5
+[slurm-gcp-packer]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5/packer
 [slurm-gcp-images]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md
 [slurm-gcp-published-images]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family
 [gcloud-compute-images]: https://cloud.google.com/sdk/gcloud/reference/compute/images/create
diff --git a/examples/README.md b/examples/README.md
index 97cd4d8000..15fc1bb50f 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -605,11 +605,11 @@ The blueprint contains 3 groups:
 >
 > ```shell
 > # Install Python3 and run
-> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.9.1/scripts/requirements.txt
+> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.10.2/scripts/requirements.txt
 > ```
 
 Similar to the [hpc-slurm.yaml] example, but using Ubuntu 20.04 instead of CentOS 7.
 
-[Other operating systems] are supported by SchedMD for the the Slurm on GCP project and images are listed [here](https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family). Only the examples listed in this page been tested by the Cloud HPC Toolkit team.
+[Other operating systems] are supported by SchedMD for the Slurm on GCP project and images are listed [here](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family). Only the examples listed in this page have been tested by the Cloud HPC Toolkit team.
 
 The cluster will support 2 partitions named `debug` and `compute`. The `debug` partition is the default partition and runs on smaller
@@ -618,7 +618,7 @@ specifying in the `srun` command via the `--partition` flag. The
 `compute` partition runs on compute optimized nodes of type `cs-standard-60`. The
 `compute` partition may require additional quota before using.
 
-[Other operating systems]: https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems
+[Other operating systems]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems
 [hpc-slurm-ubuntu2004.yaml]: ../community/examples/hpc-slurm-ubuntu2004.yaml
 
 #### Quota Requirements for hpc-slurm-ubuntu2004.yaml
@@ -910,7 +910,7 @@ tuned for the execution of many short-duration, loosely-coupled (non-MPI) jobs.
For more information see: -* [Slurm on Google Cloud High Throughput documentation](https://github.com/SchedMD/slurm-gcp/blob/master/docs/htc.md) +* [Slurm on Google Cloud High Throughput documentation](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md) * [General Slurm High Throughput documentation](https://slurm.schedmd.com/high_throughput.html) [htc-slurm.yaml]: ../community/examples/htc-slurm.yaml diff --git a/examples/cae/cae-slurm.yaml b/examples/cae/cae-slurm.yaml index 5da17f2777..ab641f96f6 100644 --- a/examples/cae/cae-slurm.yaml +++ b/examples/cae/cae-slurm.yaml @@ -28,6 +28,7 @@ # blueprint_name: cae-slurm vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: cae-slurm # check here for other regions with H3 deployments: https://cloud.google.com/compute/docs/regions-zones @@ -36,14 +37,14 @@ vars: # zone: europe-west4-b region: us-central1 zone: us-central1-a - # Visit https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family + # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm; note: the image types for the compute nodes # and the Chrome Remote Desktop (CRD) need to have the same Slurm base. instance_image: - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public crd_instance_image: - family: slurm-gcp-5-9-debian-11 # must be Debian for CRD + family: slurm-gcp-5-10-debian-11 # must be Debian for CRD project: schedmd-slurm-public # Documentation for each of the modules used below can be found at diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index fefe24cc2b..fb710ee028 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -17,15 +17,16 @@ blueprint_name: hpc-enterprise-slurm vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: hpc01 region: us-central1 zone: us-central1-a gpu_zones: [us-central1-a, us-central1-b, us-central1-c, us-central1-f] slurm_image: - # Visit https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family + # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public # Set to true for active cluster reconfiguration. # Note that setting this option requires additional dependencies to be installed locally. @@ -89,16 +90,12 @@ deployment_groups: source: modules/file-system/filestore use: [network1] settings: - filestore_tier: BASIC_SSD - size_gb: 2560 # smallest size for BASIC_SSD local_mount: /home - id: projectsfs source: modules/file-system/filestore use: [network1] settings: - filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 # smallest size for HIGH_SCALE_SSD local_mount: /projects # This file system has an associated license cost. 
diff --git a/examples/image-builder.yaml b/examples/image-builder.yaml index 4eeae609b5..fb842e009b 100644 --- a/examples/image-builder.yaml +++ b/examples/image-builder.yaml @@ -21,6 +21,7 @@ blueprint_name: image-builder vars: + enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: image-builder-001 region: us-central1 @@ -59,8 +60,8 @@ deployment_groups: - scripts_for_image settings: source_image_project_id: [schedmd-slurm-public] - # see latest in https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-5-9-hpc-centos-7 + # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family + source_image_family: slurm-gcp-5-10-hpc-centos-7 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size) diff --git a/examples/ml-slurm.yaml b/examples/ml-slurm.yaml index aa06aaddbb..780b4b722b 100644 --- a/examples/ml-slurm.yaml +++ b/examples/ml-slurm.yaml @@ -16,6 +16,7 @@ blueprint_name: ml-slurm vars: + enable_devel: true project_id: ## Set project id here deployment_name: ml-example region: asia-southeast1 @@ -135,8 +136,8 @@ deployment_groups: # w/o new VPC omit_external_ip: false source_image_project_id: [schedmd-slurm-public] - # see latest in https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-5-9-debian-11 + # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family + source_image_family: slurm-gcp-5-10-debian-11 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size_gb) diff --git a/modules/README.md b/modules/README.md index 12ddaa4557..9448af47e6 100644 --- a/modules/README.md +++ b/modules/README.md @@ -209,8 +209,8 @@ Pub/Sub subscription. 
Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [schedmd-slurm-gcp-v5-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md [schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md [schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp-version-5]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 -[slurm-gcp-version-6]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0 +[slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.10.2 +[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.2.0 [pbspro-client]: ../community/modules/scheduler/pbspro-client/README.md [pbspro-server]: ../community/modules/scheduler/pbspro-server/README.md diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml index 3db7e9fa56..10d06b03f2 100644 --- a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml +++ b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml @@ -27,7 +27,7 @@ vars: # on_host_maintenance: MIGRATE num_nodes: 1 rocky_image: - family: slurm-gcp-5-9-hpc-rocky-linux-8 + family: slurm-gcp-5-10-hpc-rocky-linux-8 project: schedmd-slurm-public deployment_groups: @@ -85,7 +85,7 @@ deployment_groups: # settings: # node_count_dynamic_max: $(vars.num_nodes) # instance_image: - # family: slurm-gcp-5-9-ubuntu-2004-lts + # family: slurm-gcp-5-10-ubuntu-2004-lts # project: schedmd-slurm-public # instance_image_custom: true diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml index d356e5f380..c6e271cdc6 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml @@ -21,7 +21,7 @@ deployment_name: "debi-v5-{{ build }}" slurm_cluster_name: "debiv5{{ build[0:4] }}" cli_deployment_vars: - instance_image: "{family: slurm-gcp-5-9-debian-11, project: schedmd-slurm-public}" + instance_image: "{family: slurm-gcp-5-10-debian-11, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml index 8f593332bf..5d26b72fdb 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml @@ -21,7 +21,7 @@ deployment_name: "rock-8-{{ build }}" slurm_cluster_name: "rock8{{ build[0:5] }}" cli_deployment_vars: - instance_image: "{family: slurm-gcp-5-9-hpc-rocky-linux-8, project: schedmd-slurm-public}" + instance_image: "{family: slurm-gcp-5-10-hpc-rocky-linux-8, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c diff --git a/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml b/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml index c56a226980..06eafcb4bd 100644 --- a/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml +++ b/tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml @@ -24,11 +24,11 @@ vars: machine_type: n1-standard-2 instance_image: # Please refer to the following link for the latest images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - # family: slurm-gcp-5-9-ubuntu-2004-lts - # family: slurm-gcp-5-9-hpc-centos-7 - family: slurm-gcp-5-9-hpc-rocky-linux-8 - # family: slurm-gcp-5-9-debian-11 
+ # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + # family: slurm-gcp-5-10-ubuntu-2004-lts + # family: slurm-gcp-5-10-hpc-centos-7 + family: slurm-gcp-5-10-hpc-rocky-linux-8 + # family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image_custom: true diff --git a/tools/validate_configs/os_compatibility_tests/slurm-lustre.yaml b/tools/validate_configs/os_compatibility_tests/slurm-lustre.yaml index 593442b137..bab5ee5183 100644 --- a/tools/validate_configs/os_compatibility_tests/slurm-lustre.yaml +++ b/tools/validate_configs/os_compatibility_tests/slurm-lustre.yaml @@ -24,10 +24,10 @@ vars: machine_type: n1-standard-2 instance_image: # Please refer to the following link for the latest images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - # family: slurm-gcp-5-9-ubuntu-2004-lts - # family: slurm-gcp-5-9-hpc-centos-7 - family: slurm-gcp-5-9-hpc-rocky-linux-8 + # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + # family: slurm-gcp-5-10-ubuntu-2004-lts + # family: slurm-gcp-5-10-hpc-centos-7 + family: slurm-gcp-5-10-hpc-rocky-linux-8 project: schedmd-slurm-public instance_image_custom: true diff --git a/tools/validate_configs/os_compatibility_tests/slurm-startup.yaml b/tools/validate_configs/os_compatibility_tests/slurm-startup.yaml index 7ab7513756..6c5164de61 100644 --- a/tools/validate_configs/os_compatibility_tests/slurm-startup.yaml +++ b/tools/validate_configs/os_compatibility_tests/slurm-startup.yaml @@ -24,11 +24,11 @@ vars: machine_type: n1-standard-2 instance_image: # Please refer to the following link for the latest images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - # family: slurm-gcp-5-9-ubuntu-2004-lts - # family: slurm-gcp-5-9-hpc-centos-7 - family: slurm-gcp-5-9-hpc-rocky-linux-8 - # family: slurm-gcp-5-9-debian-11 + # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + # family: slurm-gcp-5-10-ubuntu-2004-lts + # family: slurm-gcp-5-10-hpc-centos-7 + family: slurm-gcp-5-10-hpc-rocky-linux-8 + # family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image_custom: true diff --git a/tools/validate_configs/test_configs/node-groups.yaml b/tools/validate_configs/test_configs/node-groups.yaml index 9b611679e6..28c776f4c1 100644 --- a/tools/validate_configs/test_configs/node-groups.yaml +++ b/tools/validate_configs/test_configs/node-groups.yaml @@ -64,7 +64,7 @@ deployment_groups: name: c30 machine_type: c2-standard-30 instance_image: - family: slurm-gcp-5-9-debian-11 + family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image_custom: true @@ -83,7 +83,7 @@ deployment_groups: name: cd112 machine_type: c2d-standard-112 instance_image: - family: slurm-gcp-5-9-hpc-centos-7 + family: slurm-gcp-5-10-hpc-centos-7 project: schedmd-slurm-public instance_image_custom: true enable_smt: true @@ -139,6 +139,7 @@ deployment_groups: instance_template: null labels: $(vars.labels) machine_type: n2-standard-16 + maintenance_interval: null metadata: {} min_cpu_platform: null on_host_maintenance: TERMINATE diff --git a/tools/validate_configs/test_configs/slurm-static-test.yaml b/tools/validate_configs/test_configs/slurm-static-test.yaml index 5d5ed3cf4c..7e3adcbb9a 100644 --- a/tools/validate_configs/test_configs/slurm-static-test.yaml +++ 
b/tools/validate_configs/test_configs/slurm-static-test.yaml @@ -24,11 +24,11 @@ vars: machine_type: n1-standard-2 instance_image: # Please refer to the following link for the latest images: - # https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - # family: slurm-gcp-5-9-ubuntu-2004-lts - # family: slurm-gcp-5-9-hpc-centos-7 - family: slurm-gcp-5-9-hpc-rocky-linux-8 - # family: slurm-gcp-5-9-debian-11 + # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems + # family: slurm-gcp-5-10-ubuntu-2004-lts + # family: slurm-gcp-5-10-hpc-centos-7 + family: slurm-gcp-5-10-hpc-rocky-linux-8 + # family: slurm-gcp-5-10-debian-11 project: schedmd-slurm-public instance_image_custom: true enable_reconfigure: true From 529a48bac0ab7cfd9b20a08fe755664486d49a37 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 5 Feb 2024 10:03:53 -0800 Subject: [PATCH 126/151] Add expression variables to the test configs (#2197) --- tools/validate_configs/golden_copies/configs/igc_tf.yaml | 2 +- .../golden_copies/configs/merge_flatten.yaml | 5 +++-- .../merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml | 1 + tools/validate_configs/test_configs/packer.yaml | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/validate_configs/golden_copies/configs/igc_tf.yaml b/tools/validate_configs/golden_copies/configs/igc_tf.yaml index ad50266198..fff810f9ba 100644 --- a/tools/validate_configs/golden_copies/configs/igc_tf.yaml +++ b/tools/validate_configs/golden_copies/configs/igc_tf.yaml @@ -19,7 +19,7 @@ vars: project_id: # deployment_name: igc-tf-test region: us-east4 - zone: us-east4-c + zone: $(vars.region)-c deployment_groups: - group: zero diff --git a/tools/validate_configs/golden_copies/configs/merge_flatten.yaml b/tools/validate_configs/golden_copies/configs/merge_flatten.yaml index 590510eb41..5c2a43cb6c 100644 --- a/tools/validate_configs/golden_copies/configs/merge_flatten.yaml +++ b/tools/validate_configs/golden_copies/configs/merge_flatten.yaml @@ -17,8 +17,9 @@ blueprint_name: merge_flatten vars: project_id: # deployment_name: merge_flatten - region: us-east4 - zone: us-east4-c + region_number: 4 + region: us-east$(vars.region_number) + zone: $(vars.region)-c deployment_groups: - group: zero diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index 7936ce8934..66b52a3479 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -32,6 +32,7 @@ vars: ghpc_deployment: golden_copy_deployment project_id: invalid-project region: us-east4 + region_number: 4 zone: us-east4-c deployment_groups: - group: zero diff --git a/tools/validate_configs/test_configs/packer.yaml b/tools/validate_configs/test_configs/packer.yaml index 24af11c96d..02c9c125ba 100644 --- a/tools/validate_configs/test_configs/packer.yaml +++ b/tools/validate_configs/test_configs/packer.yaml @@ -20,7 +20,7 @@ vars: project_id: ## Set GCP Project ID Here ## deployment_name: hpc-slurm1 region: europe-west4 - zone: europe-west4-a + zone: $(vars.region)-a network_name: image-builder-net subnetwork_name: image-builder-primary-subnet From 23af765dd49f03ed731a2f5990cd3995fa1c3425 Mon Sep 17 00:00:00 2001 From: Tom 
Downes
Date: Mon, 5 Feb 2024 17:06:41 -0600
Subject: [PATCH 127/151] HTCondor: expire ClassAds more rapidly

Decrease the default CLASSAD_LIFETIME from 15 minutes to 3 minutes. In
an on-premises system, tolerating long intervals between ClassAd
updates can be useful because it lets machines reboot without dropping
out of the pool. In the cloud, the absence of ClassAd updates more
likely indicates the intentional (or automated) deletion of a VM, which
should therefore be removed from the HTCondor pool promptly.
---
 .../htcondor-central-manager/templates/condor_config.tftpl | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/community/modules/scheduler/htcondor-central-manager/templates/condor_config.tftpl b/community/modules/scheduler/htcondor-central-manager/templates/condor_config.tftpl
index d6a6d451b5..1f8089cc1b 100644
--- a/community/modules/scheduler/htcondor-central-manager/templates/condor_config.tftpl
+++ b/community/modules/scheduler/htcondor-central-manager/templates/condor_config.tftpl
@@ -20,6 +20,11 @@ use role:get_htcondor_central_manager
 CONDOR_HOST = $(IPV4_ADDRESS)
 
 # Central Manager configuration settings
+# https://htcondor.readthedocs.io/en/23.0/admin-manual/configuration-macros.html#condor-collector-configuration-file-entries
+# https://htcondor.readthedocs.io/en/23.0/admin-manual/configuration-macros.html#condor-negotiator-configuration-file-entries
+# set classad lifetime (expiration) to ~5x the update interval for all daemons
+# defaults to 900s
+CLASSAD_LIFETIME = 180
 COLLECTOR_UPDATE_INTERVAL = 30
 NEGOTIATOR_UPDATE_INTERVAL = 30
 NEGOTIATOR_DEPTH_FIRST = True

From b1de00ac94a740f0ec331af14f993f0274712604 Mon Sep 17 00:00:00 2001
From: Tom Downes
Date: Mon, 5 Feb 2024 17:06:42 -0600
Subject: [PATCH 128/151] HTCondor: ensure Windows nodes are detected as unhealthy

Ensure that the script for Windows exits with an error before starting
HTCondor when it cannot download the condor_config file.
--- .../templates/download-condor-config.ps1.tftpl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl b/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl index c69d6022c6..6fa16eb8ec 100644 --- a/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl +++ b/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl @@ -16,6 +16,10 @@ $remote_hash = gcloud --format="value(md5_hash)" storage hash ${config_object} if ($local_hash -cne $remote_hash) { Write-Output "Updating condor configuration" gcloud storage cp ${config_object} $config_file + if ($LASTEXITCODE -ne 0) + { + throw "Could not download HTCondor configuration; exiting startup script" + } Restart-Service condor } From 729b0c47c97898e7aea93aaa8fe4a83c3489f900 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 5 Feb 2024 17:06:42 -0600 Subject: [PATCH 129/151] Align formatting choices with recent commits --- .../templates/download-condor-config.ps1.tftpl | 13 +++++++++---- .../templates/fetch-idtoken.ps1.tftpl | 10 +++++++--- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl b/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl index 6fa16eb8ec..19789f122e 100644 --- a/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl +++ b/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl @@ -1,19 +1,24 @@ # create directory for local condor_config customizations $config_dir = 'C:\Condor\config' -if(!(test-path -PathType container -Path $config_dir)) { +if(!(test-path -PathType container -Path $config_dir)) +{ New-Item -ItemType Directory -Path $config_dir } # update local condor_config if blueprint has changed $config_file = "$config_dir\50-ghpc-managed" -if (Test-Path -Path $config_file -PathType Leaf) { +if (Test-Path -Path $config_file -PathType Leaf) +{ $local_hash = gcloud --format="value(md5_hash)" storage hash $config_file -} else { +} +else +{ $local_hash = "INVALID-HASH" } $remote_hash = gcloud --format="value(md5_hash)" storage hash ${config_object} -if ($local_hash -cne $remote_hash) { +if ($local_hash -cne $remote_hash) +{ Write-Output "Updating condor configuration" gcloud storage cp ${config_object} $config_file if ($LASTEXITCODE -ne 0) diff --git a/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl b/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl index 838d268162..04c96291ee 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl +++ b/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl @@ -2,8 +2,9 @@ Set-StrictMode -Version latest $ErrorActionPreference = 'Stop' $config_dir = 'C:\Condor\config' -if(!(test-path -PathType container -Path $config_dir)) { - New-Item -ItemType Directory -Path $config_dir +if(!(test-path -PathType container -Path $config_dir)) +{ + New-Item -ItemType Directory -Path $config_dir } $config_file = "$config_dir\51-ghpc-trust-domain" @@ -19,4 +20,7 @@ Set-Content -Path "$config_file" -Value "$config_string" gcloud secrets versions access latest --secret ${xp_idtoken_secret_id} ` --out-file C:\condor\tokens.d\condor@${trust_domain} -if 
($LASTEXITCODE -ne 0) { throw "Could not download HTCondor IDTOKEN; exiting startup script" }
+if ($LASTEXITCODE -ne 0)
+{
+    throw "Could not download HTCondor IDTOKEN; exiting startup script"
+}

From 48cf12339824cb5d4129ab287728b4d01103a428 Mon Sep 17 00:00:00 2001
From: Tom Downes
Date: Mon, 5 Feb 2024 17:06:42 -0600
Subject: [PATCH 130/151] HTCondor autoscaler

Adopt a more conservative approach in which the autoscaler treats nodes
in any state that reflects automated MIG modification as "idle" for the
purposes of autoscaling. This helps prevent runaway autoscaling when
VMs are unable to enter the healthy state (which is reflected as a
currentAction of "NONE" in the MIG).
---
 .../htcondor-install/files/autoscaler.py      | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/community/modules/scripts/htcondor-install/files/autoscaler.py b/community/modules/scripts/htcondor-install/files/autoscaler.py
index 40b7612e47..06cc9be785 100644
--- a/community/modules/scripts/htcondor-install/files/autoscaler.py
+++ b/community/modules/scripts/htcondor-install/files/autoscaler.py
@@ -253,17 +253,14 @@ def scale(self):
         current_target = responseGroupInfo["targetSize"]
         print(f"Current MIG target size: {current_target}")
-        being_born_states = ["CREATING", "RECREATING", "VERIFYING"]
-        being_born_filters = [ f"currentAction = \"{state}\"" for state in being_born_states ]
-        being_born_combined_filter = ' OR '.join(being_born_filters)
 
-        reqCreatingInstances = self.instanceGroupManagers.listManagedInstances(
+        reqModifyingInstances = self.instanceGroupManagers.listManagedInstances(
             project=self.project,
             **self.zoneargs,
             instanceGroupManager=self.instance_group_manager,
-            filter=being_born_combined_filter,
+            filter="currentAction != \"NONE\"",
             orderBy="creationTimestamp desc"
         )
-        respCreatingInstances = reqCreatingInstances.execute()
+        respModifyingInstances = reqModifyingInstances.execute()
 
         # Find VMs that are idle (no dynamic slots created from partitionable
         # slots) in the MIG handled by this autoscaler
@@ -287,17 +284,19 @@ def scale(self):
         # their readiness to join pool (creating, unhealthy, healthy+idle)
         idle_nodes = OrderedDict()
         try:
-            creatingInstances = respCreatingInstances["managedInstances"]
+            modifyingInstances = respModifyingInstances["managedInstances"]
         except KeyError:
-            creatingInstances = []
+            modifyingInstances = []
+
+        print(f"There are {len(modifyingInstances)} VMs being modified by the managed instance group")
 
         # there is potential for nodes in MIG health check "VERIFYING" state
         # to have already joined the pool and be running jobs
-        for instance in creatingInstances:
+        for instance in modifyingInstances:
             self_link = instance["instance"]
             node_name = self_link.rsplit("/", 1)[-1]
             if node_name not in claimed_nodes:
-                idle_nodes[self_link] = "creating"
+                idle_nodes[self_link] = "modifying"
 
         for ad in idle_node_ads:
             node = ad["Machine"].split(".")[0]
@@ -311,7 +310,7 @@ def scale(self):
                 idle_nodes[self_link] = "idle"
 
         n_idle = len(idle_nodes)
-        print(f"There are {n_idle} VMs being created or idle in the pool")
+        print(f"There are {n_idle} VMs being modified or idle in the pool")
         if self.debug > 1:
             print("Listing idle nodes:")
             pprint(idle_nodes)

From 5a2bd2f352ab540c69c033e5b1d9c0c3d861d0c7 Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Mon, 5 Feb 2024 22:29:12 -0800
Subject: [PATCH 131/151] Fix tests: look for yaml file, use image with yaml
 compat gcloud

---
 examples/serverless-batch-mpi.yaml            | 3 +++
 examples/serverless-batch.yaml                | 4 ++--
.../test-validation/test-batch-submission.yml | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/serverless-batch-mpi.yaml b/examples/serverless-batch-mpi.yaml index 640913aba4..95e58c612e 100644 --- a/examples/serverless-batch-mpi.yaml +++ b/examples/serverless-batch-mpi.yaml @@ -154,6 +154,9 @@ deployment_groups: machine_type: c2-standard-60 task_count: 2 mpi_mode: true + instance_image: + family: batch-centos-7-official + project: batch-custom-image - id: batch-login source: modules/scheduler/batch-login-node diff --git a/examples/serverless-batch.yaml b/examples/serverless-batch.yaml index f7777dd1c5..c459b584ee 100644 --- a/examples/serverless-batch.yaml +++ b/examples/serverless-batch.yaml @@ -52,8 +52,8 @@ deployment_groups: runnable: "cat /sw/hello.txt" machine_type: n2-standard-4 instance_image: - family: hpc-rocky-linux-8 - project: cloud-hpc-image-public + family: batch-centos-7-official + project: batch-custom-image - id: batch-login source: modules/scheduler/batch-login-node diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-batch-submission.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-batch-submission.yml index de257020c9..5e9f3ae18b 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-batch-submission.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-batch-submission.yml @@ -28,7 +28,7 @@ - name: Submit batch job register: batch_submission changed_when: batch_submission.rc == 0 - ansible.builtin.command: gcloud alpha batch jobs submit {{ deployment_name }} --config=/home/batch-jobs/cloud-batch-{{ deployment_name }}.json --location={{ cli_deployment_vars.region }} --project={{ custom_vars.project }} + ansible.builtin.command: gcloud alpha batch jobs submit {{ deployment_name }} --config=/home/batch-jobs/cloud-batch-{{ deployment_name }}.yaml --location={{ cli_deployment_vars.region }} --project={{ custom_vars.project }} - name: Wait for job to run changed_when: false ansible.builtin.command: gcloud alpha batch jobs describe {{ deployment_name }} --location={{ cli_deployment_vars.region }} --project={{ custom_vars.project }} From f5c213d0614303b6268734893faa842b0215d127 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Mon, 5 Feb 2024 23:03:40 -0800 Subject: [PATCH 132/151] Use multiline yaml block scalar for Batch runnable --- .../batch-job-template/templates/batch-job-base.yaml.tftpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/scheduler/batch-job-template/templates/batch-job-base.yaml.tftpl b/modules/scheduler/batch-job-template/templates/batch-job-base.yaml.tftpl index 52ed886992..177c2ce29f 100644 --- a/modules/scheduler/batch-job-template/templates/batch-job-base.yaml.tftpl +++ b/modules/scheduler/batch-job-template/templates/batch-job-base.yaml.tftpl @@ -6,7 +6,7 @@ taskGroups: name: "wait-for-node-startup" %{~ endif ~} - script: - text: ${jsonencode(runnable)} + text: ${indent(12, chomp(yamlencode(runnable)))} %{~ if synchronized ~} - barrier: name: "wait-for-workload-to-complete" From 4cd0b381964e4eac66ea23b40f530bc5c7faa542 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 6 Feb 2024 08:34:54 -0600 Subject: [PATCH 133/151] Address feedback from #2204 --- community/modules/scripts/htcondor-install/files/autoscaler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/community/modules/scripts/htcondor-install/files/autoscaler.py b/community/modules/scripts/htcondor-install/files/autoscaler.py 
index 06cc9be785..77bafa0310 100644
--- a/community/modules/scripts/htcondor-install/files/autoscaler.py
+++ b/community/modules/scripts/htcondor-install/files/autoscaler.py
@@ -253,6 +253,9 @@ def scale(self):
         current_target = responseGroupInfo["targetSize"]
         print(f"Current MIG target size: {current_target}")
 
+        # Find instances that are being modified by the MIG (currentAction is
+        # any value other than "NONE"). A common reason an instance is modified
+        # is that it has failed a health check.
         reqModifyingInstances = self.instanceGroupManagers.listManagedInstances(
             project=self.project,
             **self.zoneargs,

From fe5b441dfa95244e244b5c0c1fffbf786987bf2b Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 6 Feb 2024 08:59:24 -0600
Subject: [PATCH 134/151] Bump cryptography from 41.0.6 to 42.0.0 in
 /community/front-end/ofe

Bumps [cryptography](https://github.com/pyca/cryptography) from 41.0.6 to 42.0.0.
- [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pyca/cryptography/compare/41.0.6...42.0.0)

---
updated-dependencies:
- dependency-name: cryptography
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot]
---
 community/front-end/ofe/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt
index 1510010985..23b05421ec 100644
--- a/community/front-end/ofe/requirements.txt
+++ b/community/front-end/ofe/requirements.txt
@@ -12,7 +12,7 @@ cffi==1.15.1
 cfgv==3.3.1
 charset-normalizer==3.1.0
 click==8.1.3
-cryptography==41.0.6
+cryptography==42.0.0
 decorator==5.1.1
 defusedxml==0.7.1
 dill==0.3.6

From 9db68ae39c18d4a83a09630b7db352b2f2c6eb72 Mon Sep 17 00:00:00 2001
From: Ivan Orlov
Date: Tue, 6 Feb 2024 11:32:26 -0800
Subject: [PATCH 135/151] Take "first deploy only" dependency on `slurm_files`
 (#2181)

---
 .../schedmd-slurm-gcp-v6-controller/README.md | 22 +++++++++----------
 .../controller.tf                             | 13 ++++++-----
 .../schedmd-slurm-gcp-v6-controller/login.tf  |  4 ++--
 .../partition.tf                              |  8 +++----
 .../slurm_files.tf                            |  2 +-
 5 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md
index fd7033b56f..230e5661af 100644
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md
@@ -125,17 +125,17 @@ limitations under the License.
| Name | Source | Version | |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 3.0 | -| [cleanup\_compute\_nodes](#module\_cleanup\_compute\_nodes) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes | 6.3.3 | -| [cleanup\_resource\_policies](#module\_cleanup\_resource\_policies) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies | 6.3.3 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.3.3 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.3 | -| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.3.3 | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 6.3.3 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.3 | -| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.3.3 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.3 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.3.3 | -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 6.3.3 | +| [cleanup\_compute\_nodes](#module\_cleanup\_compute\_nodes) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes | 6.3.4 | +| [cleanup\_resource\_policies](#module\_cleanup\_resource\_policies) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies | 6.3.4 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.3.4 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.4 | +| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.3.4 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 6.3.4 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.4 | +| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.3.4 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.3.4 | 
+| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.3.4 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 6.3.4 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index fd57b25818..41b7a3146e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -35,7 +35,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.3" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.4" count = local.have_template ? 0 : 1 project_id = var.project_id @@ -92,7 +92,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.3.3" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.3.4" access_config = !var.disable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false @@ -108,8 +108,11 @@ module "slurm_controller_instance" { zone = var.zone metadata = var.metadata + labels = merge(local.labels, { + slurm_files_checksum = module.slurm_files.checksum + }) + depends_on = [ - module.slurm_files, # Ensure nodes are destroyed before controller is module.cleanup_compute_nodes[0], ] @@ -148,7 +151,7 @@ resource "google_secret_manager_secret_iam_member" "cloudsql_secret_accessor" { # Destroy all compute nodes on `terraform destroy` module "cleanup_compute_nodes" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes?ref=6.3.3" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_nodes?ref=6.3.4" count = var.enable_cleanup_compute ? 1 : 0 slurm_cluster_name = local.slurm_cluster_name @@ -164,7 +167,7 @@ module "cleanup_compute_nodes" { # Destroy all resource policies on `terraform destroy` module "cleanup_resource_policies" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies?ref=6.3.3" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_destroy_resource_policies?ref=6.3.4" count = var.enable_cleanup_compute ? 
1 : 0 slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 6936843c90..fd5c83bf34 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.3" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.4" for_each = { for x in var.login_nodes : x.name_prefix => x @@ -59,7 +59,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=6.3.3" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=6.3.4" for_each = { for x in var.login_nodes : x.name_prefix => x } project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 7946b162d0..56e0b688dc 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -21,7 +21,7 @@ locals { # NODESET module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.3" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.3.4" for_each = local.nodeset_map project_id = var.project_id @@ -60,7 +60,7 @@ module "slurm_nodeset_template" { } module "slurm_nodeset" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.3.3" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.3.4" for_each = local.nodeset_map instance_template_self_link = module.slurm_nodeset_template[each.key].self_link @@ -79,7 +79,7 @@ module "slurm_nodeset" { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.3.3" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.3.4" for_each = local.nodeset_tpu_map project_id = var.project_id @@ -101,7 +101,7 @@ module "slurm_nodeset_tpu" { # PARTITION module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=6.3.3" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=6.3.4" for_each = local.partition_map partition_nodeset = [for x in each.value.partition_nodeset : module.slurm_nodeset[x].nodeset_name if try(module.slurm_nodeset[x], null) != null] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index 7e7dabc605..15403688db 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -87,7 +87,7 @@ locals { } module "slurm_files" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.3.3" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.3.4" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name From 9b9fc820101bf3d629ad23cc5de40a73021042b3 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 6 Feb 2024 11:45:15 -0800 Subject: [PATCH 136/151] Refactor `eval` functions (#2201) Make `Blueprint.Eval` the only place where blueprint context is created. --- pkg/config/config.go | 14 +++++++++----- pkg/config/dict.go | 2 +- pkg/config/expand.go | 2 +- pkg/config/expression.go | 24 ++++++++++++++---------- pkg/config/expression_test.go | 4 ++-- 5 files changed, 27 insertions(+), 19 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index 2f34c51848..96b0b81061 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -24,6 +24,7 @@ import ( "strings" "github.com/agext/levenshtein" + "github.com/hashicorp/hcl/v2" "github.com/pkg/errors" "github.com/zclconf/go-cty/cty" "gopkg.in/yaml.v3" @@ -720,14 +721,17 @@ func (bp *Blueprint) evalVars() (Dict, error) { return Dict{}, err } - res := Dict{} + res := map[string]cty.Value{} + ctx := hcl.EvalContext{ + Variables: map[string]cty.Value{}, + Functions: functions()} for _, n := range order { - v := bp.Vars.Get(n) - ev, err := evalValue(v, Blueprint{Vars: res}) + ctx.Variables["var"] = cty.ObjectVal(res) + ev, err := eval(bp.Vars.Get(n), &ctx) if err != nil { return Dict{}, BpError{Root.Vars.Dot(n), err} } - res.Set(n, ev) + res[n] = ev } - return res, nil + return NewDict(res), nil } diff --git a/pkg/config/dict.go b/pkg/config/dict.go index 3979d79455..6b0c349ed0 100644 --- a/pkg/config/dict.go +++ b/pkg/config/dict.go @@ -97,7 +97,7 @@ func (d Dict) IsZero() bool { func (d Dict) Eval(bp Blueprint) (Dict, error) { var res Dict for k, v := range d.Items() { - r, err := evalValue(v, bp) + r, err := bp.Eval(v) if err != nil { return Dict{}, fmt.Errorf("error while trying to evaluate %#v: %w", k, err) } diff --git a/pkg/config/expand.go b/pkg/config/expand.go index b1748eaed8..7bd1bc5172 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -82,7 +82,7 @@ func validateModuleInputs(mp ModulePath, m Module, bp Blueprint) error { } func attemptEvalModuleInput(val cty.Value, bp Blueprint) (cty.Value, bool) { - v, err := evalValue(val, bp) + v, err := bp.Eval(val) // there could be a legitimate reasons for it. // e.g. 
use of modules output or unsupported (by ghpc) functions // TODO: diff --git a/pkg/config/expression.go b/pkg/config/expression.go index f5aff598a0..572d235048 100644 --- a/pkg/config/expression.go +++ b/pkg/config/expression.go @@ -144,8 +144,8 @@ func TraversalToReference(t hcl.Traversal) (Reference, error) { // Expression is a representation of expressions in Blueprint type Expression interface { - // Eval evaluates the expression in the context of Blueprint - Eval(bp Blueprint) (cty.Value, error) + // Eval evaluates the expression in the given context + Eval(ctx *hcl.EvalContext) (cty.Value, error) // Tokenize returns Tokens to be used for marshalling HCL Tokenize() hclwrite.Tokens // References return Reference for all variables used in the expression @@ -213,12 +213,8 @@ type BaseExpression struct { } // Eval evaluates the expression in the context of Blueprint -func (e BaseExpression) Eval(bp Blueprint) (cty.Value, error) { - ctx := hcl.EvalContext{ - Variables: map[string]cty.Value{"var": bp.Vars.AsObject()}, - Functions: functions(), - } - v, diag := e.e.Value(&ctx) +func (e BaseExpression) Eval(ctx *hcl.EvalContext) (cty.Value, error) { + v, diag := e.e.Value(ctx) if diag.HasErrors() { return cty.NilVal, diag } @@ -374,10 +370,18 @@ func valueReferences(v cty.Value) map[Reference]cty.Path { return r } -func evalValue(v cty.Value, bp Blueprint) (cty.Value, error) { +func (bp *Blueprint) Eval(v cty.Value) (cty.Value, error) { + ctx := hcl.EvalContext{ + Variables: map[string]cty.Value{ + "var": bp.Vars.AsObject()}, + Functions: functions()} + return eval(v, &ctx) +} + +func eval(v cty.Value, ctx *hcl.EvalContext) (cty.Value, error) { return cty.Transform(v, func(p cty.Path, v cty.Value) (cty.Value, error) { if e, is := IsExpressionValue(v); is { - return e.Eval(bp) + return e.Eval(ctx) } return v, nil }) diff --git a/pkg/config/expression_test.go b/pkg/config/expression_test.go index f7b488aa4c..15d3998d9c 100644 --- a/pkg/config/expression_test.go +++ b/pkg/config/expression_test.go @@ -198,7 +198,7 @@ func TestFlattenFunctionCallExpression(t *testing.T) { cty.NumberIntVal(2), cty.NumberIntVal(3)}) - got, err := expr.Eval(bp) + got, err := bp.Eval(expr.AsValue()) if err != nil { t.Errorf("got unexpected error: %s", err) } @@ -226,7 +226,7 @@ func TestMergeFunctionCallExpression(t *testing.T) { "two": cty.NumberIntVal(2), }) - got, err := expr.Eval(bp) + got, err := bp.Eval(expr.AsValue()) if err != nil { t.Errorf("got unexpected error: %s", err) } From 0850adf39938f62d9c97e4debae68dde74d8c019 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 6 Feb 2024 16:04:42 -0800 Subject: [PATCH 137/151] Remove `setGlobalLabels` as it's not needed (#2193) The `combineLabels` will do it. 
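For intuition, here is a minimal, self-contained Go sketch of why the explicit default was redundant — illustrative only, not the toolkit's actual code; the `ghpc_blueprint`/`ghpc_deployment` keys and the sample values are assumptions modeled on the label constants visible in the `pkg/config/expand.go` hunk below:

```go
// Hypothetical sketch: merging defaults with a possibly-absent user
// value makes a separate "set labels to an empty object" step a no-op.
package main

import "fmt"

func combineLabels(userLabels map[string]string) map[string]string {
	combined := map[string]string{
		"ghpc_blueprint":  "example-blueprint", // assumed default labels
		"ghpc_deployment": "example-deployment",
	}
	for k, v := range userLabels { // ranging over a nil map iterates zero times
		combined[k] = v
	}
	return combined
}

func main() {
	fmt.Println(combineLabels(nil))                 // labels never set by the user
	fmt.Println(combineLabels(map[string]string{})) // labels set to an empty object
}
```

Both calls print the same map, so dropping `setGlobalLabels` changes nothing observable.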
--- pkg/config/config.go | 23 +++++++++++++++++------ pkg/config/expand.go | 24 ------------------------ pkg/config/expand_test.go | 5 ----- 3 files changed, 17 insertions(+), 35 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index 96b0b81061..ffff6bf26f 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -261,7 +261,6 @@ type DeploymentConfig struct { // ExpandConfig expands the yaml config in place func (dc *DeploymentConfig) ExpandConfig() error { dc.Config.origVars = NewDict(dc.Config.Vars.Items()) // copy - dc.Config.setGlobalLabels() dc.Config.addKindToModules() if vars, err := dc.Config.evalVars(); err != nil { @@ -270,17 +269,29 @@ dc.Config.Vars = vars } + dc.expandBackends() + dc.combineLabels() + if err := validateBlueprint(dc.Config); err != nil { return err } - return dc.expand() -} + if err := dc.applyUseModules(); err != nil { + return err + } + + dc.applyGlobalVariables() -func (bp *Blueprint) setGlobalLabels() { - if !bp.Vars.Has("labels") { - bp.Vars.Set("labels", cty.EmptyObjectVal) + if err := validateInputsAllModules(dc.Config); err != nil { + return err } + + if err := validateModulesAreUsed(dc.Config); err != nil { + return err + } + + dc.Config.populateOutputs() + return nil } // ListUnusedModules provides a list modules that are in the diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 7bd1bc5172..261159b9ec 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -31,30 +31,6 @@ const ( deploymentLabel string = "ghpc_deployment" ) -// expand expands variables and strings in the yaml config. Used directly by -// ExpandConfig for the create and expand commands. -func (dc *DeploymentConfig) expand() error { - dc.expandBackends() - dc.combineLabels() - - if err := dc.applyUseModules(); err != nil { - return err - } - - dc.applyGlobalVariables() - - if err := validateInputsAllModules(dc.Config); err != nil { - return err - } - - if err := validateModulesAreUsed(dc.Config); err != nil { - return err - } - - dc.Config.populateOutputs() - return nil -} - func validateInputsAllModules(bp Blueprint) error { errs := Errors{} bp.WalkModulesSafe(func(p ModulePath, m *Module) { diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 94c44f9cb8..c27c0ed39f 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -23,11 +23,6 @@ import ( . "gopkg.in/check.v1" ) -func (s *MySuite) TestExpand(c *C) { - dc := s.getDeploymentConfigForTest() - c.Check(dc.expand(), IsNil) -} - func (s *MySuite) TestExpandBackends(c *C) { dc := s.getDeploymentConfigForTest() deplName := dc.Config.Vars.Get("deployment_name").AsString() From d4c4365c4e31bff822790d430dfde3cf1e671fed Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 7 Feb 2024 09:31:22 -0800 Subject: [PATCH 138/151] Show hint message if unsupported function is used (#2211) Currently we have a few contexts that get evaluated by ghpc: * `vars`; * `module.settings` in packer groups; * `validators.input` Our evaluation context only supports 2 functions: `merge` and `flatten`. The remaining expressions (`module.settings` in TF groups) are not evaluated by ghpc and can therefore use any valid HCL syntax.
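As a rough sketch of the user-visible effect, the snippet below replays the new hint path in isolation. The sample diagnostic detail is an assumption about HCL's wording for an unknown function (`DoesHalt` is the placeholder the updated test uses); the regexp and message shapes follow the patch below:

```go
// Standalone illustration of turning a bare HCL "unknown function"
// diagnostic into an actionable hint; not the toolkit's exact code.
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Assumed detail text of the HCL diagnostic for $(DoesHalt(...)).
	detail := `There is no function named "DoesHalt".`

	re := regexp.MustCompile(`There is no function named "(\w+)"`)
	if match := re.FindStringSubmatch(detail); match != nil {
		fmt.Printf("unsupported function %q\n", match[1])
		fmt.Println("Hint: this context only supports following functions: flatten, merge")
	}
}
```

The hint lists only `flatten` and `merge` because those are the sole functions the ghpc evaluation context registers.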
--- pkg/config/config_test.go | 2 +- pkg/config/expression.go | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 8e4902050d..f582b50234 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -949,7 +949,7 @@ func (s *zeroSuite) TestEvalVars(c *C) { _, err := (&Blueprint{Vars: vars}).evalVars() var berr BpError if errors.As(err, &berr) { - c.Check(berr.Error(), Matches, ".*no function.*DoesHalt.*") + c.Check(berr.Error(), Matches, ".*unsupported function.*DoesHalt.*") c.Check(berr.Path.String(), Equals, "vars.uro") } else { c.Error(err, " should be BpError") diff --git a/pkg/config/expression.go b/pkg/config/expression.go index 572d235048..dda754b563 100644 --- a/pkg/config/expression.go +++ b/pkg/config/expression.go @@ -27,6 +27,7 @@ import ( "github.com/zclconf/go-cty/cty" "github.com/zclconf/go-cty/cty/function" "github.com/zclconf/go-cty/cty/function/stdlib" + "golang.org/x/exp/maps" ) // Reference is data struct that represents a reference to a variable. @@ -212,13 +213,25 @@ type BaseExpression struct { rs []Reference } +func handleEvalErr(diag hcl.Diagnostics) error { + if !diag.HasErrors() { + return nil + } + err := diag.Errs()[0] + if match := regexp.MustCompile(`There is no function named "(\w+)"`).FindStringSubmatch(err.Error()); match != nil { + sf := strings.Join(maps.Keys(functions()), ", ") + return HintError{ + Err: fmt.Errorf("unsupported function %q", match[1]), + Hint: fmt.Sprintf("this context only supports following functions: %v", sf)} + } + return err + +} + // Eval evaluates the expression in the context of Blueprint func (e BaseExpression) Eval(ctx *hcl.EvalContext) (cty.Value, error) { v, diag := e.e.Value(ctx) - if diag.HasErrors() { - return cty.NilVal, diag - } - return v, nil + return v, handleEvalErr(diag) } // Tokenize returns Tokens to be used for marshalling HCL From 15425a4d7867af12061d68e9f85a83739fefa3f8 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 7 Feb 2024 13:04:41 -0600 Subject: [PATCH 139/151] Update pre-commit hooks --- .pre-commit-config.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 006143f19e..523e20fd6d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ --- repos: - repo: https://github.com/antonbabenko/pre-commit-terraform - rev: v1.82.0 + rev: v1.86.0 hooks: - id: terraform_fmt - id: terraform_tflint @@ -82,16 +82,16 @@ repos: # hooks: # - id: go-critic - repo: https://github.com/Bahjat/pre-commit-golang - rev: v1.0.2 + rev: v1.0.3 hooks: - id: go-static-check - repo: https://github.com/adrienverge/yamllint - rev: v1.32.0 + rev: v1.34.0 hooks: - id: yamllint args: [-c=.yamllint, --no-warnings] - repo: https://github.com/jackdewinter/pymarkdown - rev: v0.9.12 + rev: v0.9.17 hooks: - id: pymarkdown # Rules at https://github.com/jackdewinter/pymarkdown/tree/main/docs/rules @@ -111,7 +111,7 @@ repos: - id: shfmt exclude: ".*tpl" - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: end-of-file-fixer - repo: https://github.com/codespell-project/codespell From 77ab37c782e9a503fdd3185fac24e5d957d5fb35 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 7 Feb 2024 13:09:45 -0600 Subject: [PATCH 140/151] Restrict GitHub actions to operate on upstream - the dependency license and PR label actions only need to run on the GoogleCloudPlatform copy of the HPC 
Toolkit --- .github/workflows/dependency-review.yml | 1 + .github/workflows/pr-label-validation.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index a400f5cd0d..f0f99a2665 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -26,6 +26,7 @@ permissions: jobs: dependency-review: + if: github.repository == 'GoogleCloudPlatform/hpc-toolkit' runs-on: ubuntu-latest steps: - name: 'Checkout Repository' diff --git a/.github/workflows/pr-label-validation.yml b/.github/workflows/pr-label-validation.yml index 3d7667c615..dda59b7161 100644 --- a/.github/workflows/pr-label-validation.yml +++ b/.github/workflows/pr-label-validation.yml @@ -32,6 +32,7 @@ on: jobs: pr-label-validation: + if: github.repository == 'GoogleCloudPlatform/hpc-toolkit' runs-on: ubuntu-latest permissions: pull-requests: read From 26c9b9f54325a380d26f0ebd16751d31c40524ca Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 7 Feb 2024 14:17:41 -0600 Subject: [PATCH 141/151] Remove pre-commit from Cloud Build PR validation --- .../hpc-toolkit-pr-validation.yaml | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/tools/cloud-build/hpc-toolkit-pr-validation.yaml b/tools/cloud-build/hpc-toolkit-pr-validation.yaml index ed49ae4b9c..78ce66cc7f 100644 --- a/tools/cloud-build/hpc-toolkit-pr-validation.yaml +++ b/tools/cloud-build/hpc-toolkit-pr-validation.yaml @@ -18,28 +18,8 @@ steps: - id: git-fetch-unshallow name: gcr.io/cloud-builders/git args: ['fetch', '--unshallow'] -- id: pre-commits-setup - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - '-c' - - | - set -e - pre-commit install --install-hooks - time tflint --init -- id: pre-commit-run - waitFor: - - pre-commits-setup - name: >- - us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder - entrypoint: /bin/bash - args: - - '-c' - - SKIP=go-unit-tests pre-commit run --all-files - id: make-tests waitFor: - - pre-commits-setup - git-fetch-unshallow name: >- us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder From 3e61f296c91c8562ecaa52c2e60638542ccf32b2 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 7 Feb 2024 14:17:41 -0600 Subject: [PATCH 142/151] Create GitHub Action to run pre-commit - pre-commit verification will run on every Pull Request - if the user opts in with the label "pre-commit-autofix" the user can request that pre-commit add a commit that fixes formatting, where it is capable of automatically fixing formatting --- .github/workflows/pr-precommit.yml | 55 ++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 .github/workflows/pr-precommit.yml diff --git a/.github/workflows/pr-precommit.yml b/.github/workflows/pr-precommit.yml new file mode 100644 index 0000000000..23ffc30cf7 --- /dev/null +++ b/.github/workflows/pr-precommit.yml @@ -0,0 +1,55 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +--- +name: 'Use pre-commit to validate Pull Request' + +# yamllint disable-line rule:truthy +on: + pull_request: + types: + - edited + - opened + - labeled + - synchronize + branches: + - develop + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + check-latest: true + cache: 'pip' + - uses: actions/setup-go@v5 + with: + go-version: '1.22' + check-latest: true + - run: make install-dev-deps + - uses: terraform-linters/setup-tflint@v4 + with: + tflint_version: v0.49.0 + - uses: pre-commit/action@v3.0.1 + - uses: pre-commit-ci/lite-action@v1.0.2 + # this if statement looks funny but it ensures that this step runs + # only if: user has applied "pre-commit-autofix" label + # even if: job has failed + # not if: job is canceled + if: | + (success() || failure()) && + contains(github.event.pull_request.labels.*.name, 'pre-commit-autofix') From f1b8a572c02a3ebea1e2ed125998240781032fcf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 7 Feb 2024 12:29:47 -0800 Subject: [PATCH 143/151] Bump django from 4.2.7 to 4.2.10 in /community/front-end/ofe (#2213) Bumps [django](https://github.com/django/django) from 4.2.7 to 4.2.10. - [Commits](https://github.com/django/django/compare/4.2.7...4.2.10) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 23b05421ec..c367dfb185 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -19,7 +19,7 @@ dill==0.3.6 distlib==0.3.6 # django-revproxy==0.11.0 released but not yet in pypi git+https://github.com/jazzband/django-revproxy.git@d2234005135dc0771b7c4e0bb0465664ccfa5787 -Django==4.2.7 +Django==4.2.10 django-allauth==0.54.0 django-extensions==3.2.3 djangorestframework==3.14.0 From 88574e3616dce6bb6ac86aded54e14ed03499e0c Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 7 Feb 2024 13:38:01 -0800 Subject: [PATCH 144/151] Don't run `destroy_resource_policies` before `destroy_nodes` is done (#2217) ```sh module.slurm_controller.module.cleanup_compute_nodes[0].null_resource.destroy_nodes_on_destroy[0]: Destruction complete after 2m52s module.slurm_controller.module.cleanup_resource_policies[0].null_resource.destroy_resource_policies_on_destroy[0]: Destroying... 
[id=89627024760583747 65] ``` --- .../schedmd-slurm-gcp-v6-controller/controller.tf | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 41b7a3146e..09061581e1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -158,10 +158,15 @@ module "cleanup_compute_nodes" { project_id = var.project_id when_destroy = true - # Depend on controller network, as a best effort to avoid - # subnetwork resourceInUseByAnotherResource error - # NOTE: Can not use nodeset subnetworks as "A static list expression is required" - depends_on = [var.subnetwork_self_link] + + depends_on = [ + # Depend on controller network, as a best effort to avoid + # subnetwork resourceInUseByAnotherResource error + # NOTE: Can not use nodeset subnetworks as "A static list expression is required" + var.subnetwork_self_link, + # Ensure VMs are destroyed before resource policies + module.cleanup_resource_policies[0], + ] } From 9c92d1b23882da74a560faab69e5e65d70225a18 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 7 Feb 2024 16:17:52 -0600 Subject: [PATCH 145/151] Rename HTC Slurm configuration templates with explicit purpose --- community/examples/htc-slurm.yaml | 4 ++-- .../etc/{slurm.conf.tpl => htc-slurm.conf.tpl} | 0 .../etc/{slurmdbd.conf.tpl => htc-slurmdbd.conf.tpl} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/{slurm.conf.tpl => htc-slurm.conf.tpl} (100%) rename community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/{slurmdbd.conf.tpl => htc-slurmdbd.conf.tpl} (100%) diff --git a/community/examples/htc-slurm.yaml b/community/examples/htc-slurm.yaml index 53a1afc833..814f76dd9a 100644 --- a/community/examples/htc-slurm.yaml +++ b/community/examples/htc-slurm.yaml @@ -147,8 +147,8 @@ deployment_groups: settings: machine_type: c2-standard-8 disable_controller_public_ips: $(vars.disable_public_ips) - slurm_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/slurm.conf.tpl - slurmdbd_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/slurmdbd.conf.tpl + slurm_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurm.conf.tpl + slurmdbd_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurmdbd.conf.tpl - id: slurm_login source: community/modules/scheduler/schedmd-slurm-gcp-v5-login diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/slurm.conf.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurm.conf.tpl similarity index 100% rename from community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/slurm.conf.tpl rename to community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurm.conf.tpl diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/slurmdbd.conf.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurmdbd.conf.tpl similarity index 100% rename from community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/slurmdbd.conf.tpl rename to community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurmdbd.conf.tpl From 
fea94d42b93d28622a12b318c3f7d18829547954 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 7 Feb 2024 16:17:52 -0600 Subject: [PATCH 146/151] Add Slurm configuration template for long Prolog/Epilog scripts --- .../etc/long-prolog-slurm.conf.tpl | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/long-prolog-slurm.conf.tpl diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/long-prolog-slurm.conf.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/long-prolog-slurm.conf.tpl new file mode 100644 index 0000000000..5ae4184db3 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/long-prolog-slurm.conf.tpl @@ -0,0 +1,70 @@ +# slurm.conf +# https://slurm.schedmd.com/slurm.conf.html +# https://slurm.schedmd.com/configurator.html + +ProctrackType=proctrack/cgroup +SlurmctldPidFile=/var/run/slurm/slurmctld.pid +SlurmdPidFile=/var/run/slurm/slurmd.pid +TaskPlugin=task/affinity,task/cgroup +MaxNodeCount=64000 + +# +# +# SCHEDULING +SchedulerType=sched/backfill +SelectType=select/cons_tres +SelectTypeParameters=CR_Core_Memory + +# +# +# LOGGING AND ACCOUNTING +AccountingStoreFlags=job_comment +JobAcctGatherFrequency=30 +JobAcctGatherType=jobacct_gather/cgroup +SlurmctldDebug=info +SlurmdDebug=info +DebugFlags=Power + +# +# +# TIMERS +MessageTimeout=60 +BatchStartTimeout=600 +PrologEpilogTimeout=600 +PrologFlags=Contain + +################################################################################ +# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv # +################################################################################ + +SlurmctldHost={control_host}({control_addr}) + +AuthType=auth/munge +AuthInfo=cred_expire=120 +AuthAltTypes=auth/jwt +CredType=cred/munge +MpiDefault={mpi_default} +ReturnToService=2 +SlurmctldPort={control_host_port} +SlurmdPort=6818 +SlurmdSpoolDir=/var/spool/slurmd +SlurmUser=slurm +StateSaveLocation={state_save} + +# +# +# LOGGING AND ACCOUNTING +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageHost={control_host} +ClusterName={name} +SlurmctldLogFile={slurmlog}/slurmctld.log +SlurmdLogFile={slurmlog}/slurmd-%n.log + +# +# +# GENERATED CLOUD CONFIGURATIONS +include cloud.conf + +################################################################################ +# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ # +################################################################################ From c5eea26effe55756f658f913886f4b6e71e89ac1 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 8 Feb 2024 14:16:02 -0600 Subject: [PATCH 147/151] Adopt empty string as default value for maintenance_interval The default value of null cannot be set as a deployment variable; this will allow the value to be set at the top of a blueprint. --- .../compute/schedmd-slurm-gcp-v5-node-group/README.md | 2 +- .../schedmd-slurm-gcp-v5-node-group/variables.tf | 10 +++++----- tools/validate_configs/test_configs/node-groups.yaml | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md index d37f6d947a..29b04cf17f 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md @@ -141,7 +141,7 @@ No modules. 
| [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition<br>variables such as machine\_type and instance\_image will be ignored in favor<br>of the provided instance template.<br><br>For more information on creating custom images for the instance template<br>that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section<br>
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | -| [maintenance\_interval](#input\_maintenance\_interval) | Specifies the frequency of planned maintenance events. Must be unset (null) or "PERIODIC". | `string` | `null` | no | +| [maintenance\_interval](#input\_maintenance\_interval) | Specifies the frequency of planned maintenance events. Must be "PERIODIC" or empty string to not use this feature. | `string` | `""` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | | [name](#input\_name) | Name of the node group. | `string` | `"ghpc"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf index bbf3848b43..04fc1900f6 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf @@ -414,14 +414,14 @@ variable "additional_networks" { } variable "maintenance_interval" { - description = "Specifies the frequency of planned maintenance events. Must be unset (null) or \"PERIODIC\"." - default = null + description = "Specifies the frequency of planned maintenance events. Must be \"PERIODIC\" or empty string to not use this feature." + default = "" type = string - nullable = true + nullable = false validation { - condition = var.maintenance_interval == null || var.maintenance_interval == "PERIODIC" - error_message = "var.maintenance_interval must be unset (null) or set to \"PERIODIC\"" + condition = contains(["", "PERIODIC"], var.maintenance_interval) + error_message = "var.maintenance_interval must be the empty string or \"PERIODIC\"" } } diff --git a/tools/validate_configs/test_configs/node-groups.yaml b/tools/validate_configs/test_configs/node-groups.yaml index 28c776f4c1..026457c949 100644 --- a/tools/validate_configs/test_configs/node-groups.yaml +++ b/tools/validate_configs/test_configs/node-groups.yaml @@ -139,7 +139,7 @@ deployment_groups: instance_template: null labels: $(vars.labels) machine_type: n2-standard-16 - maintenance_interval: null + maintenance_interval: "" metadata: {} min_cpu_platform: null on_host_maintenance: TERMINATE From 7d749a12247d5ab12d13442b07e63fad9c5578f6 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 9 Feb 2024 16:17:45 -0600 Subject: [PATCH 148/151] Remove enable_devel from slurm-gcp v5 examples --- community/examples/hpc-slurm-chromedesktop.yaml | 1 - community/examples/hpc-slurm-local-ssd.yaml | 1 - community/examples/hpc-slurm-ramble-gromacs.yaml | 1 - community/examples/hpc-slurm-ubuntu2004.yaml | 1 - community/examples/htc-slurm.yaml | 1 - community/examples/tutorial-starccm-slurm.yaml | 1 - examples/cae/cae-slurm.yaml | 1 - examples/hpc-enterprise-slurm.yaml | 1 - examples/image-builder.yaml | 1 - examples/ml-slurm.yaml | 1 - 10 files changed, 10 deletions(-) diff --git a/community/examples/hpc-slurm-chromedesktop.yaml b/community/examples/hpc-slurm-chromedesktop.yaml index 0e1a9c6e36..c6a31d9337 100644 --- a/community/examples/hpc-slurm-chromedesktop.yaml +++ 
b/community/examples/hpc-slurm-chromedesktop.yaml @@ -17,7 +17,6 @@ blueprint_name: slurm-crd vars: - enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: slurm-crd-01 region: us-central1 diff --git a/community/examples/hpc-slurm-local-ssd.yaml b/community/examples/hpc-slurm-local-ssd.yaml index e3ebcacc56..c8b18d1f8f 100644 --- a/community/examples/hpc-slurm-local-ssd.yaml +++ b/community/examples/hpc-slurm-local-ssd.yaml @@ -17,7 +17,6 @@ blueprint_name: hpc-slurm-local-ssd vars: - enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: hpc-localssd region: us-central1 diff --git a/community/examples/hpc-slurm-ramble-gromacs.yaml b/community/examples/hpc-slurm-ramble-gromacs.yaml index 7efb91079d..15e6577c95 100644 --- a/community/examples/hpc-slurm-ramble-gromacs.yaml +++ b/community/examples/hpc-slurm-ramble-gromacs.yaml @@ -17,7 +17,6 @@ blueprint_name: hpc-slurm-ramble-gromacs vars: - enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: hpc-slurm-ramble-gromacs region: us-central1 diff --git a/community/examples/hpc-slurm-ubuntu2004.yaml b/community/examples/hpc-slurm-ubuntu2004.yaml index ae2deeb205..637a167602 100644 --- a/community/examples/hpc-slurm-ubuntu2004.yaml +++ b/community/examples/hpc-slurm-ubuntu2004.yaml @@ -17,7 +17,6 @@ blueprint_name: hpc-slurm-ubuntu2004 vars: - enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: slurm-gcp-v5 region: us-west4 diff --git a/community/examples/htc-slurm.yaml b/community/examples/htc-slurm.yaml index 814f76dd9a..25abc6c1f0 100644 --- a/community/examples/htc-slurm.yaml +++ b/community/examples/htc-slurm.yaml @@ -23,7 +23,6 @@ blueprint_name: htc-slurm vars: - enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: htc-slurm region: us-west4 diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml index e450f59d21..db18855352 100644 --- a/community/examples/tutorial-starccm-slurm.yaml +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -17,7 +17,6 @@ blueprint_name: starccm-on-slurm vars: - enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: starccm-slurm region: us-central1 diff --git a/examples/cae/cae-slurm.yaml b/examples/cae/cae-slurm.yaml index ab641f96f6..7d2f6ee2cf 100644 --- a/examples/cae/cae-slurm.yaml +++ b/examples/cae/cae-slurm.yaml @@ -28,7 +28,6 @@ # blueprint_name: cae-slurm vars: - enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: cae-slurm # check here for other regions with H3 deployments: https://cloud.google.com/compute/docs/regions-zones diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index fb710ee028..57acb9cf8c 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -17,7 +17,6 @@ blueprint_name: hpc-enterprise-slurm vars: - enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: hpc01 region: us-central1 diff --git a/examples/image-builder.yaml b/examples/image-builder.yaml index fb842e009b..3a11e001b3 100644 --- a/examples/image-builder.yaml +++ b/examples/image-builder.yaml @@ -21,7 +21,6 @@ blueprint_name: image-builder vars: - enable_devel: true project_id: ## Set GCP Project ID Here ## deployment_name: image-builder-001 region: us-central1 diff --git a/examples/ml-slurm.yaml b/examples/ml-slurm.yaml index 780b4b722b..00f77b0afb 100644 --- a/examples/ml-slurm.yaml +++ 
b/examples/ml-slurm.yaml @@ -16,7 +16,6 @@ blueprint_name: ml-slurm vars: - enable_devel: true project_id: ## Set project id here deployment_name: ml-example region: asia-southeast1 From bbbb2e38dffa837e3dd73db00d91f89fcbd7a347 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Sat, 10 Feb 2024 00:40:25 +0000 Subject: [PATCH 149/151] Add login node to spack gromacs tutorial example --- docs/tutorials/gromacs/spack-gromacs.md | 23 ++++++++++++----------- docs/tutorials/gromacs/spack-gromacs.yaml | 15 ++++++++++++--- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/docs/tutorials/gromacs/spack-gromacs.md b/docs/tutorials/gromacs/spack-gromacs.md index ce8400e1e5..67bd157926 100644 --- a/docs/tutorials/gromacs/spack-gromacs.md +++ b/docs/tutorials/gromacs/spack-gromacs.md @@ -84,6 +84,7 @@ This file describes the cluster you will deploy. It defines: * sets up a Spack environment including downloading an example input deck * places a submission script on a shared drive * a Slurm cluster + * a Slurm login node * a Slurm controller * An auto-scaling Slurm partition @@ -138,21 +139,21 @@ the final output from the above command: Optionally while you wait, you can see your deployed VMs on Google Cloud Console. Open the link below in a new window. Look for -`spackgroma-controller`. If you don't +`spackgroma-controller` and `spackgroma-login-login-001`. If you don't see your VMs make sure you have the correct project selected (top left). ```text https://console.cloud.google.com/compute?project= ``` -## Connecting to the controller node +## Connecting to the login node -Once the startup script has completed, connect to the controller node. +Once the startup script has completed, connect to the login node. -Use the following command to ssh into the controller node from cloud shell: +Use the following command to ssh into the login node from cloud shell: ```bash -gcloud compute ssh spackgroma-controller --zone us-central1-c --project +gcloud compute ssh spackgroma-login-login-001 --zone us-central1-c --project ``` You may be prompted to set up SSH. If so follow the prompts and if asked for a @@ -176,15 +177,15 @@ following instructions: https://console.cloud.google.com/compute?project= ``` -1. Click on the `SSH` button associated with the `spackgroma-controller` +1. Click on the `SSH` button associated with the `spackgroma-login-login-001` instance. This will open a separate pop up window with a terminal into our newly - created Slurm controller VM. + created Slurm login VM. ## Run a Job on the Cluster - **The commands below should be run on the Slurm controller node.** + **The commands below should be run on the Slurm login node.** We will use the submission script (see line 122 of the blueprint) to submit a Gromacs job. @@ -233,7 +234,7 @@ Several files will have been generated in the `test_run/` folder you created. The `md.log` and `slurm-1.out` files have information on the run such as performance. You can view these files by running the following commands on the -controller node: +login node: ```bash cat slurm-*.out @@ -258,9 +259,9 @@ https://console.cloud.google.com/monitoring/dashboards?project= **_NOTE:_** If you are accessing the controller node terminal via a separate pop-up +> **_NOTE:_** If you are accessing the login node terminal via a separate pop-up > then make sure to call `exit` in the pop-up window. 
```bash diff --git a/docs/tutorials/gromacs/spack-gromacs.yaml b/docs/tutorials/gromacs/spack-gromacs.yaml index 285443c0b8..014a0d5d9b 100644 --- a/docs/tutorials/gromacs/spack-gromacs.yaml +++ b/docs/tutorials/gromacs/spack-gromacs.yaml @@ -103,7 +103,7 @@ deployment_groups: spack install fi - - id: controller-setup + - id: login-setup source: modules/scripts/startup-script settings: runners: @@ -121,6 +121,7 @@ deployment_groups: #!/bin/bash source /opt/apps/spack/share/spack/setup-env.sh spack env activate gromacs + mkdir -p /opt/apps/gromacs cd /opt/apps/gromacs wget --no-verbose https://ftp.gromacs.org/pub/benchmarks/water_GMX50_bare.tar.gz tar xzf water_GMX50_bare.tar.gz @@ -158,13 +159,21 @@ deployment_groups: use: [compute_nodeset] settings: partition_name: compute + is_default: true + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network1] + settings: + name_prefix: login - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network1 - compute_partition + - slurm_login settings: disable_controller_public_ips: false - controller_startup_scripts_timeout: 21600 - controller_startup_script: $(controller-setup.startup_script) + login_startup_scripts_timeout: 21600 + login_startup_script: $(login-setup.startup_script) From 306702c5dc47e3f528d26e63772cfa1b58720a5d Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 12 Feb 2024 13:21:40 -0800 Subject: [PATCH 150/151] Version bump to 1.28.0 (#2232) --- cmd/root.go | 2 +- community/modules/compute/gke-node-pool/versions.tf | 2 +- community/modules/compute/htcondor-execute-point/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-node-group/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-partition/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf | 2 +- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-partition/versions.tf | 2 +- .../modules/database/slurm-cloudsql-federation/versions.tf | 4 ++-- .../modules/file-system/cloud-storage-bucket/versions.tf | 2 +- .../modules/file-system/gke-persistent-volume/versions.tf | 2 +- community/modules/file-system/nfs-server/versions.tf | 2 +- community/modules/files/fsi-montecarlo-on-batch/versions.tf | 4 ++-- community/modules/project/service-enablement/versions.tf | 2 +- community/modules/pubsub/bigquery-sub/versions.tf | 4 ++-- community/modules/pubsub/topic/versions.tf | 2 +- community/modules/scheduler/gke-cluster/versions.tf | 2 +- community/modules/scheduler/htcondor-access-point/versions.tf | 2 +- .../modules/scheduler/htcondor-central-manager/versions.tf | 2 +- community/modules/scheduler/htcondor-pool-secrets/versions.tf | 2 +- community/modules/scheduler/htcondor-setup/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v5-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf | 2 +- community/modules/scripts/wait-for-startup/versions.tf | 2 +- community/modules/scripts/windows-startup-script/versions.tf | 2 +- modules/compute/vm-instance/versions.tf | 4 ++-- modules/file-system/filestore/versions.tf | 4 ++-- modules/monitoring/dashboard/versions.tf | 2 +- modules/network/pre-existing-vpc/versions.tf | 2 +- modules/scheduler/batch-login-node/versions.tf | 2 +- modules/scripts/startup-script/versions.tf | 2 +- 33 files changed, 38 insertions(+), 38 
deletions(-) diff --git a/cmd/root.go b/cmd/root.go index 224776184a..cdc1253ce6 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -52,7 +52,7 @@ HPC deployments on the Google Cloud Platform.`, logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.27.0", + Version: "v1.28.0", Annotations: annotation, } ) diff --git a/community/modules/compute/gke-node-pool/versions.tf b/community/modules/compute/gke-node-pool/versions.tf index d1d4b272bd..4ed2544640 100644 --- a/community/modules/compute/gke-node-pool/versions.tf +++ b/community/modules/compute/gke-node-pool/versions.tf @@ -26,6 +26,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.28.0" } } diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index e395fa31c8..858518525c 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.28.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index 06f4753f02..19d0bbf0e8 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.28.0" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index ad83afa3de..301a63cc4b 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.28.0" } required_version = ">= 0.13.0" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index ff69b26230..43152e7c62 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.28.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index 1c02561ddc..2477998535 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } 
provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.28.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index b671307427..b4b652aca8 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.28.0" } } diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 4c9c03e98b..3535356df8 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -30,10 +30,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.28.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.28.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index cdecd164d2..25acceb7ba 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.28.0" } required_version = ">= 0.14.0" } diff --git a/community/modules/file-system/gke-persistent-volume/versions.tf b/community/modules/file-system/gke-persistent-volume/versions.tf index 0ed9c0ee1e..e6e289d163 100644 --- a/community/modules/file-system/gke-persistent-volume/versions.tf +++ b/community/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.28.0" } } diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index 65282a9121..ac300565fb 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.28.0" } required_version = ">= 0.14.0" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index 48dafa2ba3..c67ccf8ec0 100644 --- a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } 
} provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.28.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.28.0" } } diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index 883632dcb2..da6e2c4e6c 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.28.0" } required_version = ">= 0.14.0" diff --git a/community/modules/pubsub/bigquery-sub/versions.tf b/community/modules/pubsub/bigquery-sub/versions.tf index d76add2595..09d63000d5 100644 --- a/community/modules/pubsub/bigquery-sub/versions.tf +++ b/community/modules/pubsub/bigquery-sub/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.28.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.28.0" } required_version = ">= 1.0" } diff --git a/community/modules/pubsub/topic/versions.tf b/community/modules/pubsub/topic/versions.tf index 2620a02775..d5e5879c67 100644 --- a/community/modules/pubsub/topic/versions.tf +++ b/community/modules/pubsub/topic/versions.tf @@ -27,6 +27,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:topic/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:topic/v1.28.0" } } diff --git a/community/modules/scheduler/gke-cluster/versions.tf b/community/modules/scheduler/gke-cluster/versions.tf index 010cd59822..e9ac5ee7f4 100644 --- a/community/modules/scheduler/gke-cluster/versions.tf +++ b/community/modules/scheduler/gke-cluster/versions.tf @@ -30,6 +30,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.28.0" } } diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index de3bf85848..afbe17a41c 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.28.0" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index b3e74b2579..faf5e2127a 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.27.0" + 
module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.28.0" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 93df6ec656..686d84990a 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.28.0" } required_version = ">= 0.13.0" diff --git a/community/modules/scheduler/htcondor-setup/versions.tf b/community/modules/scheduler/htcondor-setup/versions.tf index 1e13bdbd36..2d4efcd7a8 100644 --- a/community/modules/scheduler/htcondor-setup/versions.tf +++ b/community/modules/scheduler/htcondor-setup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-setup/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-setup/v1.28.0" } required_version = ">= 0.13.0" diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index 6aaf32455e..3a48d4652c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.28.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index 20b0a31fdf..cd04e4b7c9 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.28.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf index 94be8259e1..465a514c1c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.28.0" } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf index a09f51df99..b82432d3a1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.27.0" + module_name = 
"blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.28.0" } } diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index 7de489f570..ff952f4950 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.28.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index 9f5cc3a6a9..e0d9d365ea 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.28.0" } required_version = ">= 0.14.0" diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 0efb9dc3c4..a77fde0823 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.28.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.28.0" } required_version = ">= 1.2.0" diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index a026f56db5..8eb7e8c989 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.28.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.28.0" } required_version = ">= 0.14.0" diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index 169c30796b..79c82e8ada 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.28.0" } required_version = ">= 0.14.0" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index fd9da26b42..abbc2e6027 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.28.0" } required_version = ">= 0.14.0" diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf index 6844e61f9c..cbb5d77b5d 100644 --- 
a/modules/scheduler/batch-login-node/versions.tf +++ b/modules/scheduler/batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.28.0" } required_version = ">= 0.14.0" diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index 42379ee359..594b4b6a6d 100644 --- a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.27.0" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.28.0" } required_version = ">= 0.14.0" From 78c580216f70a733fcf4d5675d7e4adf5dfe0a32 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 15 Feb 2024 16:30:34 -0600 Subject: [PATCH 151/151] Bump version to v1.28.1 --- cmd/root.go | 2 +- community/modules/compute/gke-node-pool/versions.tf | 2 +- community/modules/compute/htcondor-execute-point/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-node-group/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-partition/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf | 2 +- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-partition/versions.tf | 2 +- .../modules/database/slurm-cloudsql-federation/versions.tf | 4 ++-- .../modules/file-system/cloud-storage-bucket/versions.tf | 2 +- .../modules/file-system/gke-persistent-volume/versions.tf | 2 +- community/modules/file-system/nfs-server/versions.tf | 2 +- community/modules/files/fsi-montecarlo-on-batch/versions.tf | 4 ++-- community/modules/project/service-enablement/versions.tf | 2 +- community/modules/pubsub/bigquery-sub/versions.tf | 4 ++-- community/modules/pubsub/topic/versions.tf | 2 +- community/modules/scheduler/gke-cluster/versions.tf | 2 +- community/modules/scheduler/htcondor-access-point/versions.tf | 2 +- .../modules/scheduler/htcondor-central-manager/versions.tf | 2 +- community/modules/scheduler/htcondor-pool-secrets/versions.tf | 2 +- community/modules/scheduler/htcondor-setup/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v5-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf | 2 +- community/modules/scripts/wait-for-startup/versions.tf | 2 +- community/modules/scripts/windows-startup-script/versions.tf | 2 +- modules/compute/vm-instance/versions.tf | 4 ++-- modules/file-system/filestore/versions.tf | 4 ++-- modules/monitoring/dashboard/versions.tf | 2 +- modules/network/pre-existing-vpc/versions.tf | 2 +- modules/scheduler/batch-login-node/versions.tf | 2 +- modules/scripts/startup-script/versions.tf | 2 +- 33 files changed, 38 insertions(+), 38 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index cdc1253ce6..ba077f2750 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -52,7 +52,7 @@ HPC deployments on the Google Cloud Platform.`, logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.28.0", + Version: "v1.28.1", Annotations: annotation, } ) diff --git a/community/modules/compute/gke-node-pool/versions.tf b/community/modules/compute/gke-node-pool/versions.tf index 4ed2544640..71c57be776 100644 --- 
a/community/modules/compute/gke-node-pool/versions.tf +++ b/community/modules/compute/gke-node-pool/versions.tf @@ -26,6 +26,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.28.1" } } diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index 858518525c..101d0a0830 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.28.1" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index 19d0bbf0e8..4f366c4b9c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.28.1" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index 301a63cc4b..a899a36b26 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.28.1" } required_version = ">= 0.13.0" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index 43152e7c62..9aeab93394 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.28.1" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index 2477998535..1ff5728890 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.28.1" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index b4b652aca8..4fb6264eb6 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ 
b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.28.1" } } diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 3535356df8..fd1524e946 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -30,10 +30,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.28.1" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.28.1" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index 25acceb7ba..8526f45f39 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.28.1" } required_version = ">= 0.14.0" } diff --git a/community/modules/file-system/gke-persistent-volume/versions.tf b/community/modules/file-system/gke-persistent-volume/versions.tf index e6e289d163..32acf35767 100644 --- a/community/modules/file-system/gke-persistent-volume/versions.tf +++ b/community/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.28.1" } } diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index ac300565fb..c5c8fd51fc 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.28.1" } required_version = ">= 0.14.0" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index c67ccf8ec0..9e8f6bc869 100644 --- a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.28.1" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.28.1" } } diff --git a/community/modules/project/service-enablement/versions.tf 
b/community/modules/project/service-enablement/versions.tf index da6e2c4e6c..f3eae8670e 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.28.1" } required_version = ">= 0.14.0" diff --git a/community/modules/pubsub/bigquery-sub/versions.tf b/community/modules/pubsub/bigquery-sub/versions.tf index 09d63000d5..798ed45617 100644 --- a/community/modules/pubsub/bigquery-sub/versions.tf +++ b/community/modules/pubsub/bigquery-sub/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.28.1" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.28.1" } required_version = ">= 1.0" } diff --git a/community/modules/pubsub/topic/versions.tf b/community/modules/pubsub/topic/versions.tf index d5e5879c67..b6b090b6f1 100644 --- a/community/modules/pubsub/topic/versions.tf +++ b/community/modules/pubsub/topic/versions.tf @@ -27,6 +27,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:topic/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:topic/v1.28.1" } } diff --git a/community/modules/scheduler/gke-cluster/versions.tf b/community/modules/scheduler/gke-cluster/versions.tf index e9ac5ee7f4..9049a6e4b4 100644 --- a/community/modules/scheduler/gke-cluster/versions.tf +++ b/community/modules/scheduler/gke-cluster/versions.tf @@ -30,6 +30,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.28.1" } } diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index afbe17a41c..94fc3499dc 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.28.1" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index faf5e2127a..9245cd3c5f 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.28.1" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 686d84990a..ba70633daa 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta 
"google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.28.1" } required_version = ">= 0.13.0" diff --git a/community/modules/scheduler/htcondor-setup/versions.tf b/community/modules/scheduler/htcondor-setup/versions.tf index 2d4efcd7a8..1b19a3f751 100644 --- a/community/modules/scheduler/htcondor-setup/versions.tf +++ b/community/modules/scheduler/htcondor-setup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-setup/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-setup/v1.28.1" } required_version = ">= 0.13.0" diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index 3a48d4652c..f915ec9723 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.28.1" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index cd04e4b7c9..06cfaa93ae 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.28.1" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf index 465a514c1c..59f3cb746d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.28.1" } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf index b82432d3a1..f1e679c49e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.28.1" } } diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index ff952f4950..1b227ea480 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.28.0" + module_name = 
"blueprints/terraform/hpc-toolkit:wait-for-startup/v1.28.1" } required_version = ">= 0.14.0" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index e0d9d365ea..2e0fd50b80 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.28.1" } required_version = ">= 0.14.0" diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index a77fde0823..7695d3f93a 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.28.1" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.28.1" } required_version = ">= 1.2.0" diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index 8eb7e8c989..eaee1766ff 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.28.1" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.28.1" } required_version = ">= 0.14.0" diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index 79c82e8ada..8d0af26510 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.28.1" } required_version = ">= 0.14.0" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index abbc2e6027..f211424223 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.28.1" } required_version = ">= 0.14.0" diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf index cbb5d77b5d..8859e492a0 100644 --- a/modules/scheduler/batch-login-node/versions.tf +++ b/modules/scheduler/batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.28.1" } required_version = ">= 0.14.0" diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index 594b4b6a6d..eb900c7936 100644 --- a/modules/scripts/startup-script/versions.tf +++ 
b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.28.0" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.28.1" } required_version = ">= 0.14.0"
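
Every versions.tf touched by these release-bump commits follows the same shape, which is why each bump rewrites exactly one line per module. A minimal sketch of that layout follows; the module path, provider constraint, and Terraform version shown here are illustrative placeholders, not copied from any specific module above:

terraform {
  required_providers {
    google = {
      source  = "hashicorp/google"
      version = ">= 4.0" # illustrative; each module pins its own constraint
    }
  }

  # provider_meta attaches module identification metadata to requests made
  # through the google provider. The toolkit release is embedded in
  # module_name, so a version bump edits only this line in each module.
  provider_meta "google" {
    module_name = "blueprints/terraform/hpc-toolkit:example-module/v1.28.1"
  }

  required_version = ">= 0.14.0"
}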