diff --git a/.gitignore b/.gitignore index 5f333ceec8..29937edb09 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ # Built Binary ghpc +# Expand artifact +expanded.yaml # macOS Desktop Services Store .DS_Store # workspace level vscode settings diff --git a/cmd/create.go b/cmd/create.go index 4f1afd7b6f..648b1dde1e 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -100,7 +100,7 @@ func printAdvancedInstructionsMessage(deplDir string) { func expandOrDie(path string) config.DeploymentConfig { dc, ctx, err := config.NewDeploymentConfig(path) if err != nil { - log.Fatal(err) + log.Fatal(renderError(err, ctx)) } // Set properties from CLI if err := setCLIVariables(&dc.Config, cliVariables); err != nil { @@ -193,11 +193,23 @@ func renderError(err error, ctx config.YamlCtx) string { } func renderRichError(err error, pos config.Pos, ctx config.YamlCtx) string { + line := pos.Line - 1 + if line < 0 { + line = 0 + } + if line >= len(ctx.Lines) { + line = len(ctx.Lines) - 1 + } + pref := fmt.Sprintf("%d: ", pos.Line) - arrow := strings.Repeat(" ", len(pref)+pos.Column-1) + "^" + arrow := " " + if pos.Column > 0 { + spaces := strings.Repeat(" ", len(pref)+pos.Column-1) + arrow = spaces + "^" + } return fmt.Sprintf(`Error: %s %s%s -%s`, err, pref, ctx.Lines[pos.Line-1], arrow) +%s`, err, pref, ctx.Lines[line], arrow) } func setCLIVariables(bp *config.Blueprint, s []string) error { diff --git a/cmd/create_test.go b/cmd/create_test.go index b1a2e02e06..a5127b1de6 100644 --- a/cmd/create_test.go +++ b/cmd/create_test.go @@ -137,14 +137,14 @@ func (s *MySuite) TestRenderError(c *C) { c.Check(got, Equals, "arbuz") } { // has pos, but context doesn't contain it - ctx := config.NewYamlCtx([]byte(``)) + ctx, _ := config.NewYamlCtx([]byte(``)) pth := config.Root.Vars.Dot("kale") err := config.BpError{Path: pth, Err: errors.New("arbuz")} got := renderError(err, ctx) c.Check(got, Equals, "arbuz") } { // has pos, has context - ctx := config.NewYamlCtx([]byte(` + ctx, _ := config.NewYamlCtx([]byte(` vars: kale: dos`)) pth := config.Root.Vars.Dot("kale") @@ -152,7 +152,7 @@ vars: got := renderError(err, ctx) c.Check(got, Equals, `Error: arbuz 3: kale: dos - ^`) + ^`) } } @@ -161,5 +161,6 @@ func (s *MySuite) TestValidateMaybeDie(c *C) { Validators: []config.Validator{{Validator: "invalid"}}, ValidationLevel: config.ValidationWarning, } - validateMaybeDie(bp, config.NewYamlCtx([]byte{})) // smoke test + ctx, _ := config.NewYamlCtx([]byte{}) + validateMaybeDie(bp, ctx) // smoke test } diff --git a/cmd/deploy.go b/cmd/deploy.go index 5f00e1d245..4894ff85bd 100644 --- a/cmd/deploy.go +++ b/cmd/deploy.go @@ -77,9 +77,11 @@ func runDeployCmd(cmd *cobra.Command, args []string) { expandedBlueprintFile := filepath.Join(artifactsDir, expandedBlueprintFilename) dc, _, err := config.NewDeploymentConfig(expandedBlueprintFile) cobra.CheckErr(err) - cobra.CheckErr(shell.ValidateDeploymentDirectory(dc.Config.DeploymentGroups, deploymentRoot)) + groups := dc.Config.DeploymentGroups + cobra.CheckErr(validateRuntimeDependencies(groups)) + cobra.CheckErr(shell.ValidateDeploymentDirectory(groups, deploymentRoot)) - for _, group := range dc.Config.DeploymentGroups { + for _, group := range groups { groupDir := filepath.Join(deploymentRoot, string(group.Name)) cobra.CheckErr(shell.ImportInputs(groupDir, artifactsDir, expandedBlueprintFile)) @@ -102,6 +104,25 @@ func runDeployCmd(cmd *cobra.Command, args []string) { printAdvancedInstructionsMessage(deploymentRoot) } +func validateRuntimeDependencies(groups []config.DeploymentGroup) 
error { + for _, group := range groups { + var err error + switch group.Kind() { + case config.PackerKind: + err = shell.ConfigurePacker() + case config.TerraformKind: + groupDir := filepath.Join(deploymentRoot, string(group.Name)) + _, err = shell.ConfigureTerraform(groupDir) + default: + err = fmt.Errorf("group %s is an unsupported kind %q", group.Name, group.Kind().String()) + } + if err != nil { + return err + } + } + return nil +} + func deployPackerGroup(moduleDir string) error { if err := shell.ConfigurePacker(); err != nil { return err @@ -133,9 +154,5 @@ func deployTerraformGroup(groupDir string) error { if err != nil { return err } - - if err = shell.ExportOutputs(tf, artifactsDir, applyBehavior); err != nil { - return err - } - return nil + return shell.ExportOutputs(tf, artifactsDir, applyBehavior) } diff --git a/cmd/root.go b/cmd/root.go index 6da2a8ddce..c5927edac2 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -50,7 +50,7 @@ HPC deployments on the Google Cloud Platform.`, log.Fatalf("cmd.Help function failed: %s", err) } }, - Version: "v1.24.0", + Version: "v1.25.0", Annotations: annotation, } ) diff --git a/community/examples/hpc-slurm6.yaml b/community/examples/hpc-slurm6.yaml new file mode 100644 index 0000000000..8478509974 --- /dev/null +++ b/community/examples/hpc-slurm6.yaml @@ -0,0 +1,82 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- + +blueprint_name: hpc-slurm6 + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: slurm-gcp-v6 + region: us-west4 + zone: us-west4-c + instance_image: + family: slurm-gcp-6-1-hpc-rocky-linux-8 + project: schedmd-slurm-public + +deployment_groups: +- group: primary + modules: + - id: network + source: modules/network/vpc + + - id: debug_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + name: ns1 + node_count_dynamic_max: 4 + machine_type: n2-standard-2 + enable_placement: false # the default is: true + + - id: debug_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: + - debug_nodeset + settings: + partition_name: debug + exclusive: false # allows nodes to stay up after jobs are done + is_default: true + + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + name: ns2 + node_count_dynamic_max: 20 + + - id: compute_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: + - compute_nodeset + settings: + partition_name: compute + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network] + settings: + group_name: login + machine_type: n2-standard-4 + disable_login_public_ips: false + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + use: + - network + - debug_partition + - compute_partition + - slurm_login + settings: + disable_controller_public_ips: false diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 4b98516283..28030e9016 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -78,7 +78,7 @@ tomlkit==0.11.8 typing-inspect==0.9.0 typing_extensions==4.6.3 uritemplate==4.1.1 -urllib3==1.26.17 +urllib3==1.26.18 uvicorn==0.22.0 virtualenv==20.23.1 wrapt==1.15.0 diff --git a/community/modules/compute/gke-node-pool/versions.tf b/community/modules/compute/gke-node-pool/versions.tf index 1e3b2fdb41..1dc74cfd3a 100644 --- a/community/modules/compute/gke-node-pool/versions.tf +++ b/community/modules/compute/gke-node-pool/versions.tf @@ -26,6 +26,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.25.0" } } diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index aeb8407c7d..aa2dcc870c 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -177,7 +177,7 @@ limitations under the License. 
|------|--------|---------| | [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 8.0 | | [mig](#module\_mig) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.22.1&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 | ## Resources diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 1e1ad6f50a..44631e8978 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -124,7 +124,7 @@ resource "google_storage_bucket_object" "execute_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.22.1&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4" project_id = var.project_id region = var.region diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index af450849ff..0c757eb836 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.25.0" } } diff --git a/community/modules/compute/pbspro-execution/README.md b/community/modules/compute/pbspro-execution/README.md index c348a945c4..5e423a0358 100644 --- a/community/modules/compute/pbspro-execution/README.md +++ b/community/modules/compute/pbspro-execution/README.md @@ -74,7 +74,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.22.1 | +| [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 | | [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.22.1 | | [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.22.1 | diff --git a/community/modules/compute/pbspro-execution/main.tf b/community/modules/compute/pbspro-execution/main.tf index 397bb76e67..ce0f7c0c73 100644 --- a/community/modules/compute/pbspro-execution/main.tf +++ b/community/modules/compute/pbspro-execution/main.tf @@ -53,7 +53,7 @@ module "pbs_install" { } module "execution_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.22.1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4" deployment_name = var.deployment_name project_id = var.project_id diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index 78d3d681ac..aaf79cc78e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.25.0" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index f4c4ef24e7..ef1b5d68c2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.25.0" } required_version = ">= 0.13.0" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md new file mode 100644 index 0000000000..f096b7dd47 --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -0,0 +1,44 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | + +## Providers + +No providers. + +## Modules + +No modules. + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [accelerator\_config](#input\_accelerator\_config) | Nodeset accelerator config, see https://cloud.google.com/tpu/docs/supported-tpu-configurations for details. |
object({
topology = string
version = string
})
|
{
"topology": "",
"version": ""
}
| no |
+| [data\_disks](#input\_data\_disks) | The data disks to include in the TPU node. | `list(string)` | `[]` | no |
+| [disable\_public\_ips](#input\_disable\_public\_ips) | If set to false, the node group VMs will have a random public IP assigned to them. Ignored if access\_config is set. | `bool` | `true` | no |
+| [docker\_image](#input\_docker\_image) | The GCP Container Registry docker image to use in the TPU VMs; it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-1-tf- | `string` | `null` | no |
+| [name](#input\_name) | Name of the TPU nodeset. | `string` | `"ghpc"` | no |
+| [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of dynamic nodes allowed in this partition. | `number` | `1` | no |
+| [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no |
+| [node\_type](#input\_node\_type) | Specify a node type to base the VM configuration upon. | `string` | n/a | yes |
+| [preemptible](#input\_preemptible) | Whether to use preemptible VMs to burst. | `bool` | `false` | no |
+| [preserve\_tpu](#input\_preserve\_tpu) | Specify whether TPU VMs are preserved on suspend: if set to true, the VM is stopped on suspend; if false, it is deleted. | `bool` | `true` | no |
+| [service\_account](#input\_service\_account) | Service account to attach to the TPU VM. If none is given, the default service account and scopes will be used. |
object({
email = string
scopes = set(string)
})
| `null` | no |
+| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The name of the subnetwork to attach the TPU VM of this nodeset to. | `string` | `null` | no |
+| [tf\_version](#input\_tf\_version) | Nodeset TensorFlow version, see https://cloud.google.com/tpu/docs/supported-tpu-configurations#tpu_vm for details. | `string` | `"2.9.1"` | no |
+| [zone](#input\_zone) | Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones. | `string` | n/a | yes |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| [nodeset\_tpu](#output\_nodeset\_tpu) | Details of the TPU nodeset. Typically used as input to `schedmd-slurm-gcp-v6-partition`. |
+
diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf
new file mode 100644
index 0000000000..18cccb4f4d
--- /dev/null
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf
@@ -0,0 +1,41 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# locals {
+#   # This label allows for billing report tracking based on module.
+#   labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-nodeset", ghpc_role = "compute" })
+# }
+
+locals {
+
+  nodeset_tpu = {
+    node_count_static      = var.node_count_static
+    node_count_dynamic_max = var.node_count_dynamic_max
+    nodeset_name           = var.name
+    node_type              = var.node_type
+
+    accelerator_config = var.accelerator_config
+    tf_version         = var.tf_version
+    preemptible        = var.preemptible
+    preserve_tpu       = var.preserve_tpu
+
+    data_disks   = var.data_disks
+    docker_image = var.docker_image
+
+    enable_public_ip = !var.disable_public_ips
+    subnetwork       = var.subnetwork_self_link
+    service_account  = var.service_account
+    zone             = var.zone
+  }
+}
diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/outputs.tf
new file mode 100644
index 0000000000..654301499e
--- /dev/null
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/outputs.tf
@@ -0,0 +1,18 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+output "nodeset_tpu" {
+  description = "Details of the TPU nodeset. Typically used as input to `schedmd-slurm-gcp-v6-partition`."
+  value       = local.nodeset_tpu
+}
diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf
new file mode 100644
index 0000000000..ac13f2dc5d
--- /dev/null
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf
@@ -0,0 +1,118 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+variable "node_count_static" {
+  description = "Number of nodes to be statically created."
+  type        = number
+  default     = 0
+}
+
+variable "node_count_dynamic_max" {
+  description = "Maximum number of dynamic nodes allowed in this partition."
+  type        = number
+  default     = 1
+}
+
+variable "name" {
+  description = "Name of the TPU nodeset."
+  type        = string
+  default     = "ghpc"
+
+  validation {
+    condition     = can(regex("^[a-z](?:[a-z0-9]{0,5})$", var.name))
+    error_message = "Nodeset TPU name (var.name) must begin with a letter, be fully alphanumeric and be 6 characters or less. Regexp: '^[a-z](?:[a-z0-9]{0,5})$'."
+  }
+}
+
+variable "disable_public_ips" {
+  description = "If set to false, the node group VMs will have a random public IP assigned to them. Ignored if access_config is set."
+  type        = bool
+  default     = true
+}
+
+variable "node_type" {
+  description = "Specify a node type to base the VM configuration upon."
+  type        = string
+}
+
+variable "accelerator_config" {
+  description = "Nodeset accelerator config, see https://cloud.google.com/tpu/docs/supported-tpu-configurations for details."
+  type = object({
+    topology = string
+    version  = string
+  })
+  default = {
+    topology = ""
+    version  = ""
+  }
+  validation {
+    condition     = var.accelerator_config.version == "" ? true : contains(["V2", "V3", "V4"], var.accelerator_config.version)
+    error_message = "accelerator_config.version must be one of [\"V2\", \"V3\", \"V4\"]."
+  }
+  validation {
+    condition     = var.accelerator_config.topology == "" ? true : can(regex("^[1-9]x[1-9](x[1-9])?$", var.accelerator_config.topology))
+    error_message = "accelerator_config.topology must be a valid topology, e.g. 2x2, 4x4x4, 4x2x4."
+  }
+}
+
+variable "tf_version" {
+  description = "Nodeset TensorFlow version, see https://cloud.google.com/tpu/docs/supported-tpu-configurations#tpu_vm for details."
+  type        = string
+  default     = "2.9.1"
+}
+
+variable "preemptible" {
+  description = "Whether to use preemptible VMs to burst."
+  type        = bool
+  default     = false
+}
+
+variable "preserve_tpu" {
+  description = "Specify whether TPU VMs are preserved on suspend: if set to true, the VM is stopped on suspend; if false, it is deleted."
+  type        = bool
+  default     = true
+}
+
+variable "zone" {
+  description = "Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones."
+  type        = string
+}
+
+variable "data_disks" {
+  description = "The data disks to include in the TPU node."
+  type        = list(string)
+  default     = []
+}
+
+variable "docker_image" {
+  description = "The GCP Container Registry docker image to use in the TPU VMs; it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-1-tf-"
+  type        = string
+  default     = null
+}
+
+variable "subnetwork_self_link" {
+  type        = string
+  description = "The name of the subnetwork to attach the TPU VM of this nodeset to."
+  default     = null
+}
+
+variable "service_account" {
+  type = object({
+    email  = string
+    scopes = set(string)
+  })
+
+  description = "Service account to attach to the TPU VM. If none is given, the default service account and scopes will be used."
+  default     = null
+}
diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf
new file mode 100644
index 0000000000..048120f43e
--- /dev/null
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf
@@ -0,0 +1,23 @@
+/**
+ * Copyright 2022 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+terraform {
+  required_version = ">= 1.3"
+
+  provider_meta "google" {
+    module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.25.0"
+  }
+}
diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md
new file mode 100644
index 0000000000..829e9796da
--- /dev/null
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md
@@ -0,0 +1,200 @@
+## Description
+
+This module creates a nodeset data structure intended to be input to the
+[schedmd-slurm-gcp-v6-partition](../schedmd-slurm-gcp-v6-partition/) module.
+
+Nodesets allow adding heterogeneous node types to a partition, and hence
+running jobs that mix multiple node characteristics. See the [heterogeneous jobs
+section][hetjobs] of the SchedMD documentation for more information.
+
+To specify nodes from a specific nodeset in a partition, the [`--nodelist`]
+(or `-w`) flag can be used, for example:
+
+```bash
+srun -N 3 -p compute --nodelist cluster-compute-group-[0-2] hostname
+```
+
+Here, the 3 nodes will be selected from the nodes `cluster-compute-group-[0-2]`
+in the compute partition.
+
+Additionally, depending on how the nodes differ, a constraint can be added via
+the [`--constraint`] (or `-C`) flag, or other flags such as `--mincpus` can be
+used to specify nodes with the desired characteristics.
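+
+For example, a minimal sketch of selecting on node characteristics rather than
+node names; the partition name `compute` mirrors the example above, and the
+node and CPU counts are illustrative assumptions, not module defaults:
+
+```bash
+# request 2 nodes from the compute partition, each offering at least 30 CPUs
+srun -N 2 -p compute --mincpus 30 hostname
+```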
+ +[`--nodelist`]: https://slurm.schedmd.com/srun.html#OPT_nodelist +[`--constraint`]: https://slurm.schedmd.com/srun.html#OPT_constraint +[hetjobs]: https://slurm.schedmd.com/heterogeneous_jobs.html + +### Example + +The following code snippet creates a partition module using the `nodeset` +module as input with: + +* a max node count of 200 +* VM machine type of `c2-standard-30` +* partition name of "compute" +* default nodeset name of "ghpc" +* connected to the `network` module via `use` +* nodes mounted to homefs via `use` + +```yaml +- id: nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: + - network + settings: + node_count_dynamic_max: 200 + machine_type: c2-standard-30 + +- id: compute_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: + - homefs + - nodeset + settings: + partition_name: compute +``` + +## Custom Images + +For more information on creating valid custom images for the node group VM +instances or for custom instance templates, see our [vm-images.md] documentation +page. + +[vm-images.md]: ../../../../docs/vm-images.md#slurm-on-gcp-custom-images + +## GPU Support + +More information on GPU support in Slurm on GCP and other HPC Toolkit modules +can be found at [docs/gpu-support.md](../../../../docs/gpu-support.md) + +### Compute VM Zone Policies + +The Slurm on GCP nodeset module allows you to specify additional zones in +which to create VMs through [bulk creation][bulk]. This is valuable when +configuring partitions with popular VM families and you desire access to +more compute resources across zones. + +[bulk]: https://cloud.google.com/compute/docs/instances/multiple/about-bulk-creation +[networkpricing]: https://cloud.google.com/vpc/network-pricing + +> **_WARNING:_** Lenient zone policies can lead to additional egress costs when +> moving large amounts of data between zones in the same region. For example, +> traffic between VMs and traffic from VMs to shared filesystems such as +> Filestore. For more information on egress fees, see the +> [Network Pricing][networkpricing] Google Cloud documentation. +> +> To avoid egress charges, ensure your compute nodes are created in a single +> zone by setting var.zone and leaving var.zones to its default value of the +> empty list. +> +> **_NOTE:_** If a new zone is added to the region while the cluster is active, +> nodes in the partition may be created in that zone. In this case, the +> partition may need to be redeployed to ensure the newly added zone is denied. + +In the zonal example below, the nodeset's zone implicitly defaults to the +deployment variable `vars.zone`: + +```yaml +vars: + zone: us-central1-f + +- id: zonal-nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset +``` + +In the example below, we enable creation in additional zones: + +```yaml +vars: + zone: us-central1-f + +- id: multi-zonal-nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + settings: + zones: + - us-central1-a + - us-central1-b +``` + +## Support +The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform +modules. For support with the underlying modules, see the instructions in the +[slurm-gcp README][slurm-gcp-readme]. 
+ +[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp +[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [google](#requirement\_google) | >= 3.83 | + +## Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | >= 3.83 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [google_compute_image.slurm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. (do not use "disk\_type: local-ssd"; known issue being addressed) |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no |
+| [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no |
+| [disable\_public\_ips](#input\_disable\_public\_ips) | If set to false, the node group VMs will have a random public IP assigned to them. Ignored if access\_config is set. | `bool` | `true` | no |
+| [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no |
+| [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no |
+| [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no |
+| [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-standard"` | no |
+| [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support this option. | `bool` | `false` | no |
+| [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no |
+| [enable\_placement](#input\_enable\_placement) | Enable placement groups. | `bool` | `true` | no |
+| [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support this option. | `bool` | `false` | no |
+| [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on the instance. | `bool` | `false` | no |
+| [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no |
+| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-1-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | +| [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | +| [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | +| [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | +| [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | +| [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | +| [name](#input\_name) | Name of the nodeset. | `string` | `"ghpc"` | no | +| [node\_conf](#input\_node\_conf) | Map of Slurm node line configuration. | `map(any)` | `{}` | no | +| [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of dynamic nodes allowed in this partition. | `number` | `1` | no | +| [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no | +| [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy.

Note: Placement groups are not supported when on\_host\_maintenance is set to
"MIGRATE" and will be deactivated regardless of the value of
enable\_placement. To support enable\_placement, ensure on\_host\_maintenance is
set to "TERMINATE". | `string` | `"TERMINATE"` | no | +| [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | +| [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | +| [service\_account](#input\_service\_account) | Service account to attach to the compute instances. If not set, the
default compute service account for the given project will be used with the
"https://www.googleapis.com/auth/cloud-platform" scope. |
object({
email = string
scopes = set(string)
})
| `null` | no | +| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
- enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
- enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
- enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | +| [spot\_instance\_config](#input\_spot\_instance\_config) | Configuration for spot VMs. |
object({
termination_action = string
})
| `null` | no | +| [subnetwork\_project](#input\_subnetwork\_project) | The project the subnetwork belongs to. | `string` | `""` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | `null` | no | +| [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | +| [zone](#input\_zone) | Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones. | `string` | n/a | yes | +| [zone\_target\_shape](#input\_zone\_target\_shape) | Strategy for distributing VMs across zones in a region.
ANY
GCE picks zones for creating VM instances to fulfill the requested number of VMs
within present resource constraints and to maximize utilization of unused zonal
reservations.
ANY\_SINGLE\_ZONE (default)
GCE always selects a single zone for all the VMs, optimizing for resource quotas,
available reservations and general capacity.
BALANCED
GCE prioritizes acquisition of resources, scheduling VMs in zones where resources
are available while distributing VMs as evenly as possible across allowed zones
to minimize the impact of zonal failure. | `string` | `"ANY_SINGLE_ZONE"` | no |
+| [zones](#input\_zones) | Additional zones in which to allow creation of partition nodes. Google Cloud
will find zone based on availability, quota and reservations. | `set(string)` | `[]` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [nodeset](#output\_nodeset) | Details of the nodeset. Typically used as input to `schedmd-slurm-gcp-v6-partition`. | + diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf new file mode 100644 index 0000000000..c3c16542b1 --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +## Required variables: +# guest_accelerator +# machine_type + +locals { + # example state; terraform will ignore diffs if last element of URL matches + # guest_accelerator = [ + # { + # count = 1 + # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" + # }, + # ] + accelerator_machines = { + "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, + "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, + "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, + "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, + "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, + "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, + "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, + "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, + "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, + "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "g2-standard-4" = { type = "nvidia-l4", count = 1 }, + "g2-standard-8" = { type = "nvidia-l4", count = 1 }, + "g2-standard-12" = { type = "nvidia-l4", count = 1 }, + "g2-standard-16" = { type = "nvidia-l4", count = 1 }, + "g2-standard-24" = { type = "nvidia-l4", count = 2 }, + "g2-standard-32" = { type = "nvidia-l4", count = 1 }, + "g2-standard-48" = { type = "nvidia-l4", count = 4 }, + "g2-standard-96" = { type = "nvidia-l4", count = 8 }, + } + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + + # Select in priority order: + # (1) var.guest_accelerator if not empty + # (2) local.generated_guest_accelerator if not empty + # (3) default to empty list if both are empty + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf new file mode 100644 index 0000000000..11acd4b963 --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -0,0 +1,79 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-nodeset", ghpc_role = "compute" }) +} + +locals { + additional_disks = [ + for ad in var.additional_disks : { + disk_name = ad.disk_name + device_name = ad.device_name + disk_type = ad.disk_type + disk_size_gb = ad.disk_size_gb + disk_labels = merge(ad.disk_labels, local.labels) + auto_delete = ad.auto_delete + boot = ad.boot + } + ] + + nodeset = { + node_count_static = var.node_count_static + node_count_dynamic_max = var.node_count_dynamic_max + node_conf = var.node_conf + nodeset_name = var.name + + disk_auto_delete = var.disk_auto_delete + disk_labels = merge(local.labels, var.disk_labels) + disk_size_gb = var.disk_size_gb + disk_type = var.disk_type + additional_disks = local.additional_disks + + bandwidth_tier = var.bandwidth_tier + can_ip_forward = var.can_ip_forward + disable_smt = !var.enable_smt + + enable_confidential_vm = var.enable_confidential_vm + enable_placement = var.enable_placement + enable_public_ip = !var.disable_public_ips + enable_oslogin = var.enable_oslogin + enable_shielded_vm = var.enable_shielded_vm + gpu = one(local.guest_accelerator) + + instance_template = var.instance_template + labels = local.labels + machine_type = var.machine_type + metadata = var.metadata + min_cpu_platform = var.min_cpu_platform + + on_host_maintenance = var.on_host_maintenance + preemptible = var.preemptible + region = var.region + service_account = var.service_account + shielded_instance_config = var.shielded_instance_config + source_image_family = local.source_image_family # requires source_image_logic.tf + source_image_project = local.source_image_project_normalized # requires source_image_logic.tf + source_image = local.source_image # requires source_image_logic.tf + subnetwork_project = var.subnetwork_project + subnetwork = var.subnetwork_self_link + tags = var.tags + spot = var.enable_spot_vm + termination_action = try(var.spot_instance_config.termination_action, null) + + zones = toset(concat([var.zone], tolist(var.zones))) + zone_target_shape = var.zone_target_shape + } +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf new file mode 100644 index 0000000000..450d6e7cd4 --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf @@ -0,0 +1,27 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +output "nodeset" { + description = "Details of the nodeset. 
Typically used as input to `schedmd-slurm-gcp-v6-partition`." + value = local.nodeset + + precondition { + condition = !contains([ + "c3-:pd-standard", + "h3-:pd-standard", + "h3-:pd-ssd", + ], "${substr(var.machine_type, 0, 3)}:${var.disk_type}") + error_message = "A disk_type=${var.disk_type} cannot be used with machine_type=${var.machine_type}." + } +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf new file mode 100644 index 0000000000..8759a268cc --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf @@ -0,0 +1,73 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + # Currently supported images and projects + known_project_families = { + schedmd-slurm-public = [ + "slurm-gcp-6-1-debian-11", + "slurm-gcp-6-1-hpc-rocky-linux-8", + "slurm-gcp-6-1-ubuntu-2004-lts", + "slurm-gcp-6-1-ubuntu-2204-lts-arm64", + "slurm-gcp-6-1-hpc-centos-7-k80", + "slurm-gcp-6-1-hpc-centos-7" + ] + } + + # This approach to "hacking" the project name allows a chain of Terraform + # calls to set the instance source_image (boot disk) with a "relative + # resource name" that passes muster with VPC Service Control rules + # + # https://github.com/terraform-google-modules/terraform-google-vm/blob/735bd415fc5f034d46aa0de7922e8fada2327c0c/modules/instance_template/main.tf#L28 + # https://cloud.google.com/apis/design/resource_names#relative_resource_name + source_image_project_normalized = (can(var.instance_image.family) ? + "projects/${data.google_compute_image.slurm.project}/global/images/family" : + "projects/${data.google_compute_image.slurm.project}/global/images" + ) + source_image_family = can(var.instance_image.family) ? data.google_compute_image.slurm.family : "" + source_image = can(var.instance_image.name) ? data.google_compute_image.slurm.name : "" +} + +data "google_compute_image" "slurm" { + family = try(var.instance_image.family, null) + name = try(var.instance_image.name, null) + project = var.instance_image.project + + lifecycle { + precondition { + condition = length(regexall("^projects/.+?/global/images/family$", var.instance_image.project)) == 0 + error_message = "The \"project\" field in var.instance_image no longer supports a long-form ending in \"family\". Specify only the project ID." + } + + postcondition { + condition = var.instance_image_custom || contains(keys(local.known_project_families), self.project) + error_message = <<-EOD + Images in project ${self.project} are not published by SchedMD. Images must be created by compatible releases of the Terraform and Packer modules following the guidance at https://goo.gle/hpc-slurm-images. Set var.instance_image_custom to true to silence this error and acknowledge that you are using a compatible image. 
+ EOD + } + postcondition { + condition = !contains(keys(local.known_project_families), self.project) || try(contains(local.known_project_families[self.project], self.family), false) + error_message = <<-EOD + Image family ${self.family} published by SchedMD in project ${self.project} is not compatible with this release of the Terraform Slurm modules. Select from known compatible releases: + ${join("\n", [for p in try(local.known_project_families[self.project], []) : "\t\"${p}\""])} + EOD + } + postcondition { + condition = var.disk_size_gb >= self.disk_size_gb + error_message = "'disk_size_gb: ${var.disk_size_gb}' is smaller than the image size (${self.disk_size_gb}GB), please increase the blueprint disk size" + } + } +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf new file mode 100644 index 0000000000..d00219cf73 --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -0,0 +1,388 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "name" { + description = "Name of the nodeset." + type = string + default = "ghpc" + + validation { + condition = can(regex("^[a-z](?:[a-z0-9]{0,5})$", var.name)) + error_message = "Nodeset name (var.name) must begin with a letter, be fully alphanumeric and be 6 characters or less. Regexp: '^[a-z](?:[a-z0-9]{0,5})$'." + } +} + +variable "node_conf" { + description = "Map of Slurm node line configuration." + type = map(any) + default = {} +} + +variable "node_count_static" { + description = "Number of nodes to be statically created." + type = number + default = 0 +} + +variable "node_count_dynamic_max" { + description = "Maximum number of dynamic nodes allowed in this partition." + type = number + default = 1 +} + +## VM Definition +variable "instance_template" { + description = <<-EOD + Self link to a custom instance template. If set, other VM definition + variables such as machine_type and instance_image will be ignored in favor + of the provided instance template. + + For more information on creating custom images for the instance template + that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section + in docs/vm-images.md. + EOD + type = string + default = null +} + +variable "machine_type" { + description = "Compute Platform machine type to use for this partition compute nodes." + type = string + default = "c2-standard-60" +} + +variable "metadata" { + type = map(string) + description = "Metadata, provided as a map." + default = {} +} + +variable "instance_image" { + description = <<-EOD + Defines the image that will be used in the Slurm node group VM instances. + + Expected Fields: + name: The name of the image. Mutually exclusive with family. + family: The image family to use. Mutually exclusive with name. + project: The project where the image is hosted. 
+ + For more information on creating custom images that comply with Slurm on GCP + see the "Slurm on GCP Custom Images" section in docs/vm-images.md. + EOD + type = map(string) + default = { + family = "slurm-gcp-6-1-hpc-rocky-linux-8" + project = "schedmd-slurm-public" + } + + validation { + condition = can(coalesce(var.instance_image.project)) + error_message = "In var.instance_image, the \"project\" field must be a string set to the Cloud project ID." + } + + validation { + condition = can(coalesce(var.instance_image.name)) != can(coalesce(var.instance_image.family)) + error_message = "In var.instance_image, exactly one of \"family\" or \"name\" fields must be set to desired image family or name." + } +} + +variable "instance_image_custom" { + description = <<-EOD + A flag that designates that the user is aware that they are requesting + to use a custom and potentially incompatible image for this Slurm on + GCP module. + + If the field is set to false, only the compatible families and project + names will be accepted. The deployment will fail with any other image + family or name. If set to true, no checks will be done. + + See: https://goo.gle/hpc-slurm-images + EOD + type = bool + default = false +} + +variable "tags" { + type = list(string) + description = "Network tag list." + default = [] +} + +variable "disk_type" { + description = "Boot disk type, can be either pd-ssd, pd-standard, pd-balanced, or pd-extreme." + type = string + default = "pd-standard" + + validation { + condition = contains(["pd-ssd", "pd-standard", "pd-balanced", "pd-extreme"], var.disk_type) + error_message = "Variable disk_type must be one of pd-ssd, pd-standard, pd-balanced, or pd-extreme." + } +} + +variable "disk_size_gb" { + description = "Size of boot disk to create for the partition compute nodes." + type = number + default = 50 +} + +variable "disk_auto_delete" { + type = bool + description = "Whether or not the boot disk should be auto-deleted." + default = true +} + +variable "disk_labels" { + description = "Labels specific to the boot disk. These will be merged with var.labels." + type = map(string) + default = {} +} + +variable "additional_disks" { + description = "Configurations of additional disks to be included on the partition nodes. (do not use \"disk_type: local-ssd\"; known issue being addressed)" + type = list(object({ + disk_name = string + device_name = string + disk_size_gb = number + disk_type = string + disk_labels = map(string) + auto_delete = bool + boot = bool + })) + default = [] +} + +variable "enable_confidential_vm" { + type = bool + description = "Enable the Confidential VM configuration. Note: the instance image must support option." + default = false +} + +variable "enable_shielded_vm" { + type = bool + description = "Enable the Shielded VM configuration. Note: the instance image must support option." + default = false +} + +variable "shielded_instance_config" { + type = object({ + enable_integrity_monitoring = bool + enable_secure_boot = bool + enable_vtpm = bool + }) + description = <<-EOD + Shielded VM configuration for the instance. Note: not used unless + enable_shielded_vm is 'true'. + - enable_integrity_monitoring : Compare the most recent boot measurements to the + integrity policy baseline and return a pair of pass/fail results depending on + whether they match or not. + - enable_secure_boot : Verify the digital signature of all boot components, and + halt the boot process if signature verification fails. 
+ - enable_vtpm : Use a virtualized trusted platform module, which is a + specialized computer chip you can use to encrypt objects like keys and + certificates. + EOD + default = { + enable_integrity_monitoring = true + enable_secure_boot = true + enable_vtpm = true + } +} + + +variable "enable_oslogin" { + type = bool + description = <<-EOD + Enables Google Cloud os-login for user login and authentication for VMs. + See https://cloud.google.com/compute/docs/oslogin + EOD + default = true +} + +variable "can_ip_forward" { + description = "Enable IP forwarding, for NAT instances for example." + type = bool + default = false +} + +variable "enable_smt" { + type = bool + description = "Enables Simultaneous Multi-Threading (SMT) on instance." + default = false +} + +variable "labels" { + description = "Labels to add to partition compute instances. Key-value pairs." + type = map(string) + default = {} +} + +variable "min_cpu_platform" { + description = "The name of the minimum CPU platform that you want the instance to use." + type = string + default = null +} + +variable "on_host_maintenance" { + type = string + description = <<-EOD + Instance availability Policy. + + Note: Placement groups are not supported when on_host_maintenance is set to + "MIGRATE" and will be deactivated regardless of the value of + enable_placement. To support enable_placement, ensure on_host_maintenance is + set to "TERMINATE". + EOD + default = "TERMINATE" +} + +variable "guest_accelerator" { + description = "List of the type and count of accelerator cards attached to the instance." + type = list(object({ + type = string, + count = number + })) + default = [] + nullable = false + + validation { + condition = length(var.guest_accelerator) <= 1 + error_message = "The Slurm modules supports 0 or 1 models of accelerator card on each node." + } +} + +variable "preemptible" { + description = "Should use preemptibles to burst." + type = bool + default = false +} + +variable "service_account" { + type = object({ + email = string + scopes = set(string) + }) + description = <<-EOD + Service account to attach to the compute instances. If not set, the + default compute service account for the given project will be used with the + "https://www.googleapis.com/auth/cloud-platform" scope. + EOD + default = null +} + +variable "enable_spot_vm" { + description = "Enable the partition to use spot VMs (https://cloud.google.com/spot-vms)." + type = bool + default = false +} + +variable "spot_instance_config" { + description = "Configuration for spot VMs." + type = object({ + termination_action = string + }) + default = null +} + +variable "bandwidth_tier" { + description = < 0 + ]) + error_message = "A value in var.zones is not a valid zone (example: us-central1-f)." + } +} + +variable "zone_target_shape" { + description = < +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | + +## Providers + +No providers. + +## Modules + +No modules. + +## Resources + +No resources. + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [exclusive](#input\_exclusive) | Exclusive job access to nodes. | `bool` | `true` | no | +| [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.
If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no | +| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on the partition compute nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_project = optional(string)
subnetwork = optional(string)
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
}))
| `[]` | no | +| [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, true)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
subnetwork = optional(string, "")
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
}))
| `[]` | no | +| [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no | +| [partition\_name](#input\_partition\_name) | The name of the slurm partition. | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [nodeset](#output\_nodeset) | Details of the nodesets in this partition | +| [nodeset\_tpu](#output\_nodeset\_tpu) | Details of the TPU nodesets in this partition | +| [partitions](#output\_partitions) | Details of the slurm partition | + diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf new file mode 100644 index 0000000000..34f98912bb --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf @@ -0,0 +1,28 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + + use_placement = [for ns in var.nodeset : ns.nodeset_name if ns.enable_placement] + + partition = { + default = var.is_default + enable_job_exclusive = var.exclusive + network_storage = var.network_storage + partition_conf = var.partition_conf + partition_name = var.partition_name + partition_nodeset = [for ns in var.nodeset : ns.nodeset_name] + partition_nodeset_tpu = [for ns in var.nodeset_tpu : ns.nodeset_name] + } +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf new file mode 100644 index 0000000000..cf7e00d610 --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf @@ -0,0 +1,46 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +output "partitions" { + description = "Details of the slurm partition" + + value = [local.partition] + + precondition { + condition = (length(local.use_placement) == 0) || var.exclusive + error_message = "If any nodeset has `enable_placement` set, `var.exclusive` must be set to true." + } + + precondition { + condition = (length(local.use_placement) == 0) || contains(["NO", "Exclusive"], lookup(var.partition_conf, "Oversubscribe", "NO")) + error_message = "If any nodeset has `enable_placement` set, var.partition_conf[\"Oversubscribe\"] should be either undefined, \"NO\", or \"Exclusive\"." + } + + precondition { + condition = (length(local.use_placement) == 0) || (lookup(var.partition_conf, "SuspendTime", null) == null) + error_message = "If any nodeset has `enable_placement` set, var.partition_conf[\"SuspendTime\"] should be undefined."
+ } +} + +output "nodeset" { + description = "Details of the nodesets in this partition" + + value = var.nodeset +} + +output "nodeset_tpu" { + description = "Details of the TPU nodesets in this partition" + + value = var.nodeset_tpu +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf new file mode 100644 index 0000000000..56fda6e4d6 --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf @@ -0,0 +1,165 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "partition_name" { + description = "The name of the slurm partition." + type = string + + validation { + condition = can(regex("^[a-z](?:[a-z0-9]*)$", var.partition_name)) + error_message = "Variable 'partition_name' must match the regex '^[a-z](?:[a-z0-9]*)$'." + } +} + +variable "partition_conf" { + description = <<-EOD + Slurm partition configuration as a map. + See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION + EOD + type = map(string) + default = {} +} + +variable "is_default" { + description = <<-EOD + Sets this partition as the default partition by updating the partition_conf. + If "Default" is already set in partition_conf, this variable will have no effect. + EOD + type = bool + default = false +} + +variable "exclusive" { + description = "Exclusive job access to nodes." + type = bool + default = true +} + +variable "network_storage" { + description = "An array of network attached storage mounts to be configured on the partition compute nodes." + type = list(object({ + server_ip = string, + remote_mount = string, + local_mount = string, + fs_type = string, + mount_options = string, + client_install_runner = map(string) + mount_runner = map(string) + })) + default = [] +} + +variable "nodeset" { + description = "Define nodesets, as a list."
+ type = list(object({ + node_count_static = optional(number, 0) + node_count_dynamic_max = optional(number, 1) + node_conf = optional(map(string), {}) + nodeset_name = string + additional_disks = optional(list(object({ + disk_name = optional(string) + device_name = optional(string) + disk_size_gb = optional(number) + disk_type = optional(string) + disk_labels = optional(map(string), {}) + auto_delete = optional(bool, true) + boot = optional(bool, false) + })), []) + bandwidth_tier = optional(string, "platform_default") + can_ip_forward = optional(bool, false) + disable_smt = optional(bool, false) + disk_auto_delete = optional(bool, true) + disk_labels = optional(map(string), {}) + disk_size_gb = optional(number) + disk_type = optional(string) + enable_confidential_vm = optional(bool, false) + enable_placement = optional(bool, false) + enable_public_ip = optional(bool, false) + enable_oslogin = optional(bool, true) + enable_shielded_vm = optional(bool, false) + gpu = optional(object({ + count = number + type = string + })) + instance_template = optional(string) + labels = optional(map(string), {}) + machine_type = optional(string) + metadata = optional(map(string), {}) + min_cpu_platform = optional(string) + network_tier = optional(string, "STANDARD") + on_host_maintenance = optional(string) + preemptible = optional(bool, false) + region = optional(string) + service_account = optional(object({ + email = optional(string) + scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"]) + })) + shielded_instance_config = optional(object({ + enable_integrity_monitoring = optional(bool, true) + enable_secure_boot = optional(bool, true) + enable_vtpm = optional(bool, true) + })) + source_image_family = optional(string) + source_image_project = optional(string) + source_image = optional(string) + subnetwork_project = optional(string) + subnetwork = optional(string) + spot = optional(bool, false) + tags = optional(list(string), []) + termination_action = optional(string) + zones = optional(list(string), []) + zone_target_shape = optional(string, "ANY_SINGLE_ZONE") + })) + default = [] + + validation { + condition = length(distinct([for x in var.nodeset : x.nodeset_name])) == length(var.nodeset) + error_message = "All nodesets must have a unique name." + } +} + +variable "nodeset_tpu" { + description = "Define TPU nodesets, as a list." + type = list(object({ + node_count_static = optional(number, 0) + node_count_dynamic_max = optional(number, 1) + nodeset_name = string + enable_public_ip = optional(bool, false) + node_type = string + accelerator_config = optional(object({ + topology = string + version = string + }), { + topology = "" + version = "" + }) + tf_version = string + preemptible = optional(bool, false) + preserve_tpu = optional(bool, true) + zone = string + data_disks = optional(list(string), []) + docker_image = optional(string, "") + subnetwork = optional(string, "") + service_account = optional(object({ + email = optional(string) + scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"]) + })) + })) + default = [] + + validation { + condition = length(distinct([for x in var.nodeset_tpu : x.nodeset_name])) == length(var.nodeset_tpu) + error_message = "All TPU nodesets must have a unique name." 
+ } +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf new file mode 100644 index 0000000000..32a9d04446 --- /dev/null +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -0,0 +1,23 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +terraform { + required_version = ">= 1.3" + + provider_meta "google" { + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.25.0" + } +} diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 9c3340cf23..b8d5c840c0 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -30,10 +30,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.25.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.25.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/scripts/install-gcs-fuse.sh b/community/modules/file-system/cloud-storage-bucket/scripts/install-gcs-fuse.sh index 04498fd849..f8a990260b 100644 --- a/community/modules/file-system/cloud-storage-bucket/scripts/install-gcs-fuse.sh +++ b/community/modules/file-system/cloud-storage-bucket/scripts/install-gcs-fuse.sh @@ -32,11 +32,11 @@ EOF elif [ -f /etc/debian_version ] || grep -qi ubuntu /etc/lsb-release || grep -qi ubuntu /etc/os-release; then RELEASE=$(lsb_release -c -s) export GCSFUSE_REPO="gcsfuse-${RELEASE}" - echo "deb http://packages.cloud.google.com/apt $GCSFUSE_REPO main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list + echo "deb https://packages.cloud.google.com/apt $GCSFUSE_REPO main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - - sudo apt-get update - sudo apt-get -y install gcsfuse + apt-get update --allow-releaseinfo-change-origin --allow-releaseinfo-change-label + apt-get -y install gcsfuse else echo 'Unsuported distribution' return 1 diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index c49c2f8ff5..1c2968c7e8 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.25.0" } required_version = ">= 
0.14.0" } diff --git a/community/modules/file-system/gke-persistent-volume/versions.tf b/community/modules/file-system/gke-persistent-volume/versions.tf index 33af1b3c8f..d756d07daf 100644 --- a/community/modules/file-system/gke-persistent-volume/versions.tf +++ b/community/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.25.0" } } diff --git a/community/modules/file-system/nfs-server/scripts/install-nfs-client.sh b/community/modules/file-system/nfs-server/scripts/install-nfs-client.sh index 6c49163eb2..7ce1a59ed3 100644 --- a/community/modules/file-system/nfs-server/scripts/install-nfs-client.sh +++ b/community/modules/file-system/nfs-server/scripts/install-nfs-client.sh @@ -28,7 +28,7 @@ if [ ! "$(which mount.nfs)" ]; then fi yum install --disablerepo="*" --enablerepo=${enable_repo} -y nfs-utils elif [ -f /etc/debian_version ] || grep -qi ubuntu /etc/lsb-release || grep -qi ubuntu /etc/os-release; then - apt-get -y update + apt-get update --allow-releaseinfo-change-origin --allow-releaseinfo-change-label apt-get -y install nfs-common else echo 'Unsuported distribution' diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index 42c66dafaf..a194180c1d 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.25.0" } required_version = ">= 0.14.0" diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index a0838935d7..ee8ce39781 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.25.0" } required_version = ">= 0.14.0" diff --git a/community/modules/remote-desktop/chrome-remote-desktop/README.md b/community/modules/remote-desktop/chrome-remote-desktop/README.md index 96eea6154b..35fca6cfc6 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/README.md +++ b/community/modules/remote-desktop/chrome-remote-desktop/README.md @@ -63,7 +63,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.22.1 | +| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 | | [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.22.1 | ## Resources @@ -85,7 +85,7 @@ No resources. | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. Requires virtual workstation accelerator if Nvidia Grid Drivers are required |
list(object({
type = string,
count = number
}))
|
[
{
"count": 1,
"type": "nvidia-tesla-t4-vws"
}
]
| no | | [install\_nvidia\_driver](#input\_install\_nvidia\_driver) | Installs the nvidia driver (true/false). For details, see https://cloud.google.com/compute/docs/gpus/install-drivers-gpu | `bool` | n/a | yes | | [instance\_count](#input\_instance\_count) | Number of instances | `number` | `1` | no | -| [instance\_image](#input\_instance\_image) | Instance Image. An alternative could be family = "ubuntu-2204-lts" and project = "ubuntu-os-cloud" or family = "debian-11" and project = "debian-cloud" |
object({
family = string,
project = string
})
|
{
"family": "ubuntu-2204-lts",
"project": "ubuntu-os-cloud"
}
| no | +| [instance\_image](#input\_instance\_image) | Image used to build chrome remote desktop node. The default image is from
family= \"debian-11\" and project = \"debian-cloud\". An alternative image is
from family = \"ubuntu-2204-lts\" and project = \"ubuntu-os-cloud\". |
object({
family = string,
project = string
})
|
{
"family": "debian-11",
"project": "debian-cloud"
}
| no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Machine type to use for the instance creation. Must be N1 family if GPU is used. | `string` | `"n1-standard-8"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | diff --git a/community/modules/remote-desktop/chrome-remote-desktop/main.tf b/community/modules/remote-desktop/chrome-remote-desktop/main.tf index 846a0a6899..3a71873a94 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/main.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/main.tf @@ -55,7 +55,7 @@ locals { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.22.1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4" deployment_name = var.deployment_name project_id = var.project_id diff --git a/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-grid-drivers.yml b/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-grid-drivers.yml index cc9607243b..e67f8f65d0 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-grid-drivers.yml +++ b/community/modules/remote-desktop/chrome-remote-desktop/scripts/configure-grid-drivers.yml @@ -17,26 +17,32 @@ hosts: localhost become: true vars: - packages: + dist_settings: bullseye: - - build-essential - - gdebi-core - - mesa-utils - - gdm3 + packages: + - build-essential + - gdebi-core + - mesa-utils + - gdm3 + grid_fn: NVIDIA-Linux-x86_64-510.85.02-grid.run + grid_ver: vGPU14.2 jammy: - - build-essential - - gdebi-core - - mesa-utils - - gdm3 - - gcc-12 # must match compiler used to build kernel on latest Ubuntu 22 - - pkg-config # observed to be necessary for GRID driver installation on latest Ubuntu 22 - - libglvnd-dev # observed to be necessary for GRID driver installation on latest Ubuntu 22 + packages: + - build-essential + - gdebi-core + - mesa-utils + - gdm3 + - gcc-12 # must match compiler used to build kernel on latest Ubuntu 22 + - pkg-config # observed to be necessary for GRID driver installation on latest Ubuntu 22 + - libglvnd-dev # observed to be necessary for GRID driver installation on latest Ubuntu 22 + grid_fn: NVIDIA-Linux-x86_64-525.125.06-grid.run + grid_ver: vGPU15.3 tasks: - name: Fail if using wrong OS ansible.builtin.assert: that: - ansible_os_family in ["Debian", "Ubuntu"] - - ansible_distribution_release in ["bullseye", "jammy"] + - ansible_distribution_release in {{ dist_settings.keys()|list }} fail_msg: "ansible_os_family: {{ ansible_os_family }} or ansible_distribution_release: {{ansible_distribution_release}} was not acceptable." 
- name: Check if GRID driver installed @@ -52,7 +58,7 @@ - name: Set all packages ansible.builtin.set_fact: - all_packages: '{{ packages[ansible_distribution_release] + ["linux-headers-" + uname_result.stdout] }}' + all_packages: '{{ dist_settings[ansible_distribution_release]["packages"] + ["linux-headers-" + uname_result.stdout] }}' - name: Install binaries for GRID drivers ansible.builtin.apt: @@ -67,15 +73,9 @@ - name: Install GRID driver if not existing when: nvidiasmi_result is failed block: - - name: Download GPU driver 15.3 + - name: Download GPU driver ansible.builtin.get_url: - url: https://storage.googleapis.com/nvidia-drivers-us-public/GRID/vGPU15.3/NVIDIA-Linux-x86_64-525.125.06-grid.run - dest: /tmp/ - mode: "0755" - timeout: 30 - - name: Download GPU driver 14.2 - ansible.builtin.get_url: - url: https://storage.googleapis.com/nvidia-drivers-us-public/GRID/vGPU14.2/NVIDIA-Linux-x86_64-510.85.02-grid.run + url: https://storage.googleapis.com/nvidia-drivers-us-public/GRID/{{ dist_settings[ansible_distribution_release]["grid_ver"] }}/{{ dist_settings[ansible_distribution_release]["grid_fn"] }} dest: /tmp/ mode: "0755" timeout: 30 @@ -88,10 +88,10 @@ - name: Install GPU driver ansible.builtin.shell: | #jinja2: trim_blocks: "True" - {% if ansible_distribution_release == "jammy" %} - CC=gcc-12 /tmp/NVIDIA-Linux-x86_64-525.125.06-grid.run --silent + {% if ansible_distribution_release == "jammy" %} + CC=gcc-12 /tmp/{{ dist_settings[ansible_distribution_release]["grid_fn"] }} --silent {% else %} - /tmp/NVIDIA-Linux-x86_64-510.85.02-grid.run --silent + /tmp/{{ dist_settings[ansible_distribution_release]["grid_fn"] }} --silent {% endif %} register: result changed_when: result.rc == 0 diff --git a/community/modules/remote-desktop/chrome-remote-desktop/variables.tf b/community/modules/remote-desktop/chrome-remote-desktop/variables.tf index 18f7b7a163..8edbfb2006 --- a/community/modules/remote-desktop/chrome-remote-desktop/variables.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/variables.tf @@ -56,14 +56,18 @@ variable "network_storage" { } variable "instance_image" { - description = "Instance Image. An alternative could be family = \"ubuntu-2204-lts\" and project = \"ubuntu-os-cloud\" or family = \"debian-11\" and project = \"debian-cloud\"" + description = <<-EOT + Image used to build chrome remote desktop node. The default image is from + family = "debian-11" and project = "debian-cloud". An alternative image is + from family = "ubuntu-2204-lts" and project = "ubuntu-os-cloud".
+ EOT type = object({ family = string, project = string }) default = { - family = "ubuntu-2204-lts" - project = "ubuntu-os-cloud" + family = "debian-11" + project = "debian-cloud" } } diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf index d2bca83f21..86cb08576f 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-controller/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-controller/v1.25.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf index 506a413e8a..7bf79d5234 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-login-node/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-login-node/v1.25.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/gke-cluster/versions.tf b/community/modules/scheduler/gke-cluster/versions.tf index a333ace1d4..08c3231898 100644 --- a/community/modules/scheduler/gke-cluster/versions.tf +++ b/community/modules/scheduler/gke-cluster/versions.tf @@ -26,6 +26,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.25.0" } } diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index 5995fa087c..e796f9a1e9 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -63,7 +63,7 @@ limitations under the License. 
|------|--------|---------| | [access\_point\_instance\_template](#module\_access\_point\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 84d7959 | | [htcondor\_ap](#module\_htcondor\_ap) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.22.1&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 | ## Resources diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 652462b43f..9d4a8fb4bc 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -142,7 +142,7 @@ resource "google_storage_bucket_object" "ap_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.22.1&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index 4b1efe245e..26d489efd4 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.25.0" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index 35e296a342..60e715603b 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -99,7 +99,7 @@ limitations under the License. 
|------|--------|---------| | [central\_manager\_instance\_template](#module\_central\_manager\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 84d7959 | | [htcondor\_cm](#module\_htcondor\_cm) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.22.1&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 | ## Resources diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index 8e02f329a1..a9a492c2cc 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -109,7 +109,7 @@ resource "google_storage_bucket_object" "cm_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.22.1&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index a6ed99f198..145749f003 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.25.0" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/README.md b/community/modules/scheduler/htcondor-pool-secrets/README.md index b4974eb642..4743bc8462 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/README.md +++ b/community/modules/scheduler/htcondor-pool-secrets/README.md @@ -119,14 +119,14 @@ limitations under the License. 
| Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 0.13.0 | -| [google](#requirement\_google) | >= 3.83, <5.0 | +| [google](#requirement\_google) | >= 4.84 | | [random](#requirement\_random) | >= 3.0 | ## Providers | Name | Version | |------|---------| -| [google](#provider\_google) | >= 3.83, <5.0 | +| [google](#provider\_google) | >= 4.84 | | [random](#provider\_random) | >= 3.0 | ## Modules diff --git a/community/modules/scheduler/htcondor-pool-secrets/main.tf b/community/modules/scheduler/htcondor-pool-secrets/main.tf index e700ee84de..c6611f6b14 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/main.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/main.tf @@ -82,7 +82,7 @@ resource "google_secret_manager_secret" "pool_password" { labels = local.labels replication { - automatic = true + auto {} } } @@ -98,7 +98,7 @@ resource "google_secret_manager_secret" "execute_point_idtoken" { labels = local.labels replication { - automatic = true + auto {} } } diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 4abf362581..8e7b1cd442 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -18,7 +18,7 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 3.83, <5.0" + version = ">= 4.84" } random = { source = "hashicorp/random" @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.25.0" } required_version = ">= 0.13.0" diff --git a/community/modules/scheduler/htcondor-setup/versions.tf b/community/modules/scheduler/htcondor-setup/versions.tf index 4c425b682c..deccfbde4b 100644 --- a/community/modules/scheduler/htcondor-setup/versions.tf +++ b/community/modules/scheduler/htcondor-setup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-setup/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-setup/v1.25.0" } required_version = ">= 0.13.0" diff --git a/community/modules/scheduler/pbspro-client/README.md b/community/modules/scheduler/pbspro-client/README.md index 221b68976a..9e4eb04b84 100644 --- a/community/modules/scheduler/pbspro-client/README.md +++ b/community/modules/scheduler/pbspro-client/README.md @@ -74,7 +74,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.22.1 | +| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 | | [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.22.1 | | [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.22.1 | diff --git a/community/modules/scheduler/pbspro-client/main.tf b/community/modules/scheduler/pbspro-client/main.tf index 9c3e4ce314..ec8c7a47d2 100644 --- a/community/modules/scheduler/pbspro-client/main.tf +++ b/community/modules/scheduler/pbspro-client/main.tf @@ -43,7 +43,7 @@ module "pbs_install" { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.22.1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4" deployment_name = var.deployment_name project_id = var.project_id diff --git a/community/modules/scheduler/pbspro-server/README.md b/community/modules/scheduler/pbspro-server/README.md index 488ed23157..92c7a854c3 100644 --- a/community/modules/scheduler/pbspro-server/README.md +++ b/community/modules/scheduler/pbspro-server/README.md @@ -72,7 +72,7 @@ No providers. | [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.22.1 | | [pbs\_qmgr](#module\_pbs\_qmgr) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr | v1.22.1 | | [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.22.1 | -| [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.22.1 | +| [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 | ## Resources diff --git a/community/modules/scheduler/pbspro-server/main.tf b/community/modules/scheduler/pbspro-server/main.tf index 71fbf25e40..8099c9fdb9 100644 --- a/community/modules/scheduler/pbspro-server/main.tf +++ b/community/modules/scheduler/pbspro-server/main.tf @@ -55,7 +55,7 @@ module "pbs_qmgr" { } module "server_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.22.1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4" deployment_name = var.deployment_name project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index b197c3f475..941461bf79 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.25.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf 
b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index 14b1f50a53..3de7e977b9 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.25.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md new file mode 100644 index 0000000000..54f73f9429 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -0,0 +1,206 @@ +## Description + +This module creates a slurm controller node via the [SchedMD/slurm-gcp] +[slurm\_controller\_instance] and [slurm\_instance\_template] modules. + +More information about Slurm on GCP can be found at the +[project's GitHub page][SchedMD/slurm-gcp] and in the +[Slurm on Google Cloud User Guide][slurm-ug]. + +The [user guide][slurm-ug] provides detailed instructions on customizing and +enhancing the Slurm on GCP cluster as well as recommendations on configuring the +controller for optimal performance at different scales. + +[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/6.1.2 +[slurm\_controller\_instance]: https://github.com/SchedMD/slurm-gcp/tree/6.1.2/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/6.1.2/terraform/slurm_cluster/modules/slurm_instance_template +[slurm-ug]: https://goo.gle/slurm-gcp-user-guide +[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/6.1.2/scripts/requirements.txt +[enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute +[enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions +[enable\_reconfigure]: #input\_enable\_reconfigure + +### Example + +```yaml +- id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + use: + - network + - homefs + - compute_partition + settings: + machine_type: c2-standard-8 +``` + +This creates a controller node with the following attributes: + +* connected to the primary subnetwork of `network` +* the filesystem with the ID `homefs` (defined elsewhere in the blueprint) + mounted +* one partition with the ID `compute_partition` (defined elsewhere in the + blueprint) +* machine type upgraded from the default `c2-standard-4` to `c2-standard-8` + +### Live Cluster Reconfiguration + +The `schedmd-slurm-gcp-v6-controller` module supports the reconfiguration of +partitions and the slurm configuration in a running, active cluster. + +To reconfigure a running cluster: + +1. Edit the blueprint with the desired configuration changes +2. Call `ghpc create -w` to overwrite the deployment directory +3. Follow the instructions in the terminal to deploy + +The following are examples of updates that can be made to a running cluster: + +* Add or remove a partition +* Resize an existing partition +* Attach new network storage to an existing partition + +> **NOTE**: Changing the VM `machine_type` of a partition may not work. +> It is better to create a new partition and delete the old one.
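+
+As a sketch of this workflow, the hypothetical blueprint excerpt below resizes
+an existing nodeset (module IDs and counts are illustrative, and it assumes the
+companion `schedmd-slurm-gcp-v6-nodeset` module feeding the partition used by
+this controller):
+
+```yaml
+# Hypothetical excerpt of step 1: only node_count_dynamic_max was edited.
+- id: compute_nodeset
+  source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
+  use:
+  - network
+  settings:
+    nodeset_name: compute
+    node_count_dynamic_max: 20 # previously 10
+
+- id: compute_partition
+  source: community/modules/compute/schedmd-slurm-gcp-v6-partition
+  use:
+  - compute_nodeset
+  settings:
+    partition_name: compute
+```
+
+Rerunning `ghpc create -w` on the edited blueprint and redeploying applies the
+new node count without recreating the cluster.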
+ +## Custom Images + +For more information on creating valid custom images for the controller VM +instance or for custom instance templates, see our [vm-images.md] documentation +page. + +[vm-images.md]: ../../../../docs/vm-images.md#slurm-on-gcp-custom-images + +## GPU Support + +More information on GPU support in Slurm on GCP and other HPC Toolkit modules +can be found at [docs/gpu-support.md](../../../../docs/gpu-support.md) + +## Hybrid Slurm Clusters +For more information on how to configure an on premise slurm cluster with hybrid +cloud partitions, see the [schedmd-slurm-gcp-v5-hybrid] module and our +extended instructions in our [docs](../../../../docs/hybrid-slurm-cluster/). + +[schedmd-slurm-gcp-v5-hybrid]: ../schedmd-slurm-gcp-v5-hybrid/README.md + +## Support +The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform +modules. For support with the underlying modules, see the instructions in the +[slurm-gcp README][slurm-gcp-readme]. + +[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp +[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp#slurm-on-google-cloud-platform + +## License + + +Copyright 2023 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [google](#requirement\_google) | >= 3.83 | + +## Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | >= 3.83 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 3.0 | +| [slurm\_cluster](#module\_slurm\_cluster) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster | 6.1.2 | + +## Resources + +| Name | Type | +|------|------| +| [google_compute_default_service_account.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | +| [google_compute_image.slurm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
list(object({
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | +| [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. If not specified, then one will be chosen based on slurm\_cluster\_name. | `string` | `null` | no | +| [bucket\_name](#input\_bucket\_name) | Name of GCS bucket.
Ignored when 'create\_bucket' is true. | `string` | `null` | no | +| [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | +| [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. |
object({
no_comma_params = optional(bool, false)
resume_rate = optional(number, 0)
resume_timeout = optional(number, 300)
suspend_rate = optional(number, 0)
suspend_timeout = optional(number, 300)
})
| `{}` | no | +| [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access. |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
})
| `null` | no | +| [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `"# no-op"` | no | +| [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | +| [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `"# no-op"` | no | +| [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | +| [create\_bucket](#input\_create\_bucket) | Create GCS bucket instead of using an existing one. | `bool` | `true` | no | +| [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | +| [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | If set to false, the controller will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | +| [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
- /usr/local/etc/slurm
- /etc/munge
- /home
- /apps
Warning: If these are disabled, the slurm etc and munge dirs must be added
manually, or some other mechanism must be used to synchronize the slurm conf
files and the munge key across the cluster. | `bool` | `false` | no | +| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `true` | no | +| [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | +| [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | +| [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | +| [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-ssd"` | no | +| [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into BigQuery.

NOTE: Requires the Google BigQuery API. | `bool` | `false` | no | +| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when the cluster is destroyed.

NOTE: Requires Python and script dependencies.
*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | +| [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support this option. | `bool` | `false` | no | +| [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. Not for production use. | `bool` | `false` | no | +| [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no | +| [enable\_login](#input\_enable\_login) | Enables the creation of login nodes and instance templates. | `bool` | `true` | no | +| [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | +| [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support this option. | `bool` | `false` | no | +| [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = string
}))
| `[]` | no | +| [extra\_logging\_flags](#input\_extra\_logging\_flags) | The list of extra flags for the logging system to use. See the logging\_flags variable in scripts/util.py to get the list of supported log flags. | `map(bool)` | `{}` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-1-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | +| [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | +| [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | +| [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. |
list(object({
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string, "pd-standard")
enable_confidential_vm = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
group_name = string
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
num_instances = optional(number, 1)
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
static_ips = optional(list(string), [])
subnetwork_project = optional(string)
subnetwork = optional(string)
spot = optional(bool, false)
tags = optional(list(string), [])
zone = optional(string)
termination_action = optional(string)
}))
| `[]` | no | +| [login\_startup\_script](#input\_login\_startup\_script) | Startup script used by the login VMs. | `string` | `"# no-op"` | no | +| [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | +| [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"c2-standard-4"` | no | +| [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | +| [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | +| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string) # TODO: determine whether this is used; remove it if not
mount_runner = map(string)
}))
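For illustration, a single NFS entry matching this schema might look like the sketch below; the server address, mount points, and runner contents are hypothetical, with the runner maps following the startup-script runner shape (type/destination/content):

network_storage = [{
  server_ip     = "10.0.0.2"
  remote_mount  = "/exports"
  local_mount   = "/shared"
  fs_type       = "nfs"
  mount_options = "defaults,hard,intr"
  client_install_runner = {
    type        = "shell"
    destination = "install_nfs_client.sh"
    content     = "#!/bin/sh\nyum install -y nfs-utils"
  }
  mount_runner = {
    type        = "shell"
    destination = "mount_shared.sh"
    content     = "#!/bin/sh\nmount -a"
  }
}]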
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_public_ip = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_project = optional(string)
subnetwork = optional(string)
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
}))
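A minimal nodeset sketch; only `nodeset_name` is required, and the machine type and counts below are illustrative:

nodeset = [{
  nodeset_name           = "c2nodes"
  node_count_dynamic_max = 20
  machine_type           = "c2-standard-60"
  enable_placement       = true
}]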
| `[]` | no | +| [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, true)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
subnetwork = optional(string, "")
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
}))
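A sketch of one TPU nodeset; `nodeset_name`, `node_type`, `tf_version`, and `zone` are the required fields, and the specific TPU type, runtime version, and zone below are assumptions for illustration:

nodeset_tpu = [{
  nodeset_name = "tpunodes"
  node_type    = "v4-8"   # assumed TPU node type
  tf_version   = "2.12.0" # assumed TensorFlow runtime version
  zone         = "us-central2-b"
}]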
| `[]` | no | +| [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability policy. | `string` | `"MIGRATE"` | no | +| [partitions](#input\_partitions) | Cluster partitions as a list. See module slurm\_partition. |
list(object({
default = optional(bool, false)
enable_job_exclusive = optional(bool, false)
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
})), [])
partition_conf = optional(map(string), {})
partition_name = string
partition_nodeset = optional(list(string), [])
partition_nodeset_dyn = optional(list(string), [])
partition_nodeset_tpu = optional(list(string), [])
resume_timeout = optional(number)
suspend_time = optional(number, 300)
suspend_timeout = optional(number)
}))
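Tying these inputs together, a partition serving the nodeset sketched earlier might be declared as follows (illustrative values):

partitions = [{
  partition_name    = "compute"
  default           = true
  partition_nodeset = ["c2nodes"]
}]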
| n/a | yes | +| [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | +| [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | +| [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = string
}))
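As a sketch, each prolog entry is just a filename plus script content; the logging command below is illustrative (Slurm sets SLURM_JOB_ID in the prolog environment):

prolog_scripts = [{
  filename = "log_job_start.sh"
  content  = <<-EOS
    #!/bin/bash
    # Record each new job allocation before the first job step runs.
    echo "$(date -Is) starting job $SLURM_JOB_ID" >> /var/log/slurm/prolog.log
  EOS
}]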
| `[]` | no | +| [region](#input\_region) | The default region to place resources in. | `string` | n/a | yes | +| [service\_account](#input\_service\_account) | Service account to attach to the controller instance. If not set, the
default compute service account for the given project will be used with the
"https://www.googleapis.com/auth/cloud-platform" scope. |
object({
email = string
scopes = set(string)
})
| `null` | no | +| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | +| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting.
If not provided, it will default to the first 10 characters of the deployment name (removing any invalid characters). | `string` | `null` | no | +| [slurm\_conf\_tpl](#input\_slurm\_conf\_tpl) | Slurm slurm.conf template file path. | `string` | `null` | no | +| [slurmdbd\_conf\_tpl](#input\_slurmdbd\_conf\_tpl) | Slurm slurmdbd.conf template file path. | `string` | `null` | no | +| [static\_ips](#input\_static\_ips) | List of static IPs for VM instances. | `list(string)` | `[]` | no | +| [subnetwork\_project](#input\_subnetwork\_project) | The project that the subnetwork belongs to. | `string` | `null` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. Either network\_self\_link or subnetwork\_self\_link must be specified. | `string` | `null` | no | +| [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | +| [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no | + +## Outputs + +No outputs. + diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf new file mode 100644 index 0000000000..c3c16542b1 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +## Required variables: +# guest_accelerator +# machine_type + +locals { + # example state; terraform will ignore diffs if last element of URL matches + # guest_accelerator = [ + # { + # count = 1 + # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" + # }, + # ] + accelerator_machines = { + "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, + "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, + "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, + "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, + "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, + "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, + "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, + "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, + "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, + "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "g2-standard-4" = { type = "nvidia-l4", count = 1 }, + "g2-standard-8" = { type = "nvidia-l4", count = 1 }, + "g2-standard-12" = { type = "nvidia-l4", count = 1 }, + "g2-standard-16" = { type = "nvidia-l4", count = 1 }, + "g2-standard-24" = { type = "nvidia-l4", count = 2 }, + "g2-standard-32" = { type = "nvidia-l4", count = 1 }, + "g2-standard-48" = { type = "nvidia-l4", count = 4 }, + "g2-standard-96" = { type = "nvidia-l4", count = 8 }, + } + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + + # Select in priority order: + # (1) var.guest_accelerator if not empty + # (2) local.generated_guest_accelerator if not empty + # (3) default to empty list if both are empty + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf new file mode 100644 index 0000000000..6067feea23 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf @@ -0,0 +1,180 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-controller", ghpc_role = "scheduler" }) +} + +locals { + ghpc_startup_script_controller = [{ + filename = "ghpc_startup.sh" + content = var.controller_startup_script + }] + ghpc_startup_script_login = [{ + filename = "ghpc_startup.sh" + content = var.login_startup_script + }] + ghpc_startup_script_compute = [{ + filename = "ghpc_startup.sh" + content = var.compute_startup_script + }] + + # Since deployment name may be used to create a cluster name, we remove any invalid character from the beginning + # Also, slurm imposed a lot of restrictions to this name, so we format it to an acceptable string + tmp_cluster_name = substr(replace(lower(var.deployment_name), "/^[^a-z]*|[^a-z0-9]/", ""), 0, 10) + slurm_cluster_name = coalesce(var.slurm_cluster_name, local.tmp_cluster_name) + +} + +locals { + synt_suffix = substr(md5("${var.project_id}${var.deployment_name}"), 0, 5) + synth_bucket_name = "${local.slurm_cluster_name}${local.synt_suffix}" +} + +module "bucket" { + source = "terraform-google-modules/cloud-storage/google" + version = "~> 3.0" + + count = var.create_bucket ? 1 : 0 + + location = var.region + names = [local.synth_bucket_name] + prefix = "slurm" + project_id = var.project_id + + force_destroy = { + (local.synth_bucket_name) = true + } + + labels = { + slurm_cluster_name = local.slurm_cluster_name + } +} + +data "google_compute_default_service_account" "default" { + project = var.project_id +} + +locals { # controller_instance_config + additional_disks = [ + for ad in var.additional_disks : { + disk_name = ad.disk_name + device_name = ad.device_name + disk_type = ad.disk_type + disk_size_gb = ad.disk_size_gb + disk_labels = merge(ad.disk_labels, local.labels) + auto_delete = ad.auto_delete + boot = ad.boot + } + ] + + controller_instance_config = { + disk_auto_delete = var.disk_auto_delete + disk_labels = merge(var.disk_labels, local.labels) + disk_size_gb = var.disk_size_gb + disk_type = var.disk_type + additional_disks = local.additional_disks + + can_ip_forward = var.can_ip_forward + disable_smt = var.disable_smt + + enable_confidential_vm = var.enable_confidential_vm + enable_public_ip = !var.disable_controller_public_ips + enable_oslogin = var.enable_oslogin + enable_shielded_vm = var.enable_shielded_vm + shielded_instance_config = var.shielded_instance_config + + gpu = one(local.guest_accelerator) + instance_template = var.instance_template + labels = local.labels + machine_type = var.machine_type + metadata = var.metadata + min_cpu_platform = var.min_cpu_platform + + on_host_maintenance = var.on_host_maintenance + preemptible = var.preemptible + region = var.region + zone = var.zone + + service_account = coalesce(var.service_account, { + email = data.google_compute_default_service_account.default.email + scopes = ["https://www.googleapis.com/auth/cloud-platform"] + }) + + source_image_family = local.source_image_family # requires source_image_logic.tf + source_image_project = 
local.source_image_project_normalized # requires source_image_logic.tf + source_image = local.source_image # requires source_image_logic.tf + + static_ip = try(var.static_ips[0], null) + bandwidth_tier = var.bandwidth_tier + + subnetwork = var.subnetwork_self_link + subnetwork_project = var.subnetwork_project + + tags = var.tags + } +} + +module "slurm_cluster" { + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster?ref=6.1.2" + + project_id = var.project_id + slurm_cluster_name = local.slurm_cluster_name + region = var.region + + create_bucket = false + bucket_name = var.create_bucket ? module.bucket[0].name : var.bucket_name + bucket_dir = var.bucket_dir + + controller_instance_config = local.controller_instance_config + + enable_login = var.enable_login + login_nodes = var.login_nodes + + nodeset = var.nodeset + nodeset_tpu = var.nodeset_tpu + + partitions = var.partitions + + enable_devel = var.enable_devel + enable_debug_logging = var.enable_debug_logging + extra_logging_flags = var.extra_logging_flags + enable_cleanup_compute = var.enable_cleanup_compute + enable_bigquery_load = var.enable_bigquery_load + cloud_parameters = var.cloud_parameters + disable_default_mounts = var.disable_default_mounts + + network_storage = var.network_storage + login_network_storage = var.network_storage + + slurmdbd_conf_tpl = var.slurmdbd_conf_tpl + slurm_conf_tpl = var.slurm_conf_tpl + cgroup_conf_tpl = var.cgroup_conf_tpl + + controller_startup_scripts = local.ghpc_startup_script_controller + controller_startup_scripts_timeout = var.controller_startup_scripts_timeout + login_startup_scripts = local.ghpc_startup_script_login + login_startup_scripts_timeout = var.login_startup_scripts_timeout + compute_startup_scripts = local.ghpc_startup_script_compute + compute_startup_scripts_timeout = var.compute_startup_scripts_timeout + + prolog_scripts = var.prolog_scripts + epilog_scripts = var.epilog_scripts + cloudsql = var.cloudsql + + depends_on = [module.bucket] +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf new file mode 100644 index 0000000000..8759a268cc --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf @@ -0,0 +1,73 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +locals { + # Currently supported images and projects + known_project_families = { + schedmd-slurm-public = [ + "slurm-gcp-6-1-debian-11", + "slurm-gcp-6-1-hpc-rocky-linux-8", + "slurm-gcp-6-1-ubuntu-2004-lts", + "slurm-gcp-6-1-ubuntu-2204-lts-arm64", + "slurm-gcp-6-1-hpc-centos-7-k80", + "slurm-gcp-6-1-hpc-centos-7" + ] + } + + # This approach to "hacking" the project name allows a chain of Terraform + # calls to set the instance source_image (boot disk) with a "relative + # resource name" that passes muster with VPC Service Control rules + # + # https://github.com/terraform-google-modules/terraform-google-vm/blob/735bd415fc5f034d46aa0de7922e8fada2327c0c/modules/instance_template/main.tf#L28 + # https://cloud.google.com/apis/design/resource_names#relative_resource_name + source_image_project_normalized = (can(var.instance_image.family) ? + "projects/${data.google_compute_image.slurm.project}/global/images/family" : + "projects/${data.google_compute_image.slurm.project}/global/images" + ) + source_image_family = can(var.instance_image.family) ? data.google_compute_image.slurm.family : "" + source_image = can(var.instance_image.name) ? data.google_compute_image.slurm.name : "" +} + +data "google_compute_image" "slurm" { + family = try(var.instance_image.family, null) + name = try(var.instance_image.name, null) + project = var.instance_image.project + + lifecycle { + precondition { + condition = length(regexall("^projects/.+?/global/images/family$", var.instance_image.project)) == 0 + error_message = "The \"project\" field in var.instance_image no longer supports a long-form ending in \"family\". Specify only the project ID." + } + + postcondition { + condition = var.instance_image_custom || contains(keys(local.known_project_families), self.project) + error_message = <<-EOD + Images in project ${self.project} are not published by SchedMD. Images must be created by compatible releases of the Terraform and Packer modules following the guidance at https://goo.gle/hpc-slurm-images. Set var.instance_image_custom to true to silence this error and acknowledge that you are using a compatible image. + EOD + } + postcondition { + condition = !contains(keys(local.known_project_families), self.project) || try(contains(local.known_project_families[self.project], self.family), false) + error_message = <<-EOD + Image family ${self.family} published by SchedMD in project ${self.project} is not compatible with this release of the Terraform Slurm modules. Select from known compatible releases: + ${join("\n", [for p in try(local.known_project_families[self.project], []) : "\t\"${p}\""])} + EOD + } + postcondition { + condition = var.disk_size_gb >= self.disk_size_gb + error_message = "'disk_size_gb: ${var.disk_size_gb}' is smaller than the image size (${self.disk_size_gb}GB), please increase the blueprint disk size" + } + } +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf new file mode 100644 index 0000000000..aac6217259 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -0,0 +1,506 @@ +/** + * Copyright (C) SchedMD LLC. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +########### +# GENERAL # +########### + +variable "project_id" { + type = string + description = "Project ID to create resources in." +} + +variable "deployment_name" { + description = "Name of the deployment." + type = string +} + +variable "slurm_cluster_name" { + type = string + description = <<-EOD + Cluster name, used for resource naming and slurm accounting. + If not provided it will default to the first 8 characters of the deployment name (removing any invalid characters). + EOD + default = null + + validation { + condition = var.slurm_cluster_name == null || can(regex("^[a-z](?:[a-z0-9]{0,9})$", var.slurm_cluster_name)) + error_message = "Variable 'slurm_cluster_name' must be a match of regex '^[a-z](?:[a-z0-9]{0,9})$'." + } +} + +variable "region" { + type = string + description = "The default region to place resources in." +} + +variable "zone" { + type = string + description = < 0 + error_message = "Partitions cannot be empty." + } +} + +######### +# SLURM # +######### + +variable "enable_devel" { + type = bool + description = "Enables development mode. Not for production use." + default = false +} + +variable "enable_debug_logging" { + type = bool + description = "Enables debug logging mode. Not for production use." + default = false +} + +variable "extra_logging_flags" { + type = map(bool) + description = "The list of extra flags for the logging system to use. See the logging_flags variable in scripts/util.py to get the list of supported log flags." + default = {} +} + +variable "enable_cleanup_compute" { + description = < +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [google](#requirement\_google) | >= 3.83 | + +## Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | >= 3.83 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [google_compute_default_service_account.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | +| [google_compute_image.slurm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
list(object({
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
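Every field in this object type is required, so a complete illustrative entry looks like:

additional_disks = [{
  disk_name    = "scratch0"
  device_name  = "scratch0"
  disk_type    = "pd-ssd"
  disk_size_gb = 500
  disk_labels  = {}
  auto_delete  = true
  boot         = false
}]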
| `[]` | no | +| [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | +| [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | +| [disable\_login\_public\_ips](#input\_disable\_login\_public\_ips) | If set to false, the login node will have a random public IP assigned to it. | `bool` | `true` | no | +| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on the instance. | `bool` | `true` | no | +| [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | +| [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | +| [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | +| [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-ssd"` | no | +| [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support this option. | `bool` | `false` | no | +| [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | +| [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support this option. | `bool` | `false` | no | +| [group\_name](#input\_group\_name) | Name of the login nodes group. | `string` | n/a | yes | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
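For example, a single A100 could be requested as follows (illustrative; the accelerator type must be available with the chosen machine type and zone, and is otherwise inferred from the machine type by gpu_definition.tf):

guest_accelerator = [{
  type  = "nvidia-tesla-a100"
  count = 1
}]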
| `[]` | no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-1-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | +| [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | +| [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | +| [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"c2-standard-4"` | no | +| [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | +| [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | +| [num\_instances](#input\_num\_instances) | Number of instances to create. This value is ignored if static\_ips is provided. | `number` | `1` | no | +| [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability policy. | `string` | `"MIGRATE"` | no | +| [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | +| [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | +| [region](#input\_region) | Region where the instances should be created. | `string` | `null` | no | +| [service\_account](#input\_service\_account) | Service account to attach to the login instance. If not set, the
default compute service account for the given project will be used with the
"https://www.googleapis.com/auth/cloud-platform" scope. |
object({
email = string
scopes = set(string)
})
| `null` | no | +| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | +| [static\_ips](#input\_static\_ips) | List of static IPs for VM instances. | `list(string)` | `[]` | no | +| [subnetwork\_project](#input\_subnetwork\_project) | The project that subnetwork belongs to. | `string` | `null` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. Either network\_self\_link or subnetwork\_self\_link must be specified. | `string` | `null` | no | +| [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | +| [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [login\_nodes](#output\_login\_nodes) | Slurm login instance definition. | + diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf new file mode 100644 index 0000000000..c3c16542b1 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +## Required variables: +# guest_accelerator +# machine_type + +locals { + # example state; terraform will ignore diffs if last element of URL matches + # guest_accelerator = [ + # { + # count = 1 + # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" + # }, + # ] + accelerator_machines = { + "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, + "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, + "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, + "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, + "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, + "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, + "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, + "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, + "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, + "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "g2-standard-4" = { type = "nvidia-l4", count = 1 }, + "g2-standard-8" = { type = "nvidia-l4", count = 1 }, + "g2-standard-12" = { type = "nvidia-l4", count = 1 }, + "g2-standard-16" = { type = "nvidia-l4", count = 1 }, + "g2-standard-24" = { type = "nvidia-l4", count = 2 }, + "g2-standard-32" = { type = "nvidia-l4", count = 1 }, + "g2-standard-48" = { type = "nvidia-l4", count = 4 }, + "g2-standard-96" = { type = "nvidia-l4", count = 8 }, + } + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) + + # Select in priority order: + # (1) var.guest_accelerator if not empty + # (2) local.generated_guest_accelerator if not empty + # (3) default to empty list if both are empty + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf new file mode 100644 index 0000000000..9cb10031ca --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf @@ -0,0 +1,85 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-login", ghpc_role = "scheduler" }) +} + +data "google_compute_default_service_account" "default" { + project = var.project_id +} + +locals { + + additional_disks = [ + for ad in var.additional_disks : { + disk_name = ad.disk_name + device_name = ad.device_name + disk_type = ad.disk_type + disk_size_gb = ad.disk_size_gb + disk_labels = merge(ad.disk_labels, local.labels) + auto_delete = ad.auto_delete + boot = ad.boot + } + ] + + + login_node = { + disk_auto_delete = var.disk_auto_delete + disk_labels = merge(var.disk_labels, local.labels) + disk_size_gb = var.disk_size_gb + disk_type = var.disk_type + additional_disks = local.additional_disks + + can_ip_forward = var.can_ip_forward + disable_smt = var.disable_smt + + enable_confidential_vm = var.enable_confidential_vm + enable_public_ip = !var.disable_login_public_ips + enable_oslogin = var.enable_oslogin + enable_shielded_vm = var.enable_shielded_vm + shielded_instance_config = var.shielded_instance_config + + gpu = one(local.guest_accelerator) + group_name = var.group_name + instance_template = var.instance_template + labels = local.labels + machine_type = var.machine_type + metadata = var.metadata + min_cpu_platform = var.min_cpu_platform + num_instances = var.num_instances + on_host_maintenance = var.on_host_maintenance + preemptible = var.preemptible + region = var.region + zone = var.zone + + service_account = coalesce(var.service_account, { + email = data.google_compute_default_service_account.default.email + scopes = ["https://www.googleapis.com/auth/cloud-platform"] + }) + + source_image_family = local.source_image_family # requires source_image_logic.tf + source_image_project = local.source_image_project_normalized # requires source_image_logic.tf + source_image = local.source_image # requires source_image_logic.tf + + static_ips = var.static_ips + bandwidth_tier = var.bandwidth_tier + + subnetwork_project = var.subnetwork_project + subnetwork = var.subnetwork_self_link + + tags = var.tags + } +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/outputs.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/outputs.tf new file mode 100644 index 0000000000..e700542794 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/outputs.tf @@ -0,0 +1,18 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +output "login_nodes" { + description = "Slurm login instance definition." 
+ value = [local.login_node] +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf new file mode 100644 index 0000000000..8759a268cc --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf @@ -0,0 +1,73 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + # Currently supported images and projects + known_project_families = { + schedmd-slurm-public = [ + "slurm-gcp-6-1-debian-11", + "slurm-gcp-6-1-hpc-rocky-linux-8", + "slurm-gcp-6-1-ubuntu-2004-lts", + "slurm-gcp-6-1-ubuntu-2204-lts-arm64", + "slurm-gcp-6-1-hpc-centos-7-k80", + "slurm-gcp-6-1-hpc-centos-7" + ] + } + + # This approach to "hacking" the project name allows a chain of Terraform + # calls to set the instance source_image (boot disk) with a "relative + # resource name" that passes muster with VPC Service Control rules + # + # https://github.com/terraform-google-modules/terraform-google-vm/blob/735bd415fc5f034d46aa0de7922e8fada2327c0c/modules/instance_template/main.tf#L28 + # https://cloud.google.com/apis/design/resource_names#relative_resource_name + source_image_project_normalized = (can(var.instance_image.family) ? + "projects/${data.google_compute_image.slurm.project}/global/images/family" : + "projects/${data.google_compute_image.slurm.project}/global/images" + ) + source_image_family = can(var.instance_image.family) ? data.google_compute_image.slurm.family : "" + source_image = can(var.instance_image.name) ? data.google_compute_image.slurm.name : "" +} + +data "google_compute_image" "slurm" { + family = try(var.instance_image.family, null) + name = try(var.instance_image.name, null) + project = var.instance_image.project + + lifecycle { + precondition { + condition = length(regexall("^projects/.+?/global/images/family$", var.instance_image.project)) == 0 + error_message = "The \"project\" field in var.instance_image no longer supports a long-form ending in \"family\". Specify only the project ID." + } + + postcondition { + condition = var.instance_image_custom || contains(keys(local.known_project_families), self.project) + error_message = <<-EOD + Images in project ${self.project} are not published by SchedMD. Images must be created by compatible releases of the Terraform and Packer modules following the guidance at https://goo.gle/hpc-slurm-images. Set var.instance_image_custom to true to silence this error and acknowledge that you are using a compatible image. + EOD + } + postcondition { + condition = !contains(keys(local.known_project_families), self.project) || try(contains(local.known_project_families[self.project], self.family), false) + error_message = <<-EOD + Image family ${self.family} published by SchedMD in project ${self.project} is not compatible with this release of the Terraform Slurm modules. 
Select from known compatible releases: + ${join("\n", [for p in try(local.known_project_families[self.project], []) : "\t\"${p}\""])} + EOD + } + postcondition { + condition = var.disk_size_gb >= self.disk_size_gb + error_message = "'disk_size_gb: ${var.disk_size_gb}' is smaller than the image size (${self.disk_size_gb}GB), please increase the blueprint disk size" + } + } +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf new file mode 100644 index 0000000000..1b9890e315 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf @@ -0,0 +1,326 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +variable "project_id" { + type = string + description = "Project ID to create resources in." +} + +variable "region" { + type = string + description = "Region where the instances should be created." + default = null +} + +variable "zone" { + type = string + description = <<-EOD + Zone where the instances should be created. If not specified, instances will be + spread across available zones in the region. + EOD + default = null +} + +variable "group_name" { + type = string + description = "Name of the login nodes group." +} + +variable "num_instances" { + type = number + description = "Number of instances to create. This value is ignored if static_ips is provided." + default = 1 +} + +variable "disk_type" { + type = string + description = "Boot disk type, can be either pd-ssd, pd-standard, pd-balanced, or pd-extreme." + default = "pd-ssd" + + validation { + condition = contains(["pd-ssd", "pd-standard", "pd-balanced", "pd-extreme"], var.disk_type) + error_message = "Variable disk_type must be one of pd-ssd, pd-standard, pd-balanced, or pd-extreme." + } +} + +variable "disk_size_gb" { + type = number + description = "Boot disk size in GB." + default = 50 +} + +variable "disk_auto_delete" { + type = bool + description = "Whether or not the boot disk should be auto-deleted." + default = true +} + +variable "disk_labels" { + description = "Labels specific to the boot disk. These will be merged with var.labels." + type = map(string) + default = {} +} + +variable "additional_disks" { + type = list(object({ + disk_name = string + device_name = string + disk_type = string + disk_size_gb = number + disk_labels = map(string) + auto_delete = bool + boot = bool + })) + description = "List of maps of disks." + default = [] +} + +variable "disable_smt" { + type = bool + description = "Disables Simultaneous Multi-Threading (SMT) on instance." + default = true +} + +variable "static_ips" { + type = list(string) + description = "List of static IPs for VM instances." + default = [] + validation { + condition = length(var.static_ips) <= 1 + error_message = "The Slurm modules supports 0 or 1 static IPs on controller instance." 
+ } +} + +variable "bandwidth_tier" { + description = < [condor\_version](#input\_condor\_version) | Yum/DNF-compatible version string; leave unset to default to 10.x series (examples: "10.5.1","10.*")) | `string` | `"10.*"` | no | +| [condor\_version](#input\_condor\_version) | Yum/DNF-compatible version string; leave unset to use latest 23.0 LTS release (examples: "23.0.0","23.*")) | `string` | `"23.*"` | no | | [enable\_docker](#input\_enable\_docker) | Install and enable docker daemon alongside HTCondor | `bool` | `true` | no | ## Outputs diff --git a/community/modules/scripts/htcondor-install/files/install-htcondor.yaml b/community/modules/scripts/htcondor-install/files/install-htcondor.yaml index 7aecf62b9a..d62ec98a9b 100644 --- a/community/modules/scripts/htcondor-install/files/install-htcondor.yaml +++ b/community/modules/scripts/htcondor-install/files/install-htcondor.yaml @@ -13,7 +13,7 @@ # limitations under the License. # The instructions for installing HTCondor may change with time, although we -# anticipate that they will stay fixed for the 10.x releases. Find up-to-date +# anticipate that they will stay fixed for the 23.0 releases. Find up-to-date # recommendations at: ## https://htcondor.readthedocs.io/en/latest/getting-htcondor/from-our-repositories.html @@ -22,6 +22,8 @@ hosts: all vars: enable_docker: true + htcondor_key: https://research.cs.wisc.edu/htcondor/repo/keys/HTCondor-23.0-Key + docker_key: https://download.docker.com/linux/centos/gpg become: true module_defaults: ansible.builtin.yum: @@ -31,29 +33,30 @@ ansible.builtin.yum: name: - epel-release + - name: Directly install RPM verification keys + ansible.builtin.rpm_key: + state: present + key: "{{ item }}" + loop: + - "{{ htcondor_key }}" + - "{{ docker_key }}" + register: key_install + retries: 10 + delay: 60 + until: key_install is success - name: Enable HTCondor Feature Release repository ansible.builtin.yum_repository: name: htcondor-feature - description: HTCondor Feature Releases (10.x.0) - file: htcondor - baseurl: https://research.cs.wisc.edu/htcondor/repo/10.x/el$releasever/$basearch/release - gpgkey: https://research.cs.wisc.edu/htcondor/repo/keys/HTCondor-10.x-Key - gpgcheck: true - repo_gpgcheck: true - priority: "90" - - name: Enable HTCondor Feature Release Updates repository - ansible.builtin.yum_repository: - name: htcondor-feature-update - description: HTCondor Feature Release Updates (10.x.y) + description: HTCondor LTS Release (23.0) file: htcondor - baseurl: https://research.cs.wisc.edu/htcondor/repo/10.x/el$releasever/$basearch/update - gpgkey: https://research.cs.wisc.edu/htcondor/repo/keys/HTCondor-10.x-Key + baseurl: https://research.cs.wisc.edu/htcondor/repo/23.0/el$releasever/$basearch/release + gpgkey: "{{ htcondor_key }}" gpgcheck: true repo_gpgcheck: true priority: "90" - name: Install HTCondor ansible.builtin.yum: - name: condor-{{ condor_version | default("10.*") | string }} + name: condor-{{ condor_version | default("23.*") | string }} state: present - name: Ensure token directory ansible.builtin.file: @@ -71,7 +74,7 @@ baseurl: https://download.docker.com/linux/centos/$releasever/$basearch/stable enabled: yes gpgcheck: yes - gpgkey: https://download.docker.com/linux/centos/gpg + gpgkey: "{{ docker_key }}" - name: Install Docker ansible.builtin.yum: name: diff --git a/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl b/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl index 822a9c5ae8..8ec2701c3f 100644 --- 
a/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl +++ b/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl @@ -16,9 +16,9 @@ Remove-Item "$runtime_installer" # download HTCondor installer $htcondor_installer = 'C:\htcondor.msi' %{ if condor_version == "10.*" } -Invoke-WebRequest https://research.cs.wisc.edu/htcondor/tarball/current/current/condor-Windows-x64.msi -OutFile "$htcondor_installer" +Invoke-WebRequest https://research.cs.wisc.edu/htcondor/tarball/23.0/current/condor-Windows-x64.msi -OutFile "$htcondor_installer" %{ else ~} -Invoke-WebRequest https://research.cs.wisc.edu/htcondor/tarball/current/${condor_version}/release/condor-${condor_version}-Windows-x64.msi -OutFile "$htcondor_installer" +Invoke-WebRequest https://research.cs.wisc.edu/htcondor/tarball/23.0/${condor_version}/release/condor-${condor_version}-Windows-x64.msi -OutFile "$htcondor_installer" %{ endif ~} $args='/qn /l* condor-install-log.txt /i' $args=$args + " $htcondor_installer" diff --git a/community/modules/scripts/htcondor-install/variables.tf b/community/modules/scripts/htcondor-install/variables.tf index 0334d62d52..f54bb4c474 100644 --- a/community/modules/scripts/htcondor-install/variables.tf +++ b/community/modules/scripts/htcondor-install/variables.tf @@ -21,7 +21,17 @@ variable "enable_docker" { } variable "condor_version" { - description = "Yum/DNF-compatible version string; leave unset to default to 10.x series (examples: \"10.5.1\",\"10.*\"))" + description = "Yum/DNF-compatible version string; leave unset to use latest 23.0 LTS release (examples: \"23.0.0\",\"23.*\"))" type = string - default = "10.*" + default = "23.*" + + validation { + error_message = "var.condor_version must be set to \"23.*\" for latest 23.0 release or to a specific \"23.0.y\" release." + condition = var.condor_version == "23.*" || ( + length(split(".", var.condor_version)) == 3 && alltrue([ + for v in split(".", var.condor_version) : can(tonumber(v)) + ]) && split(".", var.condor_version)[0] == "23" + && split(".", var.condor_version)[1] == "0" + ) + } } diff --git a/community/modules/scripts/ramble-execute/README.md b/community/modules/scripts/ramble-execute/README.md index ae56747d33..989684e370 100644 --- a/community/modules/scripts/ramble-execute/README.md +++ b/community/modules/scripts/ramble-execute/README.md @@ -64,7 +64,7 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.0 | +| [terraform](#requirement\_terraform) | >= 1.0.0 | | [local](#requirement\_local) | >= 2.0.0 | ## Providers @@ -75,27 +75,40 @@ limitations under the License. ## Modules -No modules. +| Name | Source | Version | +|------|--------|---------| +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 | ## Resources | Name | Type | |------|------| -| [local_file.debug_file](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | +| [local_file.debug_file_ansible_execute](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [commands](#input\_commands) | String of commands to run within this module | `string` | `null` | no | +| [data\_files](#input\_data\_files) | A list of files to be transferred prior to running commands.
It must specify one of 'source' (absolute local file path) or 'content' (string).
It must specify a 'destination' with an absolute path where the file should be placed. | `list(map(string))` | `[]` | no | +| [deployment\_name](#input\_deployment\_name) | Name of deployment, used to name bucket containing spack scripts. | `string` | n/a | yes | +| [gcs\_bucket\_path](#input\_gcs\_bucket\_path) | The GCS path for storage bucket and the object, starting with `gs://`. | `string` | n/a | yes | +| [labels](#input\_labels) | Key-value pairs of labels to be added to created resources. | `map(string)` | n/a | yes | | [log\_file](#input\_log\_file) | Log file to write output from Ramble execute steps into | `string` | `"/var/log/ramble-execute.log"` | no | -| [ramble\_path](#input\_ramble\_path) | Path to the Ramble installation | `string` | n/a | yes | -| [ramble\_runner](#input\_ramble\_runner) | Ansible based startup-script runner from a previous Ramble step
Will automatically be prepended to the script generated by this module. |
object({
type = string
content = string
destination = string
})
| `null` | no | -| [spack\_path](#input\_spack\_path) | Path to the Spack installation | `string` | n/a | yes | +| [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | +| [ramble\_profile\_script\_path](#input\_ramble\_profile\_script\_path) | Path to the Ramble profile.d script. Created by an instance of ramble-setup.
Can be defined explicitly, or by chaining an instance of a ramble-setup module
through a `use` setting. | `string` | n/a | yes | +| [ramble\_runner](#input\_ramble\_runner) | Runner from previous ramble-setup or ramble-execute to be chained with scripts generated by this module. |
object({
type = string
content = string
destination = string
})
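In a blueprint this runner is normally wired up by `use`-ing a ramble-setup module; expressed directly in Terraform, the chaining might look like the sketch below, where the `module.ramble_setup` outputs referenced are assumptions based on the outputs these modules document:

module "ramble_execute" {
  source = "./community/modules/scripts/ramble-execute" # assumed local checkout path

  project_id                 = var.project_id
  deployment_name            = var.deployment_name
  region                     = var.region
  labels                     = var.labels
  gcs_bucket_path            = module.ramble_setup.gcs_bucket_path
  ramble_runner              = module.ramble_setup.ramble_runner
  ramble_profile_script_path = module.ramble_setup.ramble_profile_script_path
  commands                   = "ramble workspace list"
}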
| n/a | yes | +| [region](#input\_region) | Region to place bucket containing spack scripts. | `string` | n/a | yes | +| [spack\_profile\_script\_path](#input\_spack\_profile\_script\_path) | Path to the Spack profile.d script.
Can be defined explicitly, or by chaining an instance of a spack-setup module
through a `use` setting.
Defaults to /etc/profile.d/spack.sh if not set. | `string` | `"/etc/profile.d/spack.sh"` | no | ## Outputs | Name | Description | |------|-------------| +| [controller\_startup\_script](#output\_controller\_startup\_script) | Ramble startup script, duplicate for SLURM controller. | +| [gcs\_bucket\_path](#output\_gcs\_bucket\_path) | Bucket containing the startup scripts for ramble, to be reused by ramble-execute module. | +| [ramble\_profile\_script\_path](#output\_ramble\_profile\_script\_path) | Path to Ramble profile script. | | [ramble\_runner](#output\_ramble\_runner) | Runner to execute Ramble commands using an ansible playbook. The startup-script module
will automatically handle installation of ansible. |
+| [spack\_profile\_script\_path](#output\_spack\_profile\_script\_path) | Path to Spack profile script. |
+| [startup\_script](#output\_startup\_script) | Ramble startup script. |

diff --git a/community/modules/scripts/ramble-execute/main.tf b/community/modules/scripts/ramble-execute/main.tf
index b58efa2d04..1ac07a22cd 100644
--- a/community/modules/scripts/ramble-execute/main.tf
+++ b/community/modules/scripts/ramble-execute/main.tf
@@ -15,34 +15,56 @@
  */
 
 locals {
-  commands_content = var.commands == null ? "" : indent(4, yamlencode(var.commands))
+  # This label allows for billing report tracking based on module.
+  labels = merge(var.labels, { ghpc_module = "ramble-execute", ghpc_role = "scripts" })
+}
+
+locals {
+  commands_content = var.commands == null ? "echo 'no ramble commands provided'" : indent(4, yamlencode(var.commands))
   execute_contents = templatefile(
     "${path.module}/templates/ramble_execute.yml.tpl",
     {
-      pre_script = ". ${var.spack_path}/share/spack/setup-env.sh && . ${var.ramble_path}/share/ramble/setup-env.sh"
+      pre_script = "if [ -f ${var.spack_profile_script_path} ]; then . ${var.spack_profile_script_path}; fi; . ${var.ramble_profile_script_path}"
       log_file   = var.log_file
       commands   = local.commands_content
     }
   )
 
-  previous_ramble_runner_content = var.ramble_runner == null ? "" : var.ramble_runner["content"]
-
-  runner_content = <<-EOT
-    ${local.previous_ramble_runner_content}
-    ${local.execute_contents}
-    EOT
+  data_runners = [for data_file in var.data_files : merge(data_file, { type = "data" })]
 
   execute_md5 = substr(md5(local.execute_contents), 0, 4)
+  execute_runner = {
+    type        = "ansible-local"
+    content     = local.execute_contents
+    destination = "ramble_execute_${local.execute_md5}.yml"
+  }
 
-  ramble_execute_runner = {
-    "type"        = "ansible-local"
-    "content"     = local.runner_content
-    "destination" = "ramble_execute_${local.execute_md5}.yml"
+  previous_runners = var.ramble_runner != null ? [var.ramble_runner] : []
+  runners          = concat(local.previous_runners, local.data_runners, [local.execute_runner])
+
+  # Destinations should be unique while also being known at time of apply
+  combined_unique_string = join("\n", [for runner in local.runners : runner["destination"]])
+  combined_md5           = substr(md5(local.combined_unique_string), 0, 4)
+  combined_runner = {
+    type        = "shell"
+    content     = module.startup_script.startup_script
+    destination = "combined_install_ramble_${local.combined_md5}.sh"
   }
 }
 
-resource "local_file" "debug_file" {
+module "startup_script" {
+  source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4"
+
+  labels          = local.labels
+  project_id      = var.project_id
+  deployment_name = var.deployment_name
+  region          = var.region
+  runners         = local.runners
+  gcs_bucket_path = var.gcs_bucket_path
+}
+
+resource "local_file" "debug_file_ansible_execute" {
   content  = local.execute_contents
-  filename = "${path.module}/execute_script.yaml"
+  filename = "${path.module}/debug_execute_${local.execute_md5}.yml"
 }
diff --git a/community/modules/scripts/ramble-execute/outputs.tf b/community/modules/scripts/ramble-execute/outputs.tf
index d7cea135bb..e7eb6ea1d1 100644
--- a/community/modules/scripts/ramble-execute/outputs.tf
+++ b/community/modules/scripts/ramble-execute/outputs.tf
@@ -14,10 +14,35 @@
  * limitations under the License.
  */
 
+output "startup_script" {
+  description = "Ramble startup script."
+  value       = module.startup_script.startup_script
+}
+
+output "controller_startup_script" {
+  description = "Ramble startup script, duplicate for SLURM controller."
+  value       = module.startup_script.startup_script
+}
+
 output "ramble_runner" {
   description = <<-EOT
   Runner to execute Ramble commands using an ansible playbook. The startup-script module
   will automatically handle installation of ansible.
   EOT
-  value       = local.ramble_execute_runner
+  value       = local.combined_runner
+}
+
+output "gcs_bucket_path" {
+  description = "Bucket containing the startup scripts for ramble, to be reused by ramble-execute module."
+  value       = var.gcs_bucket_path
+}
+
+output "spack_profile_script_path" {
+  description = "Path to Spack profile script."
+  value       = var.spack_profile_script_path
+}
+
+output "ramble_profile_script_path" {
+  description = "Path to Ramble profile script."
+  value       = var.ramble_profile_script_path
 }
diff --git a/community/modules/scripts/ramble-execute/variables.tf b/community/modules/scripts/ramble-execute/variables.tf
index f25a15716b..b58c1ff4e3 100644
--- a/community/modules/scripts/ramble-execute/variables.tf
+++ b/community/modules/scripts/ramble-execute/variables.tf
@@ -14,20 +14,58 @@
  * limitations under the License.
  */
 
-variable "ramble_path" {
-  description = "Path to the Ramble installation"
+variable "project_id" {
+  description = "Project in which the HPC deployment will be created."
   type        = string
 }
 
+variable "deployment_name" {
+  description = "Name of deployment, used to name bucket containing ramble scripts."
+  type        = string
+}
+
+variable "region" {
+  description = "Region to place bucket containing ramble scripts."
+  type        = string
+}
+
+variable "labels" {
+  description = "Key-value pairs of labels to be added to created resources."
+  type        = map(string)
+}
+
 variable "log_file" {
   description = "Log file to write output from Ramble execute steps into"
   default     = "/var/log/ramble-execute.log"
   type        = string
 }
 
-variable "spack_path" {
-  description = "Path to the Spack installation"
-  type        = string
+variable "data_files" {
+  description = <<-EOT
+    A list of files to be transferred prior to running commands.
+    It must specify one of 'source' (absolute local file path) or 'content' (string).
+    It must specify a 'destination' with absolute path where file should be placed.
+  EOT
+  type        = list(map(string))
+  default     = []
+  validation {
+    condition     = alltrue([for r in var.data_files : substr(r["destination"], 0, 1) == "/"])
+    error_message = "All destinations must be absolute paths and start with '/'."
+  }
+  validation {
+    condition = alltrue([
+      for r in var.data_files :
+      can(r["content"]) != can(r["source"])
+    ])
+    error_message = "A data_file must specify either 'content' or 'source', but never both."
+  }
+  validation {
+    condition = alltrue([
+      for r in var.data_files :
+      lookup(r, "content", lookup(r, "source", null)) != null
+    ])
+    error_message = "A data_file must specify a non-null 'content' or 'source'."
+  }
 }
 
 variable "commands" {
@@ -37,22 +75,35 @@ variable "commands" {
 }
 
 variable "ramble_runner" {
-  description = <<-EOT
-    Ansible based startup-script runner from a previous Ramble step
-    Will automatically be prepended to the script generated by this module.
-  EOT
-  default     = null
+  description = "Runner from previous ramble-setup or ramble-execute to be chained with scripts generated by this module."
type = object({ type = string content = string destination = string }) - validation { - condition = var.ramble_runner == null || try(var.ramble_runner["type"] == "ansible-local", false) - error_message = "Ramble runner should be of type 'ansible'." - } - validation { - condition = var.ramble_runner == null || contains(keys(var.ramble_runner), "content") - error_message = "Ramble runner should contain a 'content' key." - } +} + +variable "gcs_bucket_path" { + description = "The GCS path for storage bucket and the object, starting with `gs://`." + type = string +} + +variable "spack_profile_script_path" { + description = <<-EOT + Path to the Spack profile.d script. + Can be defined explicitly, or by chaining an instance of a spack-setup module + through a `use` setting. + Defaults to /etc/profile.d/spack.sh if not set. + EOT + type = string + default = "/etc/profile.d/spack.sh" +} + +variable "ramble_profile_script_path" { + description = <<-EOT + Path to the Ramble profile.d script. Created by an instance of ramble-setup. + Can be defined explicitly, or by chaining an instance of a ramble-setup module + through a `use` setting. + EOT + type = string } diff --git a/community/modules/scripts/ramble-execute/versions.tf b/community/modules/scripts/ramble-execute/versions.tf index 736427bb79..9b23317323 100644 --- a/community/modules/scripts/ramble-execute/versions.tf +++ b/community/modules/scripts/ramble-execute/versions.tf @@ -15,7 +15,7 @@ */ terraform { - required_version = ">= 1.0" + required_version = ">= 1.0.0" required_providers { local = { source = "hashicorp/local" diff --git a/community/modules/scripts/ramble-setup/README.md b/community/modules/scripts/ramble-setup/README.md index cc4620fe9c..0e0a0a3093 100644 --- a/community/modules/scripts/ramble-setup/README.md +++ b/community/modules/scripts/ramble-setup/README.md @@ -71,19 +71,29 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.0 | +| [terraform](#requirement\_terraform) | >= 1.0.0 | +| [google](#requirement\_google) | >= 4.42 | +| [local](#requirement\_local) | >= 2.0.0 | ## Providers -No providers. +| Name | Version | +|------|---------| +| [google](#provider\_google) | >= 4.42 | +| [local](#provider\_local) | >= 2.0.0 | ## Modules -No modules. +| Name | Source | Version | +|------|--------|---------| +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 | ## Resources -No resources. +| Name | Type | +|------|------| +| [google_storage_bucket.bucket](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket) | resource | +| [local_file.debug_file_shell_install](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | ## Inputs @@ -92,15 +102,25 @@ No resources. | [chgrp\_group](#input\_chgrp\_group) | Group to chgrp the Ramble clone to. Default will not modify the clone. | `string` | `null` | no | | [chmod\_mode](#input\_chmod\_mode) | Mode to chmod the Ramble clone to. Defaults to null (i.e. do not modify).
For usage information see:
https://docs.ansible.com/ansible/latest/collections/ansible/builtin/file_module.html#parameter-mode | `string` | `null` | no | | [chown\_owner](#input\_chown\_owner) | Owner to chown the Ramble clone to. Default will not modify the clone. | `string` | `null` | no | +| [deployment\_name](#input\_deployment\_name) | Name of deployment, used to name bucket containing startup script. | `string` | n/a | yes | | [install\_dir](#input\_install\_dir) | Destination directory of installation of Ramble. | `string` | `"/apps/ramble"` | no | +| [labels](#input\_labels) | Key-value pairs of labels to be added to created resources. | `map(string)` | n/a | yes | +| [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | +| [ramble\_profile\_script\_path](#input\_ramble\_profile\_script\_path) | Path to the Ramble profile.d script. Created by this module | `string` | `"/etc/profile.d/ramble.sh"` | no | | [ramble\_ref](#input\_ramble\_ref) | Git ref to checkout for Ramble. | `string` | `"develop"` | no | | [ramble\_url](#input\_ramble\_url) | URL for Ramble repository to clone. | `string` | `"https://github.com/GoogleCloudPlatform/ramble"` | no | +| [ramble\_virtualenv\_path](#input\_ramble\_virtualenv\_path) | Virtual environment path in which to install Ramble Python interpreter and other dependencies | `string` | `"/usr/local/ramble-python"` | no | +| [region](#input\_region) | Region to place bucket containing startup script. | `string` | n/a | yes | ## Outputs | Name | Description | |------|-------------| +| [controller\_startup\_script](#output\_controller\_startup\_script) | Ramble installation script, duplicate for SLURM controller. | +| [gcs\_bucket\_path](#output\_gcs\_bucket\_path) | Bucket containing the startup scripts for Ramble, to be reused by ramble-execute module. | | [ramble\_path](#output\_ramble\_path) | Location ramble is installed into. | +| [ramble\_profile\_script\_path](#output\_ramble\_profile\_script\_path) | Path to Ramble profile script. | | [ramble\_ref](#output\_ramble\_ref) | Git ref the ramble install is checked out to use | -| [ramble\_runner](#output\_ramble\_runner) | Runner to setup Ramble using an ansible playbook. The startup-script module
will automatically handle installation of ansible.
- id: example-startup-script
source: modules/scripts/startup-script
settings:
runners:
- $(your-ramble-id.ramble\_setup\_runner)
... | +| [ramble\_runner](#output\_ramble\_runner) | Runner to be used with startup-script module or passed to ramble-execute module.
- installs Ramble dependencies
- installs Ramble
- generates profile.d script to enable access to Ramble
This is safe to run in parallel by multiple machines. | +| [startup\_script](#output\_startup\_script) | Ramble installation script. | diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index 369fd8e6fc..034b434931 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -14,16 +14,21 @@ * limitations under the License. */ +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "ramble-setup", ghpc_role = "scripts" }) +} + locals { profile_script = <<-EOF if [ -f ${var.install_dir}/share/ramble/setup-env.sh ]; then - echo "** Ramble's python virtualenv (/usr/local/ramble-python) is actiavted. Call 'deactivate' to deactivate." + test -t 1 && echo "** Ramble's python virtualenv (/usr/local/ramble-python) is activated. Call 'deactivate' to deactivate." . /usr/local/ramble-python/bin/activate . ${var.install_dir}/share/ramble/setup-env.sh fi EOF - setup_file = templatefile( + script_content = templatefile( "${path.module}/templates/ramble_setup.yml.tftpl", { sw_name = "ramble" @@ -35,24 +40,63 @@ locals { chgrp_group = var.chgrp_group == null ? "" : var.chgrp_group chmod_mode = var.chmod_mode == null ? "" : var.chmod_mode finalize_setup_script = "echo 'no finalize setup script'" + profile_script_path = var.ramble_profile_script_path } ) - deps_file = templatefile( + deps_content = templatefile( "${path.module}/templates/install_ramble_deps.yml.tpl", { ramble_ref = var.ramble_ref } ) - ramble_runner_content = <<-EOT - ${local.setup_file} - ${local.deps_file} - EOT + install_ramble_deps_runner = { + "type" = "ansible-local" + "content" = local.deps_content + "destination" = "install_ramble_deps.yml" + "args" = "-e ramble_virtualenv_path=${var.ramble_virtualenv_path}" + } - ramble_setup_runner = { + install_ramble_runner = { "type" = "ansible-local" - "content" = local.ramble_runner_content - "destination" = "ramble_setup.yml" + "content" = local.script_content + "destination" = "install_ramble.yml" + } + + bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}"), 0, 4) + bucket_name = "ramble-scripts-${local.bucket_md5}" + runners = [local.install_ramble_deps_runner, local.install_ramble_runner] + + combined_runner = { + "type" = "shell" + "content" = module.startup_script.startup_script + "destination" = "ramble-install-and-setup.sh" } + +} + +resource "google_storage_bucket" "bucket" { + project = var.project_id + name = local.bucket_name + uniform_bucket_level_access = true + location = var.region + storage_class = "REGIONAL" + labels = local.labels +} + +module "startup_script" { + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4" + + labels = local.labels + project_id = var.project_id + deployment_name = var.deployment_name + region = var.region + runners = local.runners + gcs_bucket_path = "gs://${google_storage_bucket.bucket.name}" +} + +resource "local_file" "debug_file_shell_install" { + content = local.script_content + filename = "${path.module}/debug_install.yml" } diff --git a/community/modules/scripts/ramble-setup/outputs.tf b/community/modules/scripts/ramble-setup/outputs.tf index 381e5332c7..46cc6b08c9 100644 --- a/community/modules/scripts/ramble-setup/outputs.tf +++ b/community/modules/scripts/ramble-setup/outputs.tf @@ -14,18 +14,25 @@ * limitations under the License. 
*/ +output "startup_script" { + description = "Ramble installation script." + value = module.startup_script.startup_script +} + +output "controller_startup_script" { + description = "Ramble installation script, duplicate for SLURM controller." + value = module.startup_script.startup_script +} + output "ramble_runner" { description = <<-EOT - Runner to setup Ramble using an ansible playbook. The startup-script module - will automatically handle installation of ansible. - - id: example-startup-script - source: modules/scripts/startup-script - settings: - runners: - - $(your-ramble-id.ramble_setup_runner) - ... + Runner to be used with startup-script module or passed to ramble-execute module. + - installs Ramble dependencies + - installs Ramble + - generates profile.d script to enable access to Ramble + This is safe to run in parallel by multiple machines. EOT - value = local.ramble_setup_runner + value = local.combined_runner } output "ramble_path" { @@ -37,3 +44,13 @@ output "ramble_ref" { description = "Git ref the ramble install is checked out to use" value = var.ramble_ref } + +output "gcs_bucket_path" { + description = "Bucket containing the startup scripts for Ramble, to be reused by ramble-execute module." + value = "gs://${google_storage_bucket.bucket.name}" +} + +output "ramble_profile_script_path" { + description = "Path to Ramble profile script." + value = var.ramble_profile_script_path +} diff --git a/community/modules/scripts/ramble-setup/templates/install_ramble_deps.yml.tpl b/community/modules/scripts/ramble-setup/templates/install_ramble_deps.yml.tpl index bc1b09cb1c..39a2ac2c2e 100644 --- a/community/modules/scripts/ramble-setup/templates/install_ramble_deps.yml.tpl +++ b/community/modules/scripts/ramble-setup/templates/install_ramble_deps.yml.tpl @@ -35,6 +35,7 @@ ansible.builtin.pip: name: pip>=21.3.1 virtualenv: "{{ ramble_virtualenv_path }}" + virtualenv_command: /usr/bin/python3 -m venv - name: Download ramble requirements file ansible.builtin.get_url: @@ -45,3 +46,4 @@ ansible.builtin.pip: requirements: /tmp/requirements.txt virtualenv: "{{ ramble_virtualenv_path }}" + virtualenv_command: /usr/bin/python3 -m venv diff --git a/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tftpl b/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tftpl index d54946c7ab..2a6e1af891 100644 --- a/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tftpl +++ b/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tftpl @@ -24,6 +24,7 @@ chown_owner: ${chown_owner} chgrp_group: ${chgrp_group} finalize_setup_script: ${finalize_setup_script} + profile_script_path: ${profile_script_path} tasks: - name: Print software name ansible.builtin.debug: @@ -31,7 +32,7 @@ - name: Add profile script for software ansible.builtin.copy: - dest: /etc/profile.d/{{ sw_name }}.sh + dest: "{{ profile_script_path }}" mode: '0644' content: "{{ profile_script }}" when: profile_script @@ -52,10 +53,19 @@ changed_when: lock_out.rc == 0 failed_when: false - - name: Clones into installation directory + - name: Clone branch or tag into installation directory ansible.builtin.command: git clone --branch {{ git_ref }} {{ git_url }} {{ install_dir }} + failed_when: false + register: clone_res when: lock_out.rc == 0 + - name: Clone commit hash into installation directory + ansible.builtin.command: "{{ item }}" + with_items: + - git clone {{ git_url }} {{ install_dir }} + - git --git-dir={{ install_dir }}/.git checkout {{ git_ref }} + when: lock_out.rc == 0 and 
clone_res.rc != 0 + - name: Set ownership and permissions ansible.builtin.file: path: "{{ install_dir }}" diff --git a/community/modules/scripts/ramble-setup/variables.tf b/community/modules/scripts/ramble-setup/variables.tf index 6eb9b63930..7b69f6ddc6 100644 --- a/community/modules/scripts/ramble-setup/variables.tf +++ b/community/modules/scripts/ramble-setup/variables.tf @@ -14,6 +14,11 @@ * limitations under the License. */ +variable "project_id" { + description = "Project in which the HPC deployment will be created." + type = string +} + variable "install_dir" { description = "Destination directory of installation of Ramble." default = "/apps/ramble" @@ -53,3 +58,30 @@ variable "chmod_mode" { default = null type = string } + +variable "ramble_virtualenv_path" { + description = "Virtual environment path in which to install Ramble Python interpreter and other dependencies" + default = "/usr/local/ramble-python" + type = string +} + +variable "deployment_name" { + description = "Name of deployment, used to name bucket containing startup script." + type = string +} + +variable "region" { + description = "Region to place bucket containing startup script." + type = string +} + +variable "labels" { + description = "Key-value pairs of labels to be added to created resources." + type = map(string) +} + +variable "ramble_profile_script_path" { + description = "Path to the Ramble profile.d script. Created by this module" + type = string + default = "/etc/profile.d/ramble.sh" +} diff --git a/community/modules/scripts/ramble-setup/versions.tf b/community/modules/scripts/ramble-setup/versions.tf index 015078f17c..936b4a5b80 100644 --- a/community/modules/scripts/ramble-setup/versions.tf +++ b/community/modules/scripts/ramble-setup/versions.tf @@ -15,5 +15,16 @@ */ terraform { - required_version = ">= 1.0" + required_version = ">= 1.0.0" + required_providers { + google = { + source = "hashicorp/google" + version = ">= 4.42" + } + + local = { + source = "hashicorp/local" + version = ">= 2.0.0" + } + } } diff --git a/community/modules/scripts/spack-execute/README.md b/community/modules/scripts/spack-execute/README.md index 8c51ab2041..e78e3894bb 100644 --- a/community/modules/scripts/spack-execute/README.md +++ b/community/modules/scripts/spack-execute/README.md @@ -92,19 +92,19 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.0.0 | -| [local](#requirement\_local) | ~> 2.0.0 | +| [local](#requirement\_local) | >= 2.0.0 | ## Providers | Name | Version | |------|---------| -| [local](#provider\_local) | ~> 2.0.0 | +| [local](#provider\_local) | >= 2.0.0 | ## Modules | Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.22.1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 | ## Resources @@ -124,14 +124,16 @@ limitations under the License. | [log\_file](#input\_log\_file) | Defines the logfile that script output will be written to | `string` | `"/var/log/spack.log"` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | | [region](#input\_region) | Region to place bucket containing spack scripts. | `string` | n/a | yes | +| [spack\_profile\_script\_path](#input\_spack\_profile\_script\_path) | Path to the Spack profile.d script. 
Created by an instance of spack-setup.
Can be defined explicitly, or by chaining an instance of a spack-setup module
through a `use` setting. | `string` | n/a | yes | | [spack\_runner](#input\_spack\_runner) | Runner from previous spack-setup or spack-execute to be chained with scripts generated by this module. |
object({
type = string
content = string
destination = string
})
| n/a | yes | ## Outputs | Name | Description | |------|-------------| -| [controller\_startup\_script](#output\_controller\_startup\_script) | Path to the Spack installation script, duplicate for SLURM controller. | +| [controller\_startup\_script](#output\_controller\_startup\_script) | Spack startup script, duplicate for SLURM controller. | | [gcs\_bucket\_path](#output\_gcs\_bucket\_path) | Bucket containing the startup scripts for spack, to be reused by spack-execute module. | +| [spack\_profile\_script\_path](#output\_spack\_profile\_script\_path) | Path to the Spack profile.d script. | | [spack\_runner](#output\_spack\_runner) | Single runner that combines scripts from this module and any previously chained spack-execute or spack-setup modules. | -| [startup\_script](#output\_startup\_script) | Path to the Spack installation script. | +| [startup\_script](#output\_startup\_script) | Spack startup script. | diff --git a/community/modules/scripts/spack-execute/main.tf b/community/modules/scripts/spack-execute/main.tf index a7d8aa1537..5d50a43d8c 100644 --- a/community/modules/scripts/spack-execute/main.tf +++ b/community/modules/scripts/spack-execute/main.tf @@ -25,7 +25,7 @@ locals { execute_contents = templatefile( "${path.module}/templates/execute_commands.yml.tpl", { - pre_script = ". /etc/profile.d/spack.sh" + pre_script = ". ${var.spack_profile_script_path}" log_file = var.log_file commands = local.commands_content } @@ -53,7 +53,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.22.1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/spack-execute/outputs.tf b/community/modules/scripts/spack-execute/outputs.tf index 03904bc61f..9fd7e9ae1b 100644 --- a/community/modules/scripts/spack-execute/outputs.tf +++ b/community/modules/scripts/spack-execute/outputs.tf @@ -15,12 +15,12 @@ */ output "startup_script" { - description = "Path to the Spack installation script." + description = "Spack startup script." value = module.startup_script.startup_script } output "controller_startup_script" { - description = "Path to the Spack installation script, duplicate for SLURM controller." + description = "Spack startup script, duplicate for SLURM controller." value = module.startup_script.startup_script } @@ -33,3 +33,8 @@ output "gcs_bucket_path" { description = "Bucket containing the startup scripts for spack, to be reused by spack-execute module." value = var.gcs_bucket_path } + +output "spack_profile_script_path" { + description = "Path to the Spack profile.d script." + value = var.spack_profile_script_path +} diff --git a/community/modules/scripts/spack-execute/variables.tf b/community/modules/scripts/spack-execute/variables.tf index f8a67c9526..8436680d81 100644 --- a/community/modules/scripts/spack-execute/variables.tf +++ b/community/modules/scripts/spack-execute/variables.tf @@ -87,3 +87,12 @@ variable "gcs_bucket_path" { description = "The GCS path for storage bucket and the object, starting with `gs://`." type = string } + +variable "spack_profile_script_path" { + description = <<-EOT + Path to the Spack profile.d script. Created by an instance of spack-setup. + Can be defined explicitly, or by chaining an instance of a spack-setup module + through a `use` setting. 
+ EOT + type = string +} diff --git a/community/modules/scripts/spack-execute/versions.tf b/community/modules/scripts/spack-execute/versions.tf index c9927a3d15..09583c3d43 100644 --- a/community/modules/scripts/spack-execute/versions.tf +++ b/community/modules/scripts/spack-execute/versions.tf @@ -19,7 +19,7 @@ terraform { required_providers { local = { source = "hashicorp/local" - version = "~> 2.0.0" + version = ">= 2.0.0" } } } diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md index cd3befaaad..fde659586c 100644 --- a/community/modules/scripts/spack-setup/README.md +++ b/community/modules/scripts/spack-setup/README.md @@ -220,7 +220,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.22.1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 | ## Resources @@ -253,6 +253,7 @@ limitations under the License. | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | | [region](#input\_region) | Region to place bucket containing startup script. | `string` | n/a | yes | | [spack\_cache\_url](#input\_spack\_cache\_url) | DEPRECATED

Use [spack-execute](../spack-execute/) module with the following `commands` can be used to add a build cache:
spack mirror add --scope site  gs://my-build-cache
spack buildcache keys --install --trust
List of build caches for Spack. |
list(object({
mirror_name = string
mirror_url = string
}))
| `null` | no | +| [spack\_profile\_script\_path](#input\_spack\_profile\_script\_path) | Path to the Spack profile.d script. Created by this module | `string` | `"/etc/profile.d/spack.sh"` | no | | [spack\_ref](#input\_spack\_ref) | Git ref to checkout for spack. | `string` | `"v0.20.0"` | no | | [spack\_url](#input\_spack\_url) | URL to clone the spack repo from. | `string` | `"https://github.com/spack/spack"` | no | | [spack\_virtualenv\_path](#input\_spack\_virtualenv\_path) | Virtual environment path in which to install Spack Python interpreter and other dependencies | `string` | `"/usr/local/spack-python"` | no | @@ -261,9 +262,10 @@ limitations under the License. | Name | Description | |------|-------------| -| [controller\_startup\_script](#output\_controller\_startup\_script) | Path to the Spack installation script, duplicate for SLURM controller. | +| [controller\_startup\_script](#output\_controller\_startup\_script) | Spack installation script, duplicate for SLURM controller. | | [gcs\_bucket\_path](#output\_gcs\_bucket\_path) | Bucket containing the startup scripts for spack, to be reused by spack-execute module. | | [spack\_path](#output\_spack\_path) | Path to the root of the spack installation | +| [spack\_profile\_script\_path](#output\_spack\_profile\_script\_path) | Path to the Spack profile.d script. | | [spack\_runner](#output\_spack\_runner) | Runner to be used with startup-script module or passed to spack-execute module.
- installs Spack dependencies
- installs Spack
- generates profile.d script to enable access to Spack
This is safe to run in parallel by multiple machines. Use in place of deprecated `setup_spack_runner`. | -| [startup\_script](#output\_startup\_script) | Path to the Spack installation script. | +| [startup\_script](#output\_startup\_script) | Spack installation script. | diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf index 36803c9900..1837b811c1 100644 --- a/community/modules/scripts/spack-setup/main.tf +++ b/community/modules/scripts/spack-setup/main.tf @@ -23,6 +23,7 @@ locals { profile_script = <<-EOF SPACK_PYTHON=${var.spack_virtualenv_path}/bin/python3 if [ -f ${var.install_dir}/share/spack/setup-env.sh ]; then + test -t 1 && echo "Running Spack setup, this may take a moment on first login." . ${var.install_dir}/share/spack/setup-env.sh fi EOF @@ -38,7 +39,7 @@ locals { finalize_setup_script = <<-EOF set -e - . /etc/profile.d/spack.sh + . ${var.spack_profile_script_path} spack config --scope site add 'packages:all:permissions:read:world' spack gpg init spack compiler find --scope site @@ -57,6 +58,7 @@ locals { chgrp_group = var.chgrp_group == null ? "" : var.chgrp_group chmod_mode = var.chmod_mode == null ? "" : var.chmod_mode finalize_setup_script = indent(4, yamlencode(local.finalize_setup_script)) + profile_script_path = var.spack_profile_script_path } ) install_spack_deps_runner = { @@ -92,7 +94,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.22.1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/spack-setup/outputs.tf b/community/modules/scripts/spack-setup/outputs.tf index 7c5b548c9c..546cea332c 100644 --- a/community/modules/scripts/spack-setup/outputs.tf +++ b/community/modules/scripts/spack-setup/outputs.tf @@ -15,12 +15,12 @@ */ output "startup_script" { - description = "Path to the Spack installation script." + description = "Spack installation script." value = module.startup_script.startup_script } output "controller_startup_script" { - description = "Path to the Spack installation script, duplicate for SLURM controller." + description = "Spack installation script, duplicate for SLURM controller." value = module.startup_script.startup_script } @@ -44,3 +44,8 @@ output "gcs_bucket_path" { description = "Bucket containing the startup scripts for spack, to be reused by spack-execute module." value = "gs://${google_storage_bucket.bucket.name}" } + +output "spack_profile_script_path" { + description = "Path to the Spack profile.d script." 
+ value = var.spack_profile_script_path +} diff --git a/community/modules/scripts/spack-setup/scripts/install_spack_deps.yml b/community/modules/scripts/spack-setup/scripts/install_spack_deps.yml index dd7d418e05..416df4017a 100644 --- a/community/modules/scripts/spack-setup/scripts/install_spack_deps.yml +++ b/community/modules/scripts/spack-setup/scripts/install_spack_deps.yml @@ -41,8 +41,10 @@ ansible.builtin.pip: name: pip>=21.3.1 virtualenv: "{{ spack_virtualenv_path }}" + virtualenv_command: /usr/bin/python3 -m venv - name: Add google-cloud-storage to Spack virtualenv ansible.builtin.pip: name: google-cloud-storage virtualenv: "{{ spack_virtualenv_path }}" + virtualenv_command: /usr/bin/python3 -m venv diff --git a/community/modules/scripts/spack-setup/templates/spack_setup.yml.tftpl b/community/modules/scripts/spack-setup/templates/spack_setup.yml.tftpl index d54946c7ab..2a6e1af891 100644 --- a/community/modules/scripts/spack-setup/templates/spack_setup.yml.tftpl +++ b/community/modules/scripts/spack-setup/templates/spack_setup.yml.tftpl @@ -24,6 +24,7 @@ chown_owner: ${chown_owner} chgrp_group: ${chgrp_group} finalize_setup_script: ${finalize_setup_script} + profile_script_path: ${profile_script_path} tasks: - name: Print software name ansible.builtin.debug: @@ -31,7 +32,7 @@ - name: Add profile script for software ansible.builtin.copy: - dest: /etc/profile.d/{{ sw_name }}.sh + dest: "{{ profile_script_path }}" mode: '0644' content: "{{ profile_script }}" when: profile_script @@ -52,10 +53,19 @@ changed_when: lock_out.rc == 0 failed_when: false - - name: Clones into installation directory + - name: Clone branch or tag into installation directory ansible.builtin.command: git clone --branch {{ git_ref }} {{ git_url }} {{ install_dir }} + failed_when: false + register: clone_res when: lock_out.rc == 0 + - name: Clone commit hash into installation directory + ansible.builtin.command: "{{ item }}" + with_items: + - git clone {{ git_url }} {{ install_dir }} + - git --git-dir={{ install_dir }}/.git checkout {{ git_ref }} + when: lock_out.rc == 0 and clone_res.rc != 0 + - name: Set ownership and permissions ansible.builtin.file: path: "{{ install_dir }}" diff --git a/community/modules/scripts/spack-setup/variables.tf b/community/modules/scripts/spack-setup/variables.tf index 6d8a7a3397..f8b49177c9 100644 --- a/community/modules/scripts/spack-setup/variables.tf +++ b/community/modules/scripts/spack-setup/variables.tf @@ -322,3 +322,9 @@ EOT error_message = "environments is deprecated. Use spack-execute.commands instead. See variable documentation for proposed alternative commands." } } + +variable "spack_profile_script_path" { + description = "Path to the Spack profile.d script. 
Created by this module" + type = string + default = "/etc/profile.d/spack.sh" +} diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index cd0a0b959e..8c6e5cebe5 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.25.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index b7a328a140..9cbc93beef 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.25.0" } required_version = ">= 0.14.0" diff --git a/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml b/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml index 3846967901..5aa72a42dd 100644 --- a/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml +++ b/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml @@ -281,11 +281,10 @@ deployment_groups: - bucket-output - bucket-software settings: + add_deployment_name_before_prefix: true name_prefix: chrome-remote-desktop install_nvidia_driver: true startup_script: | - sudo apt-get update - sudo apt-get -y upgrade find /user_provided_software -name vmd-1.9.*.bin.LINUXAMD64*.tar.gz -exec tar xvzf '{}' -C . \; cd vmd-1.9.*/ ./configure diff --git a/docs/vm-images.md b/docs/vm-images.md index 231fda074d..2e437fcf72 100644 --- a/docs/vm-images.md +++ b/docs/vm-images.md @@ -223,19 +223,23 @@ description of our support for Windows images. ✓ ✓ - HTCondor - ✓✓ + + ✓ + ✓ + - Omnia - ✓ + + + ✓ + diff --git a/examples/README.md b/examples/README.md index bbb02be714..aa0d183191 100644 --- a/examples/README.md +++ b/examples/README.md @@ -18,6 +18,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [serverless-batch.yaml](#serverless-batchyaml-) ![core-badge] * [serverless-batch-mpi.yaml](#serverless-batch-mpiyaml-) ![core-badge] * [pfs-lustre.yaml](#pfs-lustreyaml-) ![core-badge] + * [cae-slurm.yaml](#cae-slurmyaml-) ![core-badge] * [hpc-slurm-ubuntu2004.yaml](#hpc-slurm-ubuntu2004yaml-) ![community-badge] * [hpc-intel-select-slurm.yaml](#hpc-intel-select-slurmyaml-) ![community-badge] * [pfs-daos.yaml](#pfs-daosyaml-) ![community-badge] @@ -534,6 +535,39 @@ For this example the following is needed in the selected region: [pfs-lustre.yaml]: ./pfs-lustre.yaml +### [cae-slurm.yaml] ![core-badge] + +The Computer Aided Engineering (CAE) blueprint captures a reference architecture +where the right cloud components are assembled to optimally cater to the +requirements of computationally-intensive CAE workloads. Specifically, it is +architected around Google Cloud’s VM families that provide a high memory bandwidth +and a balanced memory/flop ratio, which is particularly useful for per-core licensed +CAE software. The solution caters also to large CAE use cases, requiring multiple nodes +that are tightly-coupled via MPI. 
Special high-memory shapes support even very
+memory-demanding workloads with up to 16GB/core. For file IO, different Google-managed
+high-performance NFS storage services are available. For very IO-demanding workloads,
+third-party parallel file systems can be integrated. The scheduling of the workloads
+is done by a workload manager.
+
+The CAE blueprint is intended to be a starting point for more tailored explorations
+or installations of specific CAE codes, as provided by ISVs separately.
+
+Detailed documentation is provided in this [README](cae/README.md).
+
+#### Quota Requirements for cae-slurm.yaml
+
+For this example the following is needed in the selected region:
+
+* Cloud Filestore API: Basic SSD capacity (GB) per region: **5,120 GB**
+* Cloud Filestore API: High Scale SSD capacity (GB) per region: **10,240 GB**
+* Compute Engine API: H3 CPUs: **88/node** active in `balance` partition up to 880
+* Compute Engine API: C3-highmem CPUs: **176/node** active in `highmem` partition up to 1,760
+* Compute Engine API: N1 CPUs: **8/node** active in `desktop` partition up to 40
+* Compute Engine API: T4 GPUs: **1/node** active in `desktop` partition up to 5
+* Compute Engine API: N2 CPUs: **8** for login and **16** for controller
+
+[cae-slurm.yaml]: ../examples/cae/cae-slurm.yaml
+
 ### [hpc-slurm-ubuntu2004.yaml] ![community-badge]
 
 > **Warning**: The variables `enable_reconfigure`,
diff --git a/examples/cae/README.md b/examples/cae/README.md
new file mode 100644
index 0000000000..9dc91cf1f6
--- /dev/null
+++ b/examples/cae/README.md
@@ -0,0 +1,246 @@
+# Computer Aided Engineering (CAE) Reference Architecture
+
+The Computer Aided Engineering (CAE) [blueprint](./cae-slurm.yaml) in
+this folder captures a reference architecture where the right cloud components
+are assembled to optimally cater to the requirements of computationally-intensive
+CAE workloads. Specifically, it is architected around Google Cloud’s VM families
+that provide a high memory bandwidth and a balanced memory/flop ratio, which is
+particularly useful for per-core licensed CAE software. The solution caters also
+to large CAE use cases, requiring multiple nodes that are tightly-coupled via MPI.
+Special high-memory shapes support even very memory-demanding workloads with up
+to 16GB/core. For file IO, different Google-managed high-performance NFS storage
+services are available. For very IO-demanding workloads, third-party parallel file
+systems can be integrated. The scheduling of the workloads is done by a workload
+manager.
+
+## Architecture
+The CAE blueprint is intended to be a starting point for more tailored explorations
+or installations of specific CAE codes, as provided by ISVs separately.
+
+This blueprint features a general setup suited for CAE applications on GCP
+including:
+
+- Google's H3 VMs, ideally suited for CAE workloads
+- Google's C3-highmem VM, suited for workloads with 8GB/core requirement
+- Google's Filestore NFS-based shared storage
+- Google's Chrome Remote Desktop
+- SLURM workload scheduler
+
+## Getting Started
+To explore the reference architecture, you should follow these steps:
+
+Before you start, make sure your prerequisites and dependencies are set up:
+[Set up Cloud HPC Toolkit](https://cloud.google.com/hpc-toolkit/docs/setup/configure-environment).
+
+To deploy the CAE reference blueprint, follow the
+[Deployment Instructions](#deployment-instructions).
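+
+For orientation, the abridged sketch below shows the overall layout of the blueprint;
+the deployment stages it contains are described in the next section, and the module
+lists (elided here) can be found in the full [cae-slurm.yaml](./cae-slurm.yaml):
+
+``` {.yaml}
+# Abridged structure only; a sketch, not the full blueprint.
+blueprint_name: cae-slurm
+deployment_groups:
+- group: setup                     # VPC, /home and /apps Filestore shares, dashboard
+  modules: []                      # elided, see cae-slurm.yaml
+# - group: software_installation   # optional stub, commented out by default
+#   modules: []
+- group: cluster                   # scratch share, partitions, remote desktop, Slurm
+  modules: []                      # elided, see cae-slurm.yaml
+```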
+
+For pointers on how to proceed with the installation of ISV software, please see the section
+[Software Installation Patterns](#software-installation-patterns).
+
+### Deployment Stages
+
+This blueprint has the following deployment groups:
+
+- `setup`: Sets up backbone infrastructure such as networking, file systems, and
+  monitoring.
+- `software_installation` (_optional_): This deployment group is a stub for
+  custom software installation on the network storage before the cluster is brought up.
+- `cluster`: Deploys an auto-scaling cluster and remote desktop.
+
+Having multiple deployment groups decouples the life cycle of some
+infrastructure. For example, a) you can tear down the cluster while leaving the
+storage intact and b) you can build software before you deploy your cluster.
+
+## Deployment Instructions
+
+> [!WARNING]
+> Installing this blueprint uses the following billable components of Google
+> Cloud:
+>
+> - Compute Engine
+> - Filestore
+>
+> To avoid continued billing after use, closely follow the
+> [teardown instructions](#teardown-instructions). To generate a cost estimate based on
+> your projected usage, use the [pricing calculator](https://cloud.google.com/products/calculator).
+>
+> [!WARNING]
+> Before attempting to execute the following instructions, it is important to
+> consider your project's quota. The `cae-slurm.yaml` blueprint creates an
+> autoscaling cluster that, when fully scaled up, can deploy up to 10
+> `h3-standard-88` and up to 10 `c3-highmem-176` VMs.
+>
+> To fully scale up this cluster, the project would require quota for:
+>
+> - Compute Node Group
+>   - 88 CPUs * 10 VMs = **880 `H3 CPUs`**
+>   - 176 CPUs * 10 VMs = **1760 `C3 CPUs`**
+> - Remote Desktop Group
+>   - **40 `N1 CPUs`**
+>   - **5 `T4 GPUs`**
+> - Slurm Login & Controller VM
+>   - **24 `N2 CPUs`**
+> - Filestore
+>   - **2x `Basic SSD`**
+>   - **1x `High Scale SSD`**
+>
+> However, this is merely an example sizing for an instance of this reference architecture.
+> Node counts and remote desktop seats can easily be adjusted in the blueprint.
+
+1. Clone the repo
+
+   ```bash
+   git clone https://github.com/GoogleCloudPlatform/hpc-toolkit.git
+   cd hpc-toolkit
+   ```
+
+1. Build the HPC Toolkit
+
+   ```bash
+   make
+   ```
+
+1. Generate the deployment folder after replacing `<project_id>` with the project
+   id.
+
+   ```bash
+   ./ghpc create examples/cae/cae-slurm.yaml -w --vars project_id=<project_id>
+   ```
+
+1. Deploy the `setup` group
+
+   Call the following ghpc command to deploy the cae-slurm blueprint.
+
+   ```bash
+   ./ghpc deploy cae-slurm
+   ```
+
+   The next `ghpc` prompt will ask you to **display**, **apply**, **stop**, or
+   **continue** without applying the `setup` group. Select 'apply'.
+
+   This group will create a network and file systems to be used by the cluster.
+
+   > [!WARNING]
+   > This ghpc command will run through 2 deployment groups (3 if you populate
+   > & activate the `software_installation` stage) and prompt you to apply each one.
+   > If the command is cancelled or exited by accident before finishing, it can
+   > be rerun to continue deploying the blueprint.
+
+1. Deploy the `software_installation` group (_optional_).
+
+   > [!NOTE]
+   > Installation processes differ between applications. Some come as a
+   > precompiled binary with all dependencies included, others may need to
+   > be built from source, while others can be deployed through package
+   > managers such as Spack. This deployment group is intended to be used
+   > if the software installation process requires a substantial amount of time (e.g.
+   > compilation from source). By building the software in a separate
+   > deployment group, this process can be done before the cluster is
+   > up, minimizing costs.
+   >
+   > [!NOTE]
+   > By default, this deployment group is disabled in the reference design. See
+   > [Software Installation Patterns](#software-installation-patterns) for more information.
+
+   If this deployment group is used (it needs to be uncommented in the blueprint first),
+   you can return to the ghpc command which will ask you to **display**, **apply**,
+   **stop**, or **continue** without applying the `software_installation` group.
+   Select 'apply'.
+
+1. Deploy the `cluster` group
+
+   The next `ghpc` prompt will ask you to **display**, **apply**, **stop**, or
+   **continue** without applying the `cluster` group. Select 'apply'.
+
+   This deployment group contains the Slurm cluster with compute partitions
+   and a partition for Chrome Remote Desktop visualization nodes.
+
+1. Set up Chrome Remote Desktop
+
+   One or multiple Chrome Remote Desktop (CRD) sessions can be started dynamically
+   through Slurm.
+
+   - Follow
+     [the instructions](../README.md#hpc-slurm-chromedesktopyaml--)
+     for setting up the Remote Desktop.
+
+## Teardown Instructions
+
+> [!NOTE]
+> If you created a new project for testing of the CAE solution, the easiest way to
+> eliminate billing is to delete the project.
+
+When you would like to tear down the deployment, each stage must be destroyed.
+Since the `software_installation` and `cluster` stages depend on the network deployed
+in the `setup` stage, they must be destroyed first. You can use the following
+commands to destroy the deployment in this reverse order. You will be prompted
+to confirm the deletion of each stage.
+
+```bash
+./ghpc destroy cae-slurm
+```
+
+> [!WARNING]
+> If you do not destroy all three deployment groups then there may be continued
+> associated costs.
+
+## Software Installation Patterns
+
+This section is intended to illustrate how software can be installed in the context
+of the CAE reference solution.
+
+Depending on the software you want to use, different installation paths may be required.
+
+- **Installation with binary**
+  Commercial-off-the-shelf CAE applications typically come with
+  precompiled binaries which are provided by the ISV.
+
+  See the tutorials for
+  [Ansys Fluent](https://cloud.google.com/hpc-toolkit/docs/tutorials/ansys-fluent#install_ansys_fluent)
+  and for [Siemens Simcenter STAR-CCM+](https://cloud.google.com/hpc-toolkit/docs/simcenter-star-ccm/run-workload#configure_the_vm)
+  that illustrate this process.
+
+  In general, you need to bring the binaries to your CAE cluster, for which it is
+  useful to use a Google Cloud Storage bucket, which is accessible from any machine via
+  the gsutil command and which can be mounted in the cluster.
+
+  As this installation process only needs to be done once and may take some time,
+  we recommend doing it in a separate deployment group before you bring up the cluster.
+  The `software_installation` stage is meant to accommodate this. You can, for example,
+  bring up a dedicated VM
+
+  ``` {.yaml}
+  - id: sw-installer-vm
+    source: modules/compute/vm-instance
+    use: [network1, appsfs]
+    settings:
+      name_prefix: sw-installer
+      add_deployment_name_before_prefix: true
+      threads_per_core: 2
+      machine_type: c2-standard-16
+  ```
+
+  where you can follow the installation steps manually.
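+
+  Alternatively, the download-and-install steps can be scripted and run at boot. The
+  following is a minimal sketch, not a drop-in recipe: the bucket name, installer
+  binary, and installer flags are hypothetical placeholders that depend on the
+  actual ISV package.
+
+  ``` {.yaml}
+  - id: sw-install-script
+    source: modules/scripts/startup-script
+    settings:
+      runners:
+      - type: shell
+        destination: install_isv_sw.sh
+        content: |
+          #!/bin/bash
+          set -e
+          # gs://my-sw-bucket and the installer flags below are hypothetical placeholders
+          gsutil cp gs://my-sw-bucket/isv-installer.bin /tmp/isv-installer.bin
+          chmod +x /tmp/isv-installer.bin
+          /tmp/isv-installer.bin --silent --install-dir /apps/isv-sw
+  ```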
+  Using the toolkit's
+  [startup-script](../../modules/scripts/startup-script/README.md) module in this way,
+  the process can be automated.
+
+  Once that is completed, the software will persist on the NFS Filestore share for as long as you
+  do not destroy the `setup` stage.
+
+- **Installation from source/with package manager**
+  For open source software, you may want to compile the software from scratch or use a
+  package manager such as Spack for the installation. This process typically takes
+  a non-negligible amount of time (~hours). We therefore strongly suggest using
+  the `software_installation` stage for this purpose.
+
+  Please see the [HCLS Blueprint](../../docs/videos/healthcare-and-life-sciences/README.md) example
+  for how the `software_installation` stage can be used with the Spack package manager
+  to install all dependencies for a particular version of the software, including compiling
+  the software or its dependencies from source.
+
+  Please also see the [OpenFOAM](../../docs/tutorials/openfoam/spack-openfoam.md) example
+  for how this can be used to install the OpenFOAM software.
+
+  Once that is completed, the software will persist on the NFS Filestore share for as long as you
+  do not destroy the `setup` stage.
diff --git a/examples/cae/cae-slurm.yaml b/examples/cae/cae-slurm.yaml
new file mode 100644
index 0000000000..9b4c3b6f42
--- /dev/null
+++ b/examples/cae/cae-slurm.yaml
@@ -0,0 +1,254 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+#
+# ****************
+####### CAE Solution Blueprint #######
+# ****************
+#
+# This blueprint features a reference design suited for CAE applications on GCP.
+# It sets up the following infrastructure:
+# * Google's H3 VMs, ideally suited for CAE workloads
+# * Google's C3-highmem VM, suited for workloads with 16GB/core requirement
+# * Google's Filestore NFS-based shared storage
+# * Google's Chrome Remote Desktop
+# * SLURM workload scheduler
+#
+blueprint_name: cae-slurm
+vars:
+  project_id: ## Set GCP Project ID Here ##
+  deployment_name: cae-slurm
+  # check here for other regions with H3 deployments: https://cloud.google.com/compute/docs/regions-zones
+  # For example
+  # region: europe-west4
+  # zone: europe-west4-b
+  region: us-central1
+  zone: us-central1-a
+  # Visit https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family
+  # for a list of valid family options with Slurm; note: the image families for the compute nodes
+  # and the Chrome Remote Desktop (CRD) nodes need to have the same Slurm base.
+  instance_image:
+    family: slurm-gcp-5-9-hpc-centos-7
+    project: schedmd-slurm-public
+  crd_instance_image:
+    family: slurm-gcp-5-9-debian-11 # must be Debian for CRD
+    project: schedmd-slurm-public
+
+# Documentation for each of the modules used below can be found at
+# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md
+deployment_groups:
+
+# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+#
+#
+# Deployment Group: Setup
+#
+# Sets up VPC network, persistent NFS shares, dashboard
+# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+- group: setup
+  modules:
+
+  ####### Virtual Private Cloud Setup #######
+  # This creates a virtual private network for your cloud setup
+  - id: network1
+    source: modules/network/vpc
+    settings:
+      network_name: cae-slurm-net
+      subnetwork_name: primary-subnet
+
+  ####### User Home Storage #######
+  # This block creates an NFS file share for /home
+  - id: homefs
+    source: modules/file-system/filestore
+    use: [network1]
+    settings:
+      filestore_tier: BASIC_SSD
+      size_gb: 2560
+      filestore_share_name: homeshare
+      local_mount: /home
+
+  ####### Shared Software Storage #######
+  # This block creates NFS file share for shared software installations
+  - id: appsfs
+    source: modules/file-system/filestore
+    use: [network1]
+    settings:
+      filestore_tier: BASIC_SSD
+      size_gb: 2560
+      filestore_share_name: appsshare
+      local_mount: /apps
+
+  ####### Dashboard #######
+  # This module activates integration with a dashboard on the Google Cloud Console
+  - id: hpc_dash
+    source: modules/monitoring/dashboard
+    outputs: [instructions]
+
+# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+#
+#
+# Deployment Group: Software Installation
+#
+# This deployment group is a stub for installing software before
+# bringing up the actual cluster.
+# See the README.md for useful software deployment patterns.
+#
+# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+# - group: software_installation
+#   modules:
+
+# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+#
+#
+# Deployment Group: Cluster
+#
+# Provisions the actual CAE cluster with compute partitions,
+# remote desktop partition and connects to the previously set up
+# NFS shares.
+# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+- group: cluster
+  modules:
+
+  ####### Scratch #######
+  # This block creates an NFS file share for scratch. If you experience an IO bottleneck,
+  # consider using the more performant HIGH_SCALE_SSD tier with the following settings:
+  - id: scratchfs
+    source: modules/file-system/filestore
+    use: [network1]
+    settings:
+      filestore_tier: HIGH_SCALE_SSD
+      size_gb: 10240 # smallest size for HIGH_SCALE_SSD
+      filestore_share_name: scratchshare
+      local_mount: /scratch
+
+  # If you require maximum IO performance, you can consider bringing up a dedicated parallel
+  # file system, e.g. DDN Exascaler Lustre, Sycomp GPFS, or Intel DAOS.
+  # Note: Those solutions may have associated license costs.
+  #
+  # Please visit here for more information
+  # - DDN Exascaler Lustre: https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/community/modules/file-system/DDN-EXAScaler/README.md
+  # - Sycomp IBM Spectrum Scale: https://console.developers.google.com/marketplace/product/sycomp/sycomp-storage-fueled-by-ibm-spectrum-scale
+  # - Intel DAOS: https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/community/modules/file-system/Intel-DAOS/README.md
+
+  ####### Remote Desktop(s) #######
+  # This block enables a partition for nodes that support Chrome Remote Desktop
+  # See here for usage: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/examples#hpc-slurm-chromedesktopyaml--
+  - id: remotedesktop
+    source: community/modules/remote-desktop/chrome-remote-desktop
+    use: [network1]
+    settings:
+      install_nvidia_driver: true
+      # instance_count: 0 will create installation scripts only
+      # which can be used with Slurm node provisioning
+      instance_count: 0
+
+  - id: remotedesktop_node_group
+    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
+    settings:
+      machine_type: n1-standard-8
+      node_count_dynamic_max: 5
+      instance_image: $(vars.crd_instance_image)
+      guest_accelerator:
+      - type: nvidia-tesla-t4-vws
+        count: 1
+
+  - id: remotedesktop_partition
+    source: community/modules/compute/schedmd-slurm-gcp-v5-partition
+    use:
+    - network1
+    - homefs
+    - appsfs
+    - scratchfs
+    - remotedesktop
+    - remotedesktop_node_group
+    settings:
+      partition_name: desktop
+      enable_placement: false
+      partition_startup_scripts_timeout: 900
+
+  ####### Balanced partition #######
+  # This block creates a partition that uses GCP H3-standard VMs for regular jobs with 4GB/core
+  - id: h3_node_group
+    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
+    settings:
+      node_count_dynamic_max: 10
+      machine_type: h3-standard-88
+      disk_type: 'pd-balanced'
+      bandwidth_tier: 'gvnic_enabled'
+
+  - id: h3_partition
+    source: community/modules/compute/schedmd-slurm-gcp-v5-partition
+    use:
+    - network1
+    - homefs
+    - appsfs
+    - scratchfs
+    - h3_node_group
+    settings:
+      partition_name: balance
+      is_default: true
+
+  ####### High-Mem partition #######
+  # This block creates a partition that uses GCP C3-highmem VMs for jobs with a 16GB/core requirement
+  - id: c3_node_group
+    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
+    settings:
+      node_count_dynamic_max: 10
+      machine_type: c3-highmem-176
+      disk_type: 'pd-balanced'
+      bandwidth_tier: 'tier_1_enabled'
+
+  - id: c3_partition
+    source: community/modules/compute/schedmd-slurm-gcp-v5-partition
+    use:
+    - network1
+    - homefs
+    - appsfs
+    - scratchfs
+    - c3_node_group
+    settings:
+      partition_name: highmem
+
+  ####### Scheduler: SLURM #######
+  # This block creates a SLURM controller
+  - id: slurm_controller
+    source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller
+    use:
+    - network1
+    - homefs
+    - appsfs
+    - scratchfs
+    - h3_partition
+    - c3_partition
+    - remotedesktop_partition
+    settings:
+      machine_type: n2-standard-16
+      compute_startup_scripts_timeout: 900
+      cloud_parameters:
+        resume_rate: 0
+        resume_timeout: 900
+        suspend_rate: 0
+        suspend_timeout: 300
+        no_comma_params: false
+
+  ####### Scheduler: SLURM #######
+  # This block creates a SLURM login node
+  - id: slurm_login
+    source: community/modules/scheduler/schedmd-slurm-gcp-v5-login
+    use:
+    - network1
+    - slurm_controller
+    settings:
+      machine_type: n2-standard-8
diff --git a/go.mod b/go.mod
index 3f7449a5bd..0b2c5b814e 100644
--- a/go.mod
+++ b/go.mod
@@ -1,32 +1,32 @@
module hpc-toolkit -go 1.18 +go 1.20 require ( cloud.google.com/go/compute v1.23.0 // indirect cloud.google.com/go/storage v1.30.1 // indirect - github.com/go-git/go-git/v5 v5.9.0 - github.com/hashicorp/go-getter v1.7.2 + github.com/go-git/go-git/v5 v5.10.0 + github.com/hashicorp/go-getter v1.7.3 github.com/hashicorp/hcl v1.0.0 // indirect - github.com/hashicorp/hcl/v2 v2.18.0 + github.com/hashicorp/hcl/v2 v2.19.1 github.com/hashicorp/terraform-config-inspect v0.0.0-20221020162138-81db043ad408 github.com/otiai10/copy v1.14.0 github.com/pkg/errors v0.9.1 github.com/spf13/afero v1.10.0 github.com/spf13/cobra v1.7.0 - github.com/zclconf/go-cty v1.14.0 + github.com/zclconf/go-cty v1.14.1 golang.org/x/exp v0.0.0-20230108222341-4b8118a2686a - google.golang.org/genproto v0.0.0-20230913181813-007df8e322eb // indirect + google.golang.org/genproto v0.0.0-20231002182017-d307bd883b97 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) require ( github.com/go-git/go-billy/v5 v5.5.0 - github.com/google/go-cmp v0.5.9 + github.com/google/go-cmp v0.6.0 github.com/hashicorp/terraform-exec v0.19.0 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b - google.golang.org/api v0.144.0 + google.golang.org/api v0.148.0 ) require ( @@ -37,16 +37,16 @@ require ( github.com/hashicorp/terraform-json v0.17.1 // indirect github.com/rogpeppe/go-internal v1.11.0 // indirect golang.org/x/mod v0.12.0 // indirect - golang.org/x/sync v0.3.0 // indirect + golang.org/x/sync v0.4.0 // indirect golang.org/x/tools v0.13.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20230913181813-007df8e322eb // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20230920204549-e6e6cdab5c13 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20231002182017-d307bd883b97 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20231012201019-e917dd12ba7a // indirect ) require ( - cloud.google.com/go v0.110.7 // indirect + cloud.google.com/go v0.110.8 // indirect cloud.google.com/go/compute/metadata v0.2.3 // indirect - cloud.google.com/go/iam v1.1.1 // indirect + cloud.google.com/go/iam v1.1.2 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect github.com/ProtonMail/go-crypto v0.0.0-20230828082145-3c4c8a2d2371 // indirect github.com/acomagu/bufpipe v1.0.4 // indirect @@ -81,14 +81,14 @@ require ( github.com/ulikunitz/xz v0.5.10 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.13.0 // indirect - golang.org/x/net v0.15.0 // indirect - golang.org/x/oauth2 v0.12.0 // indirect - golang.org/x/sys v0.12.0 + golang.org/x/crypto v0.14.0 // indirect + golang.org/x/net v0.17.0 // indirect + golang.org/x/oauth2 v0.13.0 // indirect + golang.org/x/sys v0.13.0 golang.org/x/text v0.13.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.7 // indirect - google.golang.org/grpc v1.58.2 // indirect + google.golang.org/grpc v1.58.3 // indirect google.golang.org/protobuf v1.31.0 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect ) diff --git a/go.sum b/go.sum index 64ebd9d7af..ee7a28fa62 100644 --- a/go.sum +++ b/go.sum @@ -32,8 +32,8 @@ cloud.google.com/go v0.100.2/go.mod h1:4Xra9TjzAeYHrl5+oeLlzbM2k3mjVhZh4UqTZ//w9 cloud.google.com/go v0.102.0/go.mod h1:oWcCzKlqJ5zgHQt9YsaeTY9KzIvjyy0ArmiBUgpQ+nc= cloud.google.com/go v0.102.1/go.mod h1:XZ77E9qnTEnrgEOvr4xzfdX5TRo7fB4T2F4O6+34hIU= cloud.google.com/go v0.104.0/go.mod 
h1:OO6xxXdJyvuJPcEPBLN9BJPD+jep5G1+2U5B5gkRYtA= -cloud.google.com/go v0.110.7 h1:rJyC7nWRg2jWGZ4wSJ5nY65GTdYJkg0cd/uXb+ACI6o= -cloud.google.com/go v0.110.7/go.mod h1:+EYjdK8e5RME/VY/qLCAtuyALQ9q67dvuum8i+H5xsI= +cloud.google.com/go v0.110.8 h1:tyNdfIxjzaWctIiLYOTalaLKZ17SI44SKFW26QbOhME= +cloud.google.com/go v0.110.8/go.mod h1:Iz8AkXJf1qmxC3Oxoep8R1T36w8B92yU29PcBhHO5fk= cloud.google.com/go/aiplatform v1.22.0/go.mod h1:ig5Nct50bZlzV6NvKaTwmplLLddFx0YReh9WfTO5jKw= cloud.google.com/go/aiplatform v1.24.0/go.mod h1:67UUvRBKG6GTayHKV8DBv2RtR1t93YRu5B1P3x99mYY= cloud.google.com/go/analytics v0.11.0/go.mod h1:DjEWCu41bVbYcKyvlws9Er60YE4a//bK6mnhWvQeFNI= @@ -111,8 +111,8 @@ cloud.google.com/go/gkehub v0.10.0/go.mod h1:UIPwxI0DsrpsVoWpLB0stwKCP+WFVG9+y97 cloud.google.com/go/grafeas v0.2.0/go.mod h1:KhxgtF2hb0P191HlY5besjYm6MqTSTj3LSI+M+ByZHc= cloud.google.com/go/iam v0.3.0/go.mod h1:XzJPvDayI+9zsASAFO68Hk07u3z+f+JrT2xXNdp4bnY= cloud.google.com/go/iam v0.5.0/go.mod h1:wPU9Vt0P4UmCux7mqtRu6jcpPAb74cP1fh50J3QpkUc= -cloud.google.com/go/iam v1.1.1 h1:lW7fzj15aVIXYHREOqjRBV9PsH0Z6u8Y46a1YGvQP4Y= -cloud.google.com/go/iam v1.1.1/go.mod h1:A5avdyVL2tCppe4unb0951eI9jreack+RJ0/d+KUZOU= +cloud.google.com/go/iam v1.1.2 h1:gacbrBdWcoVmGLozRuStX45YKvJtzIjJdAolzUs1sm4= +cloud.google.com/go/iam v1.1.2/go.mod h1:A5avdyVL2tCppe4unb0951eI9jreack+RJ0/d+KUZOU= cloud.google.com/go/language v1.4.0/go.mod h1:F9dRpNFQmJbkaop6g0JhSBXCNlO90e1KWx5iDdxbWic= cloud.google.com/go/language v1.6.0/go.mod h1:6dJ8t3B+lUYfStgls25GusK04NLh3eDLQnWM3mdEbhI= cloud.google.com/go/lifesciences v0.5.0/go.mod h1:3oIKy8ycWGPUyZDR/8RNnTOYevhaMLqh5vLUXs9zvT8= @@ -259,9 +259,9 @@ github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 h1:+zs/tPmkDkHx3U66D github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376/go.mod h1:an3vInlBmSxCcxctByoQdvwPiA7DTK7jaaFDBTtu0ic= github.com/go-git/go-billy/v5 v5.5.0 h1:yEY4yhzCDuMGSv83oGxiBotRzhwhNr8VZyphhiu+mTU= github.com/go-git/go-billy/v5 v5.5.0/go.mod h1:hmexnoNsr2SJU1Ju67OaNz5ASJY3+sHgFRpCtpDCKow= -github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20230305113008-0c11038e723f h1:Pz0DHeFij3XFhoBRGUDPzSJ+w2UcK5/0JvF8DRI58r8= -github.com/go-git/go-git/v5 v5.9.0 h1:cD9SFA7sHVRdJ7AYck1ZaAa/yeuBvGPxwXDL8cxrObY= -github.com/go-git/go-git/v5 v5.9.0/go.mod h1:RKIqga24sWdMGZF+1Ekv9kylsDz6LzdTSI2s/OsZWE0= +github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399 h1:eMje31YglSBqCdIqdhKBW8lokaMrL3uTkpGYlE2OOT4= +github.com/go-git/go-git/v5 v5.10.0 h1:F0x3xXrAWmhwtzoCokU4IMPcBdncG+HAAqi9FcOOjbQ= +github.com/go-git/go-git/v5 v5.10.0/go.mod h1:1FOZ/pQnqw24ghP2n7cunVl0ON55BsjPYvhWHvZGhoo= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= @@ -318,8 +318,9 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.7/go.mod h1:n+brtR0CgQNWTVd5ZUFpTBC8YFBDLK/h/bpaJ8/DtOE= github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.6.0 
h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/martian v2.1.0+incompatible h1:/CP5g8u/VJHijgedC/Legn3BAbAaWPgecwXBIDzw5no= github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/martian/v3 v3.0.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= @@ -369,8 +370,8 @@ github.com/googleapis/google-cloud-go-testing v0.0.0-20200911160855-bcd43fbb19e8 github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= -github.com/hashicorp/go-getter v1.7.2 h1:uJDtyXwEfalmp1PqdxuhZqrNkUyClZAhVeZYTArbqkg= -github.com/hashicorp/go-getter v1.7.2/go.mod h1:W7TalhMmbPmsSMdNjD0ZskARur/9GJ17cfHTRtXV744= +github.com/hashicorp/go-getter v1.7.3 h1:bN2+Fw9XPFvOCjB0UOevFIMICZ7G2XSQHzfvLUyOM5E= +github.com/hashicorp/go-getter v1.7.3/go.mod h1:W7TalhMmbPmsSMdNjD0ZskARur/9GJ17cfHTRtXV744= github.com/hashicorp/go-safetemp v1.0.0 h1:2HR189eFNrjHQyENnQMMpCiBAsRxzbTMIgBhEyExpmo= github.com/hashicorp/go-safetemp v1.0.0/go.mod h1:oaerMy3BhqiTbVye6QuFhFtIceqFoDHxNAB65b+Rj1I= github.com/hashicorp/go-version v1.6.0 h1:feTTfFNnjP967rlCxM/I9g701jU+RN74YKx2mOkIeek= @@ -380,8 +381,8 @@ github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ github.com/hashicorp/hc-install v0.6.0 h1:fDHnU7JNFNSQebVKYhHZ0va1bC6SrPQ8fpebsvNr2w4= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= -github.com/hashicorp/hcl/v2 v2.18.0 h1:wYnG7Lt31t2zYkcquwgKo6MWXzRUDIeIVU5naZwHLl8= -github.com/hashicorp/hcl/v2 v2.18.0/go.mod h1:ThLC89FV4p9MPW804KVbe/cEXoQ8NZEh+JtMeeGErHE= +github.com/hashicorp/hcl/v2 v2.19.1 h1://i05Jqznmb2EXqa39Nsvyan2o5XyMowW5fnCKW5RPI= +github.com/hashicorp/hcl/v2 v2.19.1/go.mod h1:ThLC89FV4p9MPW804KVbe/cEXoQ8NZEh+JtMeeGErHE= github.com/hashicorp/terraform-config-inspect v0.0.0-20221020162138-81db043ad408 h1:dol/gV6vq/QBI1lGTxUEUGr8ixcs4SU79lgCoRMg3pU= github.com/hashicorp/terraform-config-inspect v0.0.0-20221020162138-81db043ad408/go.mod h1:EAaqp5h9PsUNr6NtgLj31w+ElcCEL+1Svw1Jw+MTVKU= github.com/hashicorp/terraform-exec v0.19.0 h1:FpqZ6n50Tk95mItTSS9BjeOVUb4eg81SpgVtZNNtFSM= @@ -482,8 +483,8 @@ github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9dec github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/zclconf/go-cty v1.2.0/go.mod h1:hOPWgoHbaTUnI5k4D2ld+GRpFJSCe6bCM7m1q/N4PQ8= -github.com/zclconf/go-cty v1.14.0 h1:/Xrd39K7DXbHzlisFP9c4pHao4yyf+/Ug9LEz+Y/yhc= -github.com/zclconf/go-cty v1.14.0/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= +github.com/zclconf/go-cty v1.14.1 h1:t9fyA35fwjjUMcmL5hLER+e/rEPqrbCK1/OSE4SI9KA= +github.com/zclconf/go-cty v1.14.1/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b h1:FosyBZYxY34Wul7O/MSKey3txpPYyCqVO5ZyceuQJEI= github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b/go.mod h1:ZRKQfBXbGkpdV6QMzT3rU1kSTAnfu1dO8dPKjYprgj8= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= @@ -507,8 +508,8 @@ 
golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0 golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.3.1-0.20221117191849-2c476679df9a/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= -golang.org/x/crypto v0.13.0 h1:mvySKfSWJ+UKUii46M40LOvyWfN0s2U+46/jDd0e6Ck= -golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc= +golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -603,8 +604,8 @@ golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= -golang.org/x/net v0.15.0 h1:ugBLEUaxABaB5AJqW9enI0ACdci2RUd4eP51NTBvuJ8= -golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -630,8 +631,8 @@ golang.org/x/oauth2 v0.0.0-20220822191816-0ebed06d0094/go.mod h1:h4gKUeWbJ4rQPri golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.1.0/go.mod h1:G9FE4dLTsbXUu90h/Pf85g4w1D+SSAgR+q46nJZ8M4A= -golang.org/x/oauth2 v0.12.0 h1:smVPGxink+n1ZI5pkQa8y6fZT0RW0MgCO5bFpepy4B4= -golang.org/x/oauth2 v0.12.0/go.mod h1:A74bZ3aGXgCY0qaIC9Ahg6Lglin4AMAco8cIv9baba4= +golang.org/x/oauth2 v0.13.0 h1:jDDenyj+WgFtmV3zYVoi8aE2BwtXFLWOA67ZfNWftiY= +golang.org/x/oauth2 v0.13.0/go.mod h1:/JMhi4ZRXAf4HG9LiNmxvk+45+96RUlVThiH8FzNBn0= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -647,8 +648,8 @@ golang.org/x/sync v0.0.0-20220601150217-0de741cfad7f/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= -golang.org/x/sync 
v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.4.0 h1:zxkM55ReGkDlKSM+Fu41A+zmbZuaPVbGMzvvdUPznYQ= +golang.org/x/sync v0.4.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -721,15 +722,15 @@ golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= -golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= -golang.org/x/term v0.12.0 h1:/ZfYdc3zq+q02Rv9vGqTeSItdzZTSNDmfTi0mBAuidU= +golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -860,8 +861,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.144.0 h1:01xgplvIwdMpnrlenPHMgRAAgAH9N5Zv21Qu6XwJxSU= -google.golang.org/api v0.144.0/go.mod h1:OARJqIfoYjXJj4C1AiBSXYZt03qsoz8FQYU6fBEfrHM= +google.golang.org/api v0.148.0 h1:HBq4TZlN4/1pNcu0geJZ/Q50vIwIXT532UIMYoo0vOs= +google.golang.org/api v0.148.0/go.mod h1:8/TBgwaKjfqTdacOJrOv2+2Q6fBDU1uHKK06oGSkxzU= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -973,12 +974,12 @@ google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqw google.golang.org/genproto v0.0.0-20221014173430-6e2ab493f96b/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221014213838-99cd37c6964a/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71/go.mod 
h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s= -google.golang.org/genproto v0.0.0-20230913181813-007df8e322eb h1:XFBgcDwm7irdHTbz4Zk2h7Mh+eis4nfJEFQFYzJzuIA= -google.golang.org/genproto v0.0.0-20230913181813-007df8e322eb/go.mod h1:yZTlhN0tQnXo3h00fuXNCxJdLdIdnVFVBaRJ5LWBbw4= -google.golang.org/genproto/googleapis/api v0.0.0-20230913181813-007df8e322eb h1:lK0oleSc7IQsUxO3U5TjL9DWlsxpEBemh+zpB7IqhWI= -google.golang.org/genproto/googleapis/api v0.0.0-20230913181813-007df8e322eb/go.mod h1:KjSP20unUpOx5kyQUFa7k4OJg0qeJ7DEZflGDu2p6Bk= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230920204549-e6e6cdab5c13 h1:N3bU/SQDCDyD6R528GJ/PwW9KjYcJA3dgyH+MovAkIM= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230920204549-e6e6cdab5c13/go.mod h1:KSqppvjFjtoCI+KGd4PELB0qLNxdJHRGqRI09mB6pQA= +google.golang.org/genproto v0.0.0-20231002182017-d307bd883b97 h1:SeZZZx0cP0fqUyA+oRzP9k7cSwJlvDFiROO72uwD6i0= +google.golang.org/genproto v0.0.0-20231002182017-d307bd883b97/go.mod h1:t1VqOqqvce95G3hIDCT5FeO3YUc6Q4Oe24L/+rNMxRk= +google.golang.org/genproto/googleapis/api v0.0.0-20231002182017-d307bd883b97 h1:W18sezcAYs+3tDZX4F80yctqa12jcP1PUS2gQu1zTPU= +google.golang.org/genproto/googleapis/api v0.0.0-20231002182017-d307bd883b97/go.mod h1:iargEX0SFPm3xcfMI0d1domjg0ZF4Aa0p2awqyxhvF0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20231012201019-e917dd12ba7a h1:a2MQQVoTo96JC9PMGtGBymLp7+/RzpFc2yX/9WfFg1c= +google.golang.org/genproto/googleapis/rpc v0.0.0-20231012201019-e917dd12ba7a/go.mod h1:4cYg8o5yUbm77w8ZX00LhMVNl/YVBFJRYWDc0uYWMs0= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -1014,8 +1015,8 @@ google.golang.org/grpc v1.48.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACu google.golang.org/grpc v1.49.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.1/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= -google.golang.org/grpc v1.58.2 h1:SXUpjxeVF3FKrTYQI4f4KvbGD5u2xccdYdurwowix5I= -google.golang.org/grpc v1.58.2/go.mod h1:tgX3ZQDlNJGU96V6yHh1T/JeoBQ2TXdr43YbYSsCJk0= +google.golang.org/grpc v1.58.3 h1:BjnpXut1btbtgN/6sp+brB2Kbm2LjNXnidYujAVbSoQ= +google.golang.org/grpc v1.58.3/go.mod h1:tgX3ZQDlNJGU96V6yHh1T/JeoBQ2TXdr43YbYSsCJk0= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= diff --git a/modules/README.md b/modules/README.md index b02cb65600..fbd4ef8964 100644 --- a/modules/README.md +++ b/modules/README.md @@ -39,6 +39,12 @@ Modules that are still in development and less stable are labeled with the Creates a partition to be used by a [slurm-controller][schedmd-slurm-gcp-v5-controller]. * **[schedmd-slurm-gcp-v5-node-group]** ![community-badge] : Creates a node group to be used by the [schedmd-slurm-gcp-v5-partition] module. +* **[schedmd-slurm-gcp-v6-partition]** ![community-badge] ![experimental-badge]: + Creates a partition to be used by a [slurm-controller][schedmd-slurm-gcp-v6-controller]. 
+* **[schedmd-slurm-gcp-v6-nodeset]** ![community-badge] ![experimental-badge]: + Creates a nodeset to be used by the [schedmd-slurm-gcp-v6-partition] module. +* **[schedmd-slurm-gcp-v6-nodeset-tpu]** ![community-badge] ![experimental-badge]: + Creates a TPU nodeset to be used by the [schedmd-slurm-gcp-v6-partition] module. * **[gke-node-pool]** ![community-badge] ![experimental-badge] : Creates a Kubernetes node pool using GKE. * **[gke-job-template]** ![community-badge] ![experimental-badge] : Creates a @@ -57,6 +63,9 @@ Modules that are still in development and less stable are labeled with the [schedmd-slurm-on-gcp-partition]: ../community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md [schedmd-slurm-gcp-v5-partition]: ../community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md [schedmd-slurm-gcp-v5-node-group]: ../community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md +[schedmd-slurm-gcp-v6-partition]: ../community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md +[schedmd-slurm-gcp-v6-nodeset]: ../community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +[schedmd-slurm-gcp-v6-nodeset-tpu]: ../community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md [htcondor-execute-point]: ../community/modules/compute/htcondor-execute-point/README.md [pbspro-execution]: ../community/modules/compute/pbspro-execution/README.md @@ -153,6 +162,10 @@ Modules that are still in development and less stable are labeled with the Creates a Slurm login node using [slurm-gcp-version-5]. * **[schedmd-slurm-gcp-v5-hybrid]** ![community-badge] ![experimental-badge] : Creates hybrid Slurm partition configuration files using [slurm-gcp-version-5]. +* **[schedmd-slurm-gcp-v6-controller]** ![community-badge] ![experimental-badge]: + Creates a Slurm controller node using [slurm-gcp-version-6]. +* **[schedmd-slurm-gcp-v6-login]** ![community-badge] ![experimental-badge]: + Creates a Slurm login node using [slurm-gcp-version-6]. * **[htcondor-setup]** ![community-badge] ![experimental-badge] : Creates the base infrastructure for an HTCondor pool (service accounts and Cloud Storage bucket). 
* **[htcondor-pool-secrets]** ![community-badge] ![experimental-badge] : Creates @@ -176,6 +189,8 @@ Modules that are still in development and less stable are labeled with the [htcondor-setup]: ../community/modules/scheduler/htcondor-setup/README.md [htcondor-pool-secrets]: ../community/modules/scheduler/htcondor-pool-secrets/README.md [htcondor-access-point]: ../community/modules/scheduler/htcondor-access-point/README.md +[schedmd-slurm-gcp-v6-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +[schedmd-slurm-gcp-v6-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md [schedmd-slurm-gcp-v5-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md [schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md [schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -183,6 +198,7 @@ Modules that are still in development and less stable are labeled with the [schedmd-slurm-on-gcp-login-node]: ../community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md [slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/v4.2.1 [slurm-gcp-version-5]: https://github.com/SchedMD/slurm-gcp/tree/5.9.1 +[slurm-gcp-version-6]: https://github.com/SchedMD/slurm-gcp/tree/6.2.0 [pbspro-client]: ../community/modules/scheduler/pbspro-client/README.md [pbspro-server]: ../community/modules/scheduler/pbspro-server/README.md @@ -259,11 +275,15 @@ repository: * Filesystem paths * modules embedded in the `ghpc` executable * modules in the local filesystem -* Remote modules hosted on github.com or any `git::` repository - * when modules are in a subdirectory of the git repository, a special - double-slash "//" notation can be required as described below - -An important distinction is that git URLs are natively supported by Terraform so +* Remote modules using [Terraform URL syntax](https://developer.hashicorp.com/terraform/language/modules/sources) + * Hosted on [GitHub](https://developer.hashicorp.com/terraform/language/modules/sources#github) + * Google Cloud Storage [Buckets](https://developer.hashicorp.com/terraform/language/modules/sources#gcs-bucket) + * Generic [git repositories](https://developer.hashicorp.com/terraform/language/modules/sources#generic-git-repository) + + when modules are in a subdirectory of the git repository, a special + double-slash `//` notation can be required as described below + +An important distinction is that those URLs are natively supported by Terraform so they are not copied to your deployment directory. Packer does not have native support for git-hosted modules so the Toolkit will copy these modules into the deployment folder on your behalf. @@ -395,6 +415,13 @@ are supported, `git::https://` for HTTPS or `git::git@github.com` for SSH. Additional formatting and features after `git::` are identical to that of the [GitHub Modules](#github-modules) described above. +##### Google Cloud Storage Modules + +To use a Terraform module available in a Google Cloud Storage bucket, set the source +to a URL with the special `gcs::` prefix, followed by a [GCS bucket object URL](https://cloud.google.com/storage/docs/request-endpoints#typical). + +For example: `gcs::https://www.googleapis.com/storage/v1/BUCKET_NAME/PATH_TO_MODULE` + ### Kind (May be Required) `kind` refers to the way in which a module is deployed. 
Currently, `kind` can be
diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md
index 57de619c7a..799b3953c7 100644
--- a/modules/compute/vm-instance/README.md
+++ b/modules/compute/vm-instance/README.md
@@ -83,19 +83,20 @@ Use the following settings for compact placement:
       collocation: "COLLOCATED"
 ```
 
-When `vm_count` is not set, as shown in the example above, then the VMs will be
-added to the placement policy incrementally. This is the **recommended way** to
-use placement policies.
-
-If `vm_count` is specified then VMs will stay in pending state until the
-specified number of VMs are created. See the warning below if using this field.
+By default, the above placement policy will always result in the most compact set
+of VMs available. If you would like provisioning to fail when some level of
+compactness is not obtainable, you can enforce this with the [`max_distance`
+setting](https://cloud.google.com/compute/docs/instances/use-compact-placement-policies):
 
-> **Warning** When creating a compact placement with more than 10 VMs, you must
-> add `-parallelism=` argument on apply. For example if you have 15 VMs in a
-> placement group: `terraform apply -parallelism=15`. This is because terraform
-> self limits to 10 parallel requests by default but the create instance
-> requests will not succeed until all VMs in the placement group have been
-> requested, forming a deadlock.
+```yaml
+  ...
+  settings:
+    instance_count: 4
+    machine_type: c2-standard-60
+    placement_policy:
+      collocation: "COLLOCATED"
+      max_distance: 1
+```
 
 Use the following settings for spread placement:
@@ -108,10 +109,20 @@ Use the following settings for spread placement:
     availability_domain_count: 2
 ```
 
-> **_NOTE:_** Due to
-> [this open issue](https://github.com/hashicorp/terraform-provider-google/issues/11483),
-> it may be required to specify the `vm_count`. Once this issue is resolved,
-> `vm_count` will no longer be mandatory.
+When `vm_count` is not set, as shown in the examples above, the VMs will be
+added to the placement policy incrementally. This is the **recommended way** to
+use placement policies.
+
+If `vm_count` is specified, VMs will stay in a pending state until the
+specified number of VMs have been created. See the warning below if using this field.
+
+> [!WARNING]
+> When creating a compact placement using `vm_count` with more than 10 VMs, you
+> must add the `-parallelism=` argument on apply. For example, if you have 15 VMs
+> in a placement group: `terraform apply -parallelism=15`. This is because
+> Terraform self-limits to 10 parallel requests by default, but the create
+> instance requests will not succeed until all VMs in the placement group have
+> been requested, forming a deadlock.
 
 ### GPU Support
 
@@ -174,7 +185,7 @@ limitations under the License.
| Name | Source | Version | |------|--------|---------| -| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.22.1 | +| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 | ## Resources diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index 88a81fbefb..4f0c0af38e 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -89,13 +89,14 @@ locals { data "google_compute_image" "compute_image" { family = try(var.instance_image.family, null) name = try(var.instance_image.name, null) - project = var.instance_image.project + project = try(var.instance_image.project, null) } resource "null_resource" "image" { triggers = { - image = var.instance_image.family, - project = var.instance_image.project + name = try(var.instance_image.name, null), + family = try(var.instance_image.family, null), + project = try(var.instance_image.project, null) } } diff --git a/modules/compute/vm-instance/startup_from_network_storage.tf b/modules/compute/vm-instance/startup_from_network_storage.tf index 6a9d4a7c94..4b04405535 100644 --- a/modules/compute/vm-instance/startup_from_network_storage.tf +++ b/modules/compute/vm-instance/startup_from_network_storage.tf @@ -55,7 +55,7 @@ locals { } module "netstorage_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.22.1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4" labels = local.labels project_id = var.project_id diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 3af9141cad..addcd26130 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.25.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.25.0" } required_version = ">= 1.2.0" diff --git a/modules/file-system/filestore/scripts/install-nfs-client.sh b/modules/file-system/filestore/scripts/install-nfs-client.sh index 6c49163eb2..7ce1a59ed3 100644 --- a/modules/file-system/filestore/scripts/install-nfs-client.sh +++ b/modules/file-system/filestore/scripts/install-nfs-client.sh @@ -28,7 +28,7 @@ if [ ! 
"$(which mount.nfs)" ]; then fi yum install --disablerepo="*" --enablerepo=${enable_repo} -y nfs-utils elif [ -f /etc/debian_version ] || grep -qi ubuntu /etc/lsb-release || grep -qi ubuntu /etc/os-release; then - apt-get -y update + apt-get update --allow-releaseinfo-change-origin --allow-releaseinfo-change-label apt-get -y install nfs-common else echo 'Unsuported distribution' diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index ce47253878..97d4a19ab6 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.25.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.25.0" } required_version = ">= 0.14.0" diff --git a/modules/file-system/pre-existing-network-storage/scripts/install-gcs-fuse.sh b/modules/file-system/pre-existing-network-storage/scripts/install-gcs-fuse.sh index 04498fd849..f8a990260b 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/install-gcs-fuse.sh +++ b/modules/file-system/pre-existing-network-storage/scripts/install-gcs-fuse.sh @@ -32,11 +32,11 @@ EOF elif [ -f /etc/debian_version ] || grep -qi ubuntu /etc/lsb-release || grep -qi ubuntu /etc/os-release; then RELEASE=$(lsb_release -c -s) export GCSFUSE_REPO="gcsfuse-${RELEASE}" - echo "deb http://packages.cloud.google.com/apt $GCSFUSE_REPO main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list + echo "deb https://packages.cloud.google.com/apt $GCSFUSE_REPO main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - - sudo apt-get update - sudo apt-get -y install gcsfuse + apt-get update --allow-releaseinfo-change-origin --allow-releaseinfo-change-label + apt-get -y install gcsfuse else echo 'Unsuported distribution' return 1 diff --git a/modules/file-system/pre-existing-network-storage/scripts/install-nfs-client.sh b/modules/file-system/pre-existing-network-storage/scripts/install-nfs-client.sh index 6c49163eb2..7ce1a59ed3 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/install-nfs-client.sh +++ b/modules/file-system/pre-existing-network-storage/scripts/install-nfs-client.sh @@ -28,7 +28,7 @@ if [ ! 
"$(which mount.nfs)" ]; then fi yum install --disablerepo="*" --enablerepo=${enable_repo} -y nfs-utils elif [ -f /etc/debian_version ] || grep -qi ubuntu /etc/lsb-release || grep -qi ubuntu /etc/os-release; then - apt-get -y update + apt-get update --allow-releaseinfo-change-origin --allow-releaseinfo-change-label apt-get -y install nfs-common else echo 'Unsuported distribution' diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index 1a8e5099a6..a6e68b2c88 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.25.0" } required_version = ">= 0.14.0" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index 88bda1cddc..3c3755c4e3 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.25.0" } required_version = ">= 0.14.0" diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index f4e9833321..ea41597231 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -135,7 +135,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| | [instance\_template](#module\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 8.0 | -| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.22.1 | +| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 | ## Resources diff --git a/modules/scheduler/batch-job-template/startup_from_network_storage.tf b/modules/scheduler/batch-job-template/startup_from_network_storage.tf index 6a9d4a7c94..4b04405535 100644 --- a/modules/scheduler/batch-job-template/startup_from_network_storage.tf +++ b/modules/scheduler/batch-job-template/startup_from_network_storage.tf @@ -55,7 +55,7 @@ locals { } module "netstorage_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.22.1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4" labels = local.labels project_id = var.project_id diff --git a/modules/scheduler/batch-login-node/README.md b/modules/scheduler/batch-login-node/README.md index 2d552914f6..b8edc77b98 100644 --- a/modules/scheduler/batch-login-node/README.md +++ b/modules/scheduler/batch-login-node/README.md @@ -89,7 +89,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [login\_startup\_script](#module\_login\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.22.1 | +| [login\_startup\_script](#module\_login\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 | ## Resources diff --git a/modules/scheduler/batch-login-node/main.tf b/modules/scheduler/batch-login-node/main.tf index 7c890cc048..0adfd0f2fb 100644 --- a/modules/scheduler/batch-login-node/main.tf +++ b/modules/scheduler/batch-login-node/main.tf @@ -94,7 +94,7 @@ locals { } module "login_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.22.1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4" labels = local.labels project_id = var.project_id deployment_name = var.deployment_name diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf index 60a9ee5766..0359c151b6 100644 --- a/modules/scheduler/batch-login-node/versions.tf +++ b/modules/scheduler/batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.25.0" } required_version = ">= 0.14.0" diff --git a/modules/scripts/startup-script/files/install_ansible.sh b/modules/scripts/startup-script/files/install_ansible.sh index 11f675850d..6665c7f2a7 100644 --- a/modules/scripts/startup-script/files/install_ansible.sh +++ b/modules/scripts/startup-script/files/install_ansible.sh @@ -15,6 +15,8 @@ REQ_ANSIBLE_VERSION=2.11 REQ_ANSIBLE_PIP_VERSION=4.10.0 +REQ_PIP_WHEEL_VERSION=0.37.1 +REQ_PIP_SETUPTOOLS_VERSION=59.6.0 REQ_PIP_MAJOR_VERSION=21 REQ_PYTHON3_VERSION=6 @@ -39,7 +41,8 @@ apt_wait() { install_python_deps() { if [ -f /etc/debian_version ] || grep -qi ubuntu /etc/lsb-release 2>/dev/null || grep -qi ubuntu /etc/os-release 2>/dev/null; then - apt-get install -y python3-distutils + apt-get update --allow-releaseinfo-change-origin --allow-releaseinfo-change-label + apt-get install -y python3-distutils python3-venv fi } @@ -81,7 +84,7 @@ install_python3_yum() { echo "Unsupported version of centos/RHEL/Rocky" return 1 fi - yum install --disablerepo="*" --enablerepo="${enable_repo}" -y python3 python3-pip + yum install --disablerepo="*" --enablerepo="${enable_repo}" -y python3 python3-pip python3-venv python_path=$(rpm -ql python3 | grep 'bin/python3$') } @@ -89,7 +92,8 @@ install_python3_yum() { # newly installed packaged. install_python3_apt() { apt_wait - apt-get install -y python3 python3-distutils python3-pip + apt-get update --allow-releaseinfo-change-origin --allow-releaseinfo-change-label + apt-get install -y python3 python3-distutils python3-pip python3-venv python_path=$(command -v python3) } @@ -125,7 +129,7 @@ install_pip3_yum() { # Install python3 with the apt package manager. Updates python_path to the # newly installed packaged. install_pip3_apt() { - apt-get update + apt-get update --allow-releaseinfo-change-origin --allow-releaseinfo-change-label apt-get install -y python3-pip } @@ -170,17 +174,30 @@ main() { return 1 fi fi - pip_version=$(${python_path} -m pip --version | sed -nr 's/^pip ([0-9]+\.[0-9]+).*$/\1/p') - pip_major_version=$(echo "${pip_version}" | cut -d '.' 
-f 1) - if [ "${pip_major_version}" -lt "${REQ_PIP_MAJOR_VERSION}" ]; then - ${python_path} -m pip install --upgrade pip + + # Upgrade pip if necessary + # Only run if OS is not Debian 12 - Debian 12 does not allow for system-level pip installs + if [ ! -f /etc/debian_version ] || [ "$(lsb_release -a 2>/dev/null | sed -n 's/Release:\s\+\([0-9]\+\).\?.*/\1/p')" -ne "12" ]; then + pip_version=$(${python_path} -m pip --version | sed -nr 's/^pip ([0-9]+\.[0-9]+).*$/\1/p') + pip_major_version=$(echo "${pip_version}" | cut -d '.' -f 1) + if [ "${pip_major_version}" -lt "${REQ_PIP_MAJOR_VERSION}" ]; then + ${python_path} -m pip install --upgrade pip + fi fi # Create pip virtual environment for HPC Toolkit - ${python_path} -m pip install virtualenv - ${python_path} -m virtualenv "${venv_path}" + ${python_path} -m venv "${venv_path}" venv_python_path=${venv_path}/bin/python3 + # Upgrade pip if necessary + pip_version=$(${venv_python_path} -m pip --version | sed -nr 's/^pip ([0-9]+\.[0-9]+).*$/\1/p') + pip_major_version=$(echo "${pip_version}" | cut -d '.' -f 1) + if [ "${pip_major_version}" -lt "${REQ_PIP_MAJOR_VERSION}" ]; then + ${venv_python_path} -m pip install --upgrade pip + fi + + ${venv_python_path} -m pip install -U wheel==${REQ_PIP_WHEEL_VERSION} setuptools==${REQ_PIP_SETUPTOOLS_VERSION} + # configure ansible to always use correct Python binary if [ ! -f /etc/ansible/ansible.cfg ]; then mkdir /etc/ansible diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index 4278a1647f..66b7655a07 100644 --- a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.24.0" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.25.0" } required_version = ">= 0.14.0" diff --git a/pkg/config/config.go b/pkg/config/config.go index a8e3dd48a0..65dd2a9f63 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -40,10 +40,9 @@ const ( var errorMessages = map[string]string{ // config - "fileLoadError": "failed to read the input yaml", - "yamlUnmarshalError": "failed to parse the blueprint in %s, check YAML syntax for errors, err=%w", - "yamlMarshalError": "failed to export the configuration to a blueprint yaml file", - "fileSaveError": "failed to write the expanded yaml", + "fileLoadError": "failed to read the input yaml", + "yamlMarshalError": "failed to export the configuration to a blueprint yaml file", + "fileSaveError": "failed to write the expanded yaml", // expand "missingSetting": "a required setting is missing from a module", "invalidVar": "invalid variable definition in", @@ -384,7 +383,7 @@ func checkMovedModule(source string) error { func NewDeploymentConfig(configFilename string) (DeploymentConfig, YamlCtx, error) { bp, ctx, err := importBlueprint(configFilename) if err != nil { - return DeploymentConfig{}, YamlCtx{}, err + return DeploymentConfig{}, ctx, err } // if the validation level has been explicitly set to an invalid value // in YAML blueprint then silently default to validationError @@ -558,14 +557,15 @@ func isValidLabelValue(value string) bool { // DeploymentName returns the deployment_name from the config and does approperate checks.
func (bp *Blueprint) DeploymentName() (string, error) { + path := Root.Vars.Dot("deployment_name") + if !bp.Vars.Has("deployment_name") { - return "", InputValueError{ + return "", BpError{path, InputValueError{ inputKey: "deployment_name", cause: errorMessages["varNotFound"], - } + }} } - path := Root.Vars.Dot("deployment_name") v := bp.Vars.Get("deployment_name") if v.Type() != cty.String { return "", BpError{path, InputValueError{ diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 3a149446e9..881cd15825 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -61,10 +61,44 @@ func (dc *DeploymentConfig) expand() error { return err } + if err := validateInputsAllModules(dc.Config); err != nil { + return err + } + dc.Config.populateOutputs() return nil } +func validateInputsAllModules(bp Blueprint) error { + errs := Errors{} + for ig, g := range bp.DeploymentGroups { + for im, m := range g.Modules { + p := Root.Groups.At(ig).Modules.At(im) + errs.Add(validateModuleInputs(p, m, bp)) + } + } + return errs.OrNil() +} + +func validateModuleInputs(mp modulePath, m Module, bp Blueprint) error { + mi := m.InfoOrDie() + errs := Errors{} + for _, input := range mi.Inputs { + ip := mp.Settings.Dot(input.Name) + + if !m.Settings.Has(input.Name) { + if input.Required { + errs.At(ip, fmt.Errorf("%s: Module ID: %s Setting: %s", + errorMessages["missingSetting"], m.ID, input.Name)) + } + continue + } + + // TODO: Check set value and input dtypes convertibility + } + return errs.OrNil() +} + func (dc *DeploymentConfig) expandBackends() { // 1. DEFAULT: use TerraformBackend configuration (if supplied) in each // resource group @@ -275,16 +309,7 @@ func (bp Blueprint) applyGlobalVarsInModule(mod *Module) error { if bp.Vars.Has(input.Name) { ref := GlobalRef(input.Name) mod.Settings.Set(input.Name, ref.AsExpression().AsValue()) - continue - } - - if input.Required { - // It's not explicitly set, and not global is set - // Fail if no default has been set - return fmt.Errorf("%s: Module ID: %s Setting: %s", - errorMessages["missingSetting"], mod.ID, input.Name) } - // Default exists, the module will handle it } return nil } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index db413f1527..6405888def 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -345,11 +345,6 @@ func (s *MySuite) TestApplyGlobalVariables(c *C) { }}, }) - err := dc.applyGlobalVariables() - expectedErrorStr := fmt.Sprintf("%s: Module ID: %s Setting: gold", - errorMessages["missingSetting"], mod.ID) - c.Check(err, ErrorMatches, expectedErrorStr) - // Test no input, one required, exists in globals dc.Config.Vars.Set("gold", cty.StringVal("val")) c.Check(dc.applyGlobalVariables(), IsNil) @@ -360,8 +355,7 @@ func (s *MySuite) TestApplyGlobalVariables(c *C) { // Test one input, one required mod.Settings.Set(requiredVar.Name, cty.StringVal("val")) - err = dc.applyGlobalVariables() - c.Assert(err, IsNil) + c.Assert(dc.applyGlobalVariables(), IsNil) // Test one input, none required, exists in globals setTestModuleInfo(*mod, modulereader.ModuleInfo{ @@ -371,8 +365,7 @@ Required: false, }}, }) - err = dc.applyGlobalVariables() - c.Assert(err, IsNil) + c.Assert(dc.applyGlobalVariables(), IsNil) } func (s *MySuite) TestIsSimpleVariable(c *C) { diff --git a/pkg/config/path.go b/pkg/config/path.go index 220c1644c0..fef57d7b56 100644 --- a/pkg/config/path.go +++ b/pkg/config/path.go @@ -17,6 +17,9 @@ package config import ( "fmt" "reflect" + +
"github.com/pkg/errors" + "github.com/zclconf/go-cty/cty" ) // Path is unique identifier of a piece of configuration. @@ -56,6 +59,49 @@ func (p mapPath[E]) Dot(k string) E { return e } +// ctyPath is a specialization of Path that can be extended with cty.Path +type ctyPath struct{ basePath } + +// Cty builds a path chain that starts with p and each link corresponds to a step in cty.Path +// If any step in cty.Path is not supported, the path chain will be built up to that point. +// E.g. +// Root.Vars.Dot("alpha").Cty(cty.Path{}.IndexInt(6)) == "vars.alpha[6]" +func (p ctyPath) Cty(cp cty.Path) basePath { + cur := p.basePath + for _, s := range cp { + prev := cur + var nxt basePath + piece, err := ctyStepToString(s) + if err != nil { + return cur // fall back to longest path build up to this point + } + initPath(&nxt, &prev, piece) + cur = nxt + } + return cur +} + +func ctyStepToString(s cty.PathStep) (string, error) { + switch s := s.(type) { + case cty.GetAttrStep: + return fmt.Sprintf(".%s", s.Name), nil // equivalent to mapPath.Dot + case cty.IndexStep: + switch s.Key.Type() { + case cty.Number: + return fmt.Sprintf("[%s]", s.Key.AsBigFloat().String()), nil // equivalent to arrayPath.At + case cty.String: + return fmt.Sprintf(".%s", s.Key.AsString()), nil // equivalent to mapPath.Dot + default: + return "", errors.New("key value not number or string") + } + default: + return "", errors.Errorf("unknown cty.PathStep type: %#v", s) + } +} + +// initPath walks through all child paths of p and initializes them. E.g. +// initPath(&Root, nil, "") will trigger +// -> initPath(&Root.BlueprintName, &Root, "blueprint_name") func initPath(p any, prev any, piece string) { r := reflect.Indirect(reflect.ValueOf(p)) ty := reflect.TypeOf(p).Elem() @@ -98,7 +144,7 @@ type validatorCfgPath struct { Skip basePath `path:".skip"` } -type dictPath struct{ mapPath[basePath] } +type dictPath struct{ mapPath[ctyPath] } type backendPath struct { basePath @@ -115,12 +161,12 @@ type groupPath struct { type modulePath struct { basePath - Source basePath `path:".source"` - Kind basePath `path:".kind"` - ID basePath `path:".id"` - Use arrayPath[backendPath] `path:".use"` - Outputs arrayPath[outputPath] `path:".outputs"` - Settings dictPath `path:".settings"` + Source basePath `path:".source"` + Kind basePath `path:".kind"` + ID basePath `path:".id"` + Use arrayPath[basePath] `path:".use"` + Outputs arrayPath[outputPath] `path:".outputs"` + Settings dictPath `path:".settings"` } type outputPath struct { @@ -133,6 +179,9 @@ type outputPath struct { // Root is a starting point for creating a Blueprint Path var Root rootPath +// internalPath is to be used to report problems outside of Blueprint schema (e.g. 
YAML parsing error position) +var internalPath = mapPath[basePath]{basePath{nil, "__internal_path__"}} + func init() { initPath(&Root, nil, "") } diff --git a/pkg/config/path_test.go b/pkg/config/path_test.go index 0d1cb936e2..adc702f666 100644 --- a/pkg/config/path_test.go +++ b/pkg/config/path_test.go @@ -16,6 +16,8 @@ package config import ( "testing" + + "github.com/zclconf/go-cty/cty" ) func TestPath(t *testing.T) { @@ -42,6 +44,11 @@ func TestPath(t *testing.T) { {r.Validators.At(2).Inputs.Dot("zebra"), "validators[2].inputs.zebra"}, {r.Vars.Dot("red"), "vars.red"}, + {r.Vars.Dot("red").Cty(cty.Path{}), "vars.red"}, + {r.Vars.Dot("red").Cty(cty.Path{}.IndexInt(6)), "vars.red[6]"}, + {r.Vars.Dot("red").Cty(cty.Path{}.IndexInt(6).GetAttr("silver")), "vars.red[6].silver"}, + {r.Vars.Dot("red").Cty(cty.Path{}.IndexInt(6).IndexString("silver")), "vars.red[6].silver"}, + {r.Vars.Dot("red").Cty(cty.Path{}.IndexInt(6).Index(cty.True)), "vars.red[6]"}, // trim last piece as invalid {r.Groups.At(3), "deployment_groups[3]"}, {r.Groups.At(3).Name, "deployment_groups[3].group"}, @@ -65,6 +72,9 @@ func TestPath(t *testing.T) { {r.Backend.Type, "terraform_backend_defaults.type"}, {r.Backend.Configuration, "terraform_backend_defaults.configuration"}, {r.Backend.Configuration.Dot("goo"), "terraform_backend_defaults.configuration.goo"}, + + {internalPath, "__internal_path__"}, + {internalPath.Dot("a"), "__internal_path__.a"}, } for _, tc := range tests { t.Run(tc.want, func(t *testing.T) { @@ -82,12 +92,19 @@ func TestPathParent(t *testing.T) { want Path } r := Root + cp := cty.Path{} // empty cty.Path tests := []test{ {r, nil}, {r.Groups, r}, {r.Groups.At(3), r.Groups}, {r.Groups.At(3).Modules, r.Groups.At(3)}, {r.Vars.Dot("red"), r.Vars}, + {r.Vars.Dot("red").Cty(cp), r.Vars}, + {r.Vars.Dot("red").Cty(cp.IndexInt(6)), r.Vars.Dot("red")}, + {r.Vars.Dot("red").Cty(cp.IndexInt(6).IndexString("gg")), r.Vars.Dot("red").Cty(cp.IndexInt(6))}, + {r.Vars.Dot("red").Cty(cp.IndexInt(6).IndexString("gg").Index(cty.True)), r.Vars.Dot("red").Cty(cp.IndexInt(6))}, + {internalPath, nil}, + {internalPath.Dot("gold"), internalPath}, } for _, tc := range tests { t.Run(tc.p.String(), func(t *testing.T) { diff --git a/pkg/config/validate.go b/pkg/config/validate.go index b675472f7b..7e1b102ee4 100644 --- a/pkg/config/validate.go +++ b/pkg/config/validate.go @@ -49,19 +49,20 @@ func validateGlobalLabels(vars Dict) error { errs.At(p, errors.New("vars.labels cannot have more than 64 labels")) } for k, v := range labels.AsValueMap() { - // TODO: Use cty.Path to point to the specific label that is invalid + vp := p.Cty(cty.Path{}.IndexString(k)) if v.Type() != cty.String { - errs.At(p, errors.New("vars.labels must be a map of strings")) + errs.At(vp, errors.New("vars.labels must be a map of strings")) + continue } s := v.AsString() // Check that label names are valid if !isValidLabelName(k) { - errs.At(p, errors.Errorf("%s: '%s: %s'", errorMessages["labelNameReqs"], k, s)) + errs.At(vp, errors.Errorf("%s: '%s: %s'", errorMessages["labelNameReqs"], k, s)) } // Check that label values are valid if !isValidLabelValue(s) { - errs.At(p, errors.Errorf("%s: '%s: %s'", errorMessages["labelValueReqs"], k, s)) + errs.At(vp, errors.Errorf("%s: '%s: %s'", errorMessages["labelValueReqs"], k, s)) } } return errs.OrNil() diff --git a/pkg/config/yaml.go b/pkg/config/yaml.go index 78ab8157c2..b3c2077295 100644 --- a/pkg/config/yaml.go +++ b/pkg/config/yaml.go @@ -21,7 +21,9 @@ import ( "fmt" "os" "regexp" + "strconv" + 
"github.com/pkg/errors" "github.com/zclconf/go-cty/cty" "github.com/zclconf/go-cty/cty/gocty" ctyJson "github.com/zclconf/go-cty/cty/json" @@ -58,11 +60,24 @@ func importBlueprint(f string) (Blueprint, YamlCtx, error) { decoder := yaml.NewDecoder(bytes.NewReader(data)) decoder.KnownFields(true) + yamlCtx, err := NewYamlCtx(data) + if err != nil { // YAML parsing error + return Blueprint{}, yamlCtx, err + } + var bp Blueprint if err = decoder.Decode(&bp); err != nil { - return Blueprint{}, YamlCtx{}, fmt.Errorf(errorMessages["yamlUnmarshalError"], f, err) + errs := Errors{} + for i, yep := range parseYamlV3Error(err) { + path := internalPath.Dot(fmt.Sprintf("bp_schema_error_%d", i)) + if yep.pos.Line != 0 { + yamlCtx.pathToPos[yPath(path.String())] = yep.pos + } + errs.At(path, errors.New(yep.errMsg)) + } + return Blueprint{}, yamlCtx, errs } - return bp, NewYamlCtx(data), nil + return bp, yamlCtx, nil } // YamlCtx is a contextual information to render errors. @@ -120,38 +135,52 @@ func normalizeYamlNode(p yPath, n *yaml.Node) *yaml.Node { // NewYamlCtx creates a new YamlCtx from a given YAML data. // NOTE: The data should be a valid blueprint YAML (previously used to parse Blueprint), // this function will panic if it's not valid YAML and doesn't validate Blueprint structure. -func NewYamlCtx(data []byte) YamlCtx { +func NewYamlCtx(data []byte) (YamlCtx, error) { + var lines []string + sc := bufio.NewScanner(bytes.NewReader(data)) + for sc.Scan() { + lines = append(lines, sc.Text()) + } + var c nodeCapturer + m := map[yPath]Pos{} + + // error may happen if YAML is not valid, regardless of Blueprint schema if err := yaml.Unmarshal(data, &c); err != nil { - panic(err) // shouldn't happen - } - if c.n == nil { - return YamlCtx{} // empty + errs := Errors{} + for i, yep := range parseYamlV3Error(err) { + path := internalPath.Dot(fmt.Sprintf("yaml_error_%d", i)) + if yep.pos.Line != 0 { + m[yPath(path.String())] = yep.pos + } + errs.At(path, errors.New(yep.errMsg)) + } + return YamlCtx{m, lines}, errs } - m := map[yPath]Pos{} - var walk func(n *yaml.Node, p yPath) - walk = func(n *yaml.Node, p yPath) { + var walk func(n *yaml.Node, p yPath, posOf *yaml.Node) + walk = func(n *yaml.Node, p yPath, posOf *yaml.Node) { n = normalizeYamlNode(p, n) - m[p] = Pos{n.Line, n.Column} + if posOf == nil { // use position of node itself if posOf is not set + posOf = n + } + m[p] = Pos{posOf.Line, posOf.Column} + if n.Kind == yaml.MappingNode { for i := 0; i < len(n.Content); i += 2 { - walk(n.Content[i+1], p.Dot(n.Content[i].Value)) + // for mapping items use position of the key + walk(n.Content[i+1], p.Dot(n.Content[i].Value), n.Content[i]) } } else if n.Kind == yaml.SequenceNode { for i, c := range n.Content { - walk(c, p.At(i)) + walk(c, p.At(i), nil) } } } - walk(c.n, "") - - var lines []string - sc := bufio.NewScanner(bytes.NewReader(data)) - for sc.Scan() { - lines = append(lines, sc.Text()) + if c.n != nil { + walk(c.n, "", nil) } - return YamlCtx{m, lines} + return YamlCtx{m, lines}, nil } type nodeCapturer struct{ n *yaml.Node } @@ -220,7 +249,7 @@ func (y *YamlValue) unmarshalScalar(n *yaml.Node) error { } ty, err := gocty.ImpliedType(s) if err != nil { - return err + return fmt.Errorf("line %d: %w", n.Line, err) } if y.v, err = gocty.ToCtyValue(s, ty); err != nil { return err @@ -229,13 +258,15 @@ func (y *YamlValue) unmarshalScalar(n *yaml.Node) error { if l, is := IsYamlExpressionLiteral(y.v); is { // HCL literal var e Expression if e, err = ParseExpression(l); err != nil { - return err + // 
TODO: point to exact location within expression, see Diagnostic.Subject + return fmt.Errorf("line %d: %w", n.Line, err) } y.v = e.AsValue() } else if y.v.Type() == cty.String && hasVariable(y.v.AsString()) { // "simple" variable e, err := SimpleVarToExpression(y.v.AsString()) if err != nil { - return err + // TODO: point to exact location within expression, see Diagnostic.Subject + return fmt.Errorf("line %d: %w", n.Line, err) } y.v = e.AsValue() } @@ -305,3 +336,41 @@ func (d Dict) MarshalYAML() (interface{}, error) { } return g, nil } + +type yamlErrWithPos struct { + pos Pos + errMsg string +} + +// yaml.v3 errors are either a TypeError (a collection of error messages) or a single error message. +// Parse the error messages to extract a short error message and a position. +func parseYamlV3Error(err error) []yamlErrWithPos { + res := []yamlErrWithPos{} + switch err := err.(type) { + case *yaml.TypeError: + for _, s := range err.Errors { + res = append(res, parseYamlV3ErrorString(s)) + } + default: + res = append(res, parseYamlV3ErrorString(err.Error())) + } + + if len(res) == 0 { // should never happen + res = append(res, parseYamlV3ErrorString(err.Error())) + } + return res +} + +// parseYamlV3ErrorString attempts to extract a position and a concise error message from a yaml.v3 error message. +// yaml.v3 errors are unstructured, so string parsing is used to extract the information. +// If no position can be extracted, returns (Pos{}, s). +// Otherwise returns (Pos{Line: line_number}, error_message). +func parseYamlV3ErrorString(s string) yamlErrWithPos { + match := regexp.MustCompile(`^(yaml: )?(line (\d+): )?(.*)$`).FindStringSubmatch(s) + if match == nil { + return yamlErrWithPos{Pos{}, s} + } + lns, errMsg := match[3], match[4] + ln, _ := strconv.Atoi(lns) // Atoi returns 0 on error, which is fine here + return yamlErrWithPos{Pos{Line: ln}, errMsg} +} diff --git a/pkg/config/yaml_test.go b/pkg/config/yaml_test.go index 4e4dc41d97..f65f4d6cc3 100644 --- a/pkg/config/yaml_test.go +++ b/pkg/config/yaml_test.go @@ -86,59 +86,62 @@ terraform_backend_defaults: } tests := []test{ {Root, Pos{3, 1}}, - {Root.BlueprintName, Pos{3, 17}}, - {Root.GhpcVersion, Pos{5, 15}}, - {Root.Validators, Pos{8, 1}}, + {Root.BlueprintName, Pos{3, 1}}, + {Root.GhpcVersion, Pos{5, 1}}, + {Root.Validators, Pos{7, 1}}, {Root.Validators.At(0), Pos{8, 3}}, - {Root.Validators.At(0).Inputs, Pos{10, 5}}, - {Root.Validators.At(0).Inputs.Dot("spice"), Pos{10, 12}}, - {Root.Validators.At(0).Validator, Pos{8, 14}}, + {Root.Validators.At(0).Validator, Pos{8, 3}}, + {Root.Validators.At(0).Inputs, Pos{9, 3}}, + {Root.Validators.At(0).Inputs.Dot("spice"), Pos{10, 5}}, + {Root.Validators.At(1).Validator, Pos{11, 3}}, {Root.Validators.At(1), Pos{11, 3}}, - {Root.Validators.At(1).Skip, Pos{12, 9}}, - {Root.Validators.At(1).Validator, Pos{11, 14}}, - {Root.ValidationLevel, Pos{14, 19}}, - {Root.Vars, Pos{17, 3}}, - {Root.Vars.Dot("red"), Pos{17, 8}}, - {Root.Groups, Pos{20, 1}}, + {Root.Validators.At(1).Skip, Pos{12, 3}}, + {Root.ValidationLevel, Pos{14, 1}}, + {Root.Vars, Pos{16, 1}}, + {Root.Vars.Dot("red"), Pos{17, 3}}, + {Root.Groups, Pos{19, 1}}, {Root.Groups.At(0), Pos{20, 3}}, - {Root.Groups.At(0).Name, Pos{20, 10}}, + {Root.Groups.At(0).Name, Pos{20, 3}}, - {Root.Groups.At(0).Backend, Pos{22, 5}}, - {Root.Groups.At(0).Backend.Type, Pos{22, 11}}, - {Root.Groups.At(0).Backend.Configuration, Pos{24, 7}}, - {Root.Groups.At(0).Backend.Configuration.Dot("carrot"), Pos{24, 15}}, + {Root.Groups.At(0).Backend, Pos{21, 3}}, + {Root.Groups.At(0).Backend.Type,
Pos{22, 5}}, + {Root.Groups.At(0).Backend.Configuration, Pos{23, 5}}, + {Root.Groups.At(0).Backend.Configuration.Dot("carrot"), Pos{24, 7}}, - {Root.Groups.At(0).Modules, Pos{26, 3}}, + {Root.Groups.At(0).Modules, Pos{25, 3}}, {Root.Groups.At(0).Modules.At(0), Pos{26, 5}}, - {Root.Groups.At(0).Modules.At(0).ID, Pos{26, 9}}, - {Root.Groups.At(0).Modules.At(0).Source, Pos{27, 13}}, - {Root.Groups.At(0).Modules.At(0).Kind, Pos{28, 11}}, - {Root.Groups.At(0).Modules.At(0).Use, Pos{29, 10}}, + {Root.Groups.At(0).Modules.At(0).ID, Pos{26, 5}}, + {Root.Groups.At(0).Modules.At(0).Source, Pos{27, 5}}, + {Root.Groups.At(0).Modules.At(0).Kind, Pos{28, 5}}, + {Root.Groups.At(0).Modules.At(0).Use, Pos{29, 5}}, {Root.Groups.At(0).Modules.At(0).Use.At(0), Pos{29, 11}}, {Root.Groups.At(0).Modules.At(0).Use.At(1), Pos{29, 18}}, - {Root.Groups.At(0).Modules.At(0).Outputs, Pos{31, 5}}, + {Root.Groups.At(0).Modules.At(0).Outputs, Pos{30, 5}}, {Root.Groups.At(0).Modules.At(0).Outputs.At(0), Pos{31, 7}}, {Root.Groups.At(0).Modules.At(0).Outputs.At(0).Name, Pos{31, 7}}, // synthetic {Root.Groups.At(0).Modules.At(0).Outputs.At(1), Pos{32, 7}}, - {Root.Groups.At(0).Modules.At(0).Outputs.At(1).Name, Pos{32, 13}}, - {Root.Groups.At(0).Modules.At(0).Outputs.At(1).Description, Pos{33, 20}}, - {Root.Groups.At(0).Modules.At(0).Outputs.At(1).Sensitive, Pos{34, 18}}, - {Root.Groups.At(0).Modules.At(0).Settings, Pos{36, 7}}, - {Root.Groups.At(0).Modules.At(0).Settings.Dot("dijon"), Pos{36, 14}}, + {Root.Groups.At(0).Modules.At(0).Outputs.At(1).Name, Pos{32, 7}}, + {Root.Groups.At(0).Modules.At(0).Outputs.At(1).Description, Pos{33, 7}}, + {Root.Groups.At(0).Modules.At(0).Outputs.At(1).Sensitive, Pos{34, 7}}, + {Root.Groups.At(0).Modules.At(0).Settings, Pos{35, 5}}, + {Root.Groups.At(0).Modules.At(0).Settings.Dot("dijon"), Pos{36, 7}}, {Root.Groups.At(1), Pos{38, 3}}, - {Root.Groups.At(1).Name, Pos{38, 10}}, - {Root.Groups.At(1).Modules, Pos{40, 3}}, + {Root.Groups.At(1).Name, Pos{38, 3}}, + {Root.Groups.At(1).Modules, Pos{39, 3}}, {Root.Groups.At(1).Modules.At(0), Pos{40, 5}}, - {Root.Groups.At(1).Modules.At(0).ID, Pos{40, 9}}, + {Root.Groups.At(1).Modules.At(0).ID, Pos{40, 5}}, {Root.Groups.At(1).Modules.At(1), Pos{41, 5}}, - {Root.Groups.At(1).Modules.At(1).ID, Pos{41, 9}}, + {Root.Groups.At(1).Modules.At(1).ID, Pos{41, 5}}, - {Root.Backend, Pos{44, 3}}, - {Root.Backend.Type, Pos{44, 9}}, + {Root.Backend, Pos{43, 1}}, + {Root.Backend.Type, Pos{44, 3}}, } - ctx := NewYamlCtx([]byte(data)) + ctx, err := NewYamlCtx([]byte(data)) + if err != nil { + t.Fatal(err) + } for _, tc := range tests { t.Run(tc.path.String(), func(t *testing.T) { got, ok := ctx.Pos(tc.path) diff --git a/pkg/modulereader/metadata.go b/pkg/modulereader/metadata.go new file mode 100644 index 0000000000..4935378f79 --- /dev/null +++ b/pkg/modulereader/metadata.go @@ -0,0 +1,78 @@ +// Copyright 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
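For reviewers who want to see the position-of-the-key technique from NewYamlCtx in isolation, here is a minimal standalone sketch (illustrative only, not part of this diff; the pos type and dotted path strings are stand-ins for the real yPath/Pos machinery):

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

type pos struct{ Line, Column int }

// walk mirrors the approach above: a mapping value is recorded at the
// position of its key, so errors point at "kale:" rather than at "dos".
func walk(n *yaml.Node, path string, posOf *yaml.Node, m map[string]pos) {
	if posOf == nil { // use the node's own position unless a key node was supplied
		posOf = n
	}
	m[path] = pos{posOf.Line, posOf.Column}
	if n.Kind == yaml.MappingNode {
		for i := 0; i < len(n.Content); i += 2 {
			// Content holds alternating key/value nodes; pass the key as posOf
			walk(n.Content[i+1], path+"."+n.Content[i].Value, n.Content[i], m)
		}
	}
	// the real implementation also descends into sequence nodes
}

func main() {
	var doc yaml.Node
	if err := yaml.Unmarshal([]byte("vars:\n  kale: dos\n"), &doc); err != nil {
		panic(err)
	}
	m := map[string]pos{}
	walk(doc.Content[0], "", nil, m) // doc is a DocumentNode; Content[0] is the mapping
	fmt.Println(m[".vars.kale"])     // {2 3}: the position of the "kale" key
}

This matches the updated expectations in yaml_test.go above, where e.g. Root.Vars.Dot("red") now resolves to the column of the key rather than of the value.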
diff --git a/pkg/modulereader/metadata.go b/pkg/modulereader/metadata.go
new file mode 100644
index 0000000000..4935378f79
--- /dev/null
+++ b/pkg/modulereader/metadata.go
@@ -0,0 +1,78 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package modulereader
+
+import (
+	"errors"
+	"hpc-toolkit/pkg/sourcereader"
+	"os"
+	"path/filepath"
+
+	"gopkg.in/yaml.v3"
+)
+
+// Metadata corresponds to BlueprintMetadata in the CFT schema
+// See https://github.com/GoogleCloudPlatform/cloud-foundation-toolkit/blob/master/cli/bpmetadata/schema/gcp-blueprint-metadata.json#L278
+type Metadata struct {
+	Spec MetadataSpec `yaml:"spec"`
+}
+
+// MetadataSpec corresponds to BlueprintMetadataSpec in the CFT schema
+// See https://github.com/GoogleCloudPlatform/cloud-foundation-toolkit/blob/master/cli/bpmetadata/schema/gcp-blueprint-metadata.json#L299
+type MetadataSpec struct {
+	Requirements MetadataRequirements `yaml:"requirements"`
+}
+
+// MetadataRequirements corresponds to BlueprintRequirements in the CFT schema
+// See https://github.com/GoogleCloudPlatform/cloud-foundation-toolkit/blob/master/cli/bpmetadata/schema/gcp-blueprint-metadata.json#L416
+type MetadataRequirements struct {
+	Services []string `yaml:"services"`
+}
+
+// GetMetadata reads and parses `metadata.yaml` from the module root.
+// Expects source to be either a local or embedded path.
+func GetMetadata(source string) (Metadata, error) {
+	var err error
+	var data []byte
+	filePath := filepath.Join(source, "metadata.yaml")
+
+	switch {
+	case sourcereader.IsEmbeddedPath(source):
+		data, err = sourcereader.ModuleFS.ReadFile(filePath)
+	case sourcereader.IsLocalPath(source):
+		var absPath string
+		if absPath, err = filepath.Abs(filePath); err == nil {
+			data, err = os.ReadFile(absPath)
+		}
+	default:
+		err = errors.New("source must be local or embedded")
+	}
+	if err != nil {
+		return Metadata{}, err
+	}
+
+	var mtd Metadata
+	err = yaml.Unmarshal(data, &mtd)
+	return mtd, err
+}
+
+// GetMetadataSafe attempts GetMetadata; if it fails, it returns
+// hardcoded legacy metadata.
+func GetMetadataSafe(source string) Metadata {
+	if mtd, err := GetMetadata(source); err == nil {
+		return mtd
+	}
+	return legacyMetadata(source)
+}
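For context, this is the metadata.yaml shape that GetMetadata expects, shown as a self-contained sketch (the YAML literal and service names below are made up for illustration; the anonymous nested structs stand in for the Metadata/MetadataSpec/MetadataRequirements types above):

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// Same shape as the exported Metadata structs above, collapsed for brevity.
type Metadata struct {
	Spec struct {
		Requirements struct {
			Services []string `yaml:"services"`
		} `yaml:"requirements"`
	} `yaml:"spec"`
}

func main() {
	src := []byte(`
spec:
  requirements:
    services:
    - compute.googleapis.com
    - storage.googleapis.com
`)
	var mtd Metadata
	if err := yaml.Unmarshal(src, &mtd); err != nil {
		panic(err)
	}
	fmt.Println(mtd.Spec.Requirements.Services)
	// [compute.googleapis.com storage.googleapis.com]
}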
diff --git a/pkg/modulereader/metadata_legacy.go b/pkg/modulereader/metadata_legacy.go
new file mode 100644
index 0000000000..5ef9f05704
--- /dev/null
+++ b/pkg/modulereader/metadata_legacy.go
@@ -0,0 +1,192 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package modulereader
+
+import (
+	"strings"
+)
+
+func legacyMetadata(source string) Metadata {
+	services := []string{}
+	if idx := strings.LastIndex(source, "community/modules/"); idx != -1 {
+		services = defaultAPIList(source[idx:])
+	} else if idx := strings.LastIndex(source, "modules/"); idx != -1 {
+		services = defaultAPIList(source[idx:])
+	}
+
+	return Metadata{
+		Spec: MetadataSpec{
+			Requirements: MetadataRequirements{
+				Services: services,
+			},
+		},
+	}
+}
+
+func defaultAPIList(source string) []string {
+	// API lists at
+	// https://console.cloud.google.com/apis/dashboard and
+	// https://console.cloud.google.com/apis/library
+	staticAPIMap := map[string][]string{
+		"community/modules/compute/SchedMD-slurm-on-gcp-partition": {
+			"compute.googleapis.com",
+		},
+		"community/modules/compute/htcondor-execute-point": {
+			"compute.googleapis.com",
+			"storage.googleapis.com",
+		},
+		"community/modules/compute/pbspro-execution": {
+			"compute.googleapis.com",
+			"storage.googleapis.com",
+		},
+		"community/modules/compute/schedmd-slurm-gcp-v5-partition": {
+			"compute.googleapis.com",
+		},
+		"community/modules/database/slurm-cloudsql-federation": {
+			"bigqueryconnection.googleapis.com",
+			"sqladmin.googleapis.com",
+		},
+		"community/modules/file-system/DDN-EXAScaler": {
+			"compute.googleapis.com",
+			"deploymentmanager.googleapis.com",
+			"iam.googleapis.com",
+			"runtimeconfig.googleapis.com",
+		},
+		"community/modules/file-system/Intel-DAOS": {
+			"compute.googleapis.com",
+			"iam.googleapis.com",
+			"secretmanager.googleapis.com",
+		},
+		"community/modules/file-system/nfs-server": {
+			"compute.googleapis.com",
+		},
+		"community/modules/project/new-project": {
+			"admin.googleapis.com",
+			"cloudresourcemanager.googleapis.com",
+			"cloudbilling.googleapis.com",
+			"iam.googleapis.com",
+		},
+		"community/modules/project/service-account": {
+			"iam.googleapis.com",
+		},
+		"community/modules/project/service-enablement": {
+			"serviceusage.googleapis.com",
+		},
+		"community/modules/scheduler/SchedMD-slurm-on-gcp-controller": {
+			"compute.googleapis.com",
+		},
+		"community/modules/scheduler/SchedMD-slurm-on-gcp-login-node": {
+			"compute.googleapis.com",
+		},
+		"community/modules/compute/gke-node-pool": {
+			"container.googleapis.com",
+		},
+		"community/modules/scheduler/gke-cluster": {
+			"container.googleapis.com",
+		},
+		"modules/scheduler/batch-job-template": {
+			"batch.googleapis.com",
+			"compute.googleapis.com",
+		},
+		"modules/scheduler/batch-login-node": {
+			"batch.googleapis.com",
+			"compute.googleapis.com",
+			"storage.googleapis.com",
+		},
+		"community/modules/scheduler/htcondor-access-point": {
+			"compute.googleapis.com",
+			"storage.googleapis.com",
+		},
+		"community/modules/scheduler/htcondor-central-manager": {
+			"compute.googleapis.com",
+			"storage.googleapis.com",
+		},
+		"community/modules/scheduler/htcondor-pool-secrets": {
+			"iam.googleapis.com",
+			"secretmanager.googleapis.com",
+		},
+		"community/modules/scheduler/htcondor-setup": {
+			"iam.googleapis.com",
+			"storage.googleapis.com",
+		},
+		"community/modules/scheduler/pbspro-client": {
+			"compute.googleapis.com",
+			"storage.googleapis.com",
+		},
+		"community/modules/scheduler/pbspro-server": {
+			"compute.googleapis.com",
+			"storage.googleapis.com",
+		},
+		"community/modules/scheduler/schedmd-slurm-gcp-v5-controller": {
+			"compute.googleapis.com",
+			"iam.googleapis.com",
+			"pubsub.googleapis.com",
+			"secretmanager.googleapis.com",
+		},
+		"community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid": {
+			"compute.googleapis.com",
+			"pubsub.googleapis.com",
+		},
+		"community/modules/scheduler/schedmd-slurm-gcp-v5-login": {
+			"compute.googleapis.com",
+		},
+		"community/modules/scripts/htcondor-install": {},
+		"community/modules/scripts/omnia-install":    {},
+		"community/modules/scripts/pbspro-preinstall": {
+			"iam.googleapis.com",
+			"storage.googleapis.com",
+		},
+		"community/modules/scripts/pbspro-install": {},
+		"community/modules/scripts/pbspro-qmgr":    {},
+		"community/modules/scripts/spack-setup": {
+			"storage.googleapis.com",
+		},
+		"community/modules/scripts/wait-for-startup": {
+			"compute.googleapis.com",
+		},
+		"modules/compute/vm-instance": {
+			"compute.googleapis.com",
+		},
+		"modules/file-system/filestore": {
+			"file.googleapis.com",
+		},
+		"modules/file-system/cloud-storage-bucket": {
+			"storage.googleapis.com",
+		},
+		"modules/file-system/pre-existing-network-storage": {},
+		"modules/monitoring/dashboard": {
+			"stackdriver.googleapis.com",
+		},
+		"modules/network/pre-existing-vpc": {
+			"compute.googleapis.com",
+		},
+		"modules/network/vpc": {
+			"compute.googleapis.com",
+		},
+		"modules/packer/custom-image": {
+			"compute.googleapis.com",
+			"storage.googleapis.com",
+		},
+		"modules/scripts/startup-script": {
+			"storage.googleapis.com",
+		},
+	}
+
+	requiredAPIs, found := staticAPIMap[source]
+	if !found {
+		return []string{}
+	}
+	return requiredAPIs
+}
+ +variable "test_variable" { + description = "This is just a test" + type = string +} diff --git a/tools/cloud-build/build-test-go1-18.yaml b/pkg/modulereader/modules/test_role/test_module/metadata.yaml similarity index 74% rename from tools/cloud-build/build-test-go1-18.yaml rename to pkg/modulereader/modules/test_role/test_module/metadata.yaml index bba131194e..491df62c9a 100644 --- a/tools/cloud-build/build-test-go1-18.yaml +++ b/pkg/modulereader/modules/test_role/test_module/metadata.yaml @@ -1,10 +1,10 @@ -# Copyright 2022 Google LLC +# Copyright 2023 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -14,9 +14,8 @@ --- -steps: -- name: golang:1.18 - entrypoint: /bin/bash - args: - - -c - - make ghpc test-engine +spec: + requirements: + services: + - room.service.vip + - protection.service.GCPD diff --git a/pkg/modulereader/resreader.go b/pkg/modulereader/resreader.go index cddf11174f..314b902d48 100644 --- a/pkg/modulereader/resreader.go +++ b/pkg/modulereader/resreader.go @@ -23,7 +23,6 @@ import ( "io/ioutil" "log" "path" - "strings" "github.com/hashicorp/go-getter" "gopkg.in/yaml.v3" @@ -98,9 +97,9 @@ func enforceMapKeys(input map[string]interface{}, allowedKeys map[string]bool) e // ModuleInfo stores information about a module type ModuleInfo struct { - Inputs []VarInfo - Outputs []OutputInfo - RequiredApis []string + Inputs []VarInfo + Outputs []OutputInfo + Metadata Metadata } // GetOutputsAsMap returns the outputs list as a map for quicker access @@ -131,7 +130,9 @@ func GetModuleInfo(source string, kind string) (ModuleInfo, error) { var modPath string switch { - case sourcereader.IsGitPath(source): + case sourcereader.IsEmbeddedPath(source) || sourcereader.IsLocalPath(source): + modPath = source + default: tmpDir, err := ioutil.TempDir("", "module-*") if err != nil { return ModuleInfo{}, err @@ -141,19 +142,12 @@ func GetModuleInfo(source string, kind string) (ModuleInfo, error) { modPath = path.Join(pkgPath, subDir) sourceReader := sourcereader.Factory(pkgAddr) if err = sourceReader.GetModule(pkgAddr, pkgPath); err != nil { - if subDir != "" { + if subDir != "" && kind == "packer" { err = fmt.Errorf("module source %s included \"//\" package syntax; "+ "the \"//\" should typically be placed at the root of the repository:\n%w", source, err) - } return ModuleInfo{}, err } - - case sourcereader.IsEmbeddedPath(source) || sourcereader.IsLocalPath(source): - modPath = source - - default: - return ModuleInfo{}, fmt.Errorf("source is not valid: %s", source) } reader := Factory(kind) @@ -161,18 +155,7 @@ func GetModuleInfo(source string, kind string) (ModuleInfo, error) { if err != nil { return ModuleInfo{}, err } - - // add APIs required by the module, if known - if sourcereader.IsEmbeddedPath(source) { - mi.RequiredApis = defaultAPIList(modPath) - } else if sourcereader.IsLocalPath(source) { - if idx := strings.Index(modPath, "/community/modules/"); idx != -1 { - mi.RequiredApis = defaultAPIList(modPath[idx+1:]) - } else if idx := strings.Index(modPath, "/modules/"); idx != -1 { - mi.RequiredApis = defaultAPIList(modPath[idx+1:]) - } - } - + mi.Metadata = GetMetadataSafe(modPath) modInfoCache[key] = mi return mi, 
nil } @@ -201,159 +184,3 @@ func Factory(kind string) ModReader { } return r } - -func defaultAPIList(source string) []string { - // API lists at - // https://console.cloud.google.com/apis/dashboard and - // https://console.cloud.google.com/apis/library - staticAPIMap := map[string][]string{ - "community/modules/compute/SchedMD-slurm-on-gcp-partition": { - "compute.googleapis.com", - }, - "community/modules/compute/htcondor-execute-point": { - "compute.googleapis.com", - "storage.googleapis.com", - }, - "community/modules/compute/pbspro-execution": { - "compute.googleapis.com", - "storage.googleapis.com", - }, - "community/modules/compute/schedmd-slurm-gcp-v5-partition": { - "compute.googleapis.com", - }, - "community/modules/database/slurm-cloudsql-federation": { - "bigqueryconnection.googleapis.com", - "sqladmin.googleapis.com", - }, - "community/modules/file-system/DDN-EXAScaler": { - "compute.googleapis.com", - "deploymentmanager.googleapis.com", - "iam.googleapis.com", - "runtimeconfig.googleapis.com", - }, - "community/modules/file-system/Intel-DAOS": { - "compute.googleapis.com", - "iam.googleapis.com", - "secretmanager.googleapis.com", - }, - "community/modules/file-system/nfs-server": { - "compute.googleapis.com", - }, - "community/modules/project/new-project": { - "admin.googleapis.com", - "cloudresourcemanager.googleapis.com", - "cloudbilling.googleapis.com", - "iam.googleapis.com", - }, - "community/modules/project/service-account": { - "iam.googleapis.com", - }, - "community/modules/project/service-enablement": { - "serviceusage.googleapis.com", - }, - "community/modules/scheduler/SchedMD-slurm-on-gcp-controller": { - "compute.googleapis.com", - }, - "community/modules/scheduler/SchedMD-slurm-on-gcp-login-node": { - "compute.googleapis.com", - }, - "community/modules/compute/gke-node-pool": { - "container.googleapis.com", - }, - "community/modules/scheduler/gke-cluster": { - "container.googleapis.com", - }, - "modules/scheduler/batch-job-template": { - "batch.googleapis.com", - "compute.googleapis.com", - }, - "modules/scheduler/batch-login-node": { - "batch.googleapis.com", - "compute.googleapis.com", - "storage.googleapis.com", - }, - "community/modules/scheduler/htcondor-access-point": { - "compute.googleapis.com", - "storage.googleapis.com", - }, - "community/modules/scheduler/htcondor-central-manager": { - "compute.googleapis.com", - "storage.googleapis.com", - }, - "community/modules/scheduler/htcondor-pool-secrets": { - "iam.googleapis.com", - "secretmanager.googleapis.com", - }, - "community/modules/scheduler/htcondor-setup": { - "iam.googleapis.com", - "storage.googleapis.com", - }, - "community/modules/scheduler/pbspro-client": { - "compute.googleapis.com", - "storage.googleapis.com", - }, - "community/modules/scheduler/pbspro-server": { - "compute.googleapis.com", - "storage.googleapis.com", - }, - "community/modules/scheduler/schedmd-slurm-gcp-v5-controller": { - "compute.googleapis.com", - "iam.googleapis.com", - "pubsub.googleapis.com", - "secretmanager.googleapis.com", - }, - "community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid": { - "compute.googleapis.com", - "pubsub.googleapis.com", - }, - "community/modules/scheduler/schedmd-slurm-gcp-v5-login": { - "compute.googleapis.com", - }, - "community/modules/scripts/htcondor-install": {}, - "community/modules/scripts/omnia-install": {}, - "community/modules/scripts/pbspro-preinstall": { - "iam.googleapis.com", - "storage.googleapis.com", - }, - "community/modules/scripts/pbspro-install": {}, - 
"community/modules/scripts/pbspro-qmgr": {}, - "community/modules/scripts/spack-setup": { - "storage.googleapis.com", - }, - "community/modules/scripts/wait-for-startup": { - "compute.googleapis.com", - }, - "modules/compute/vm-instance": { - "compute.googleapis.com", - }, - "modules/file-system/filestore": { - "file.googleapis.com", - }, - "modules/file-system/cloud-storage-bucket": { - "storage.googleapis.com", - }, - "modules/file-system/pre-existing-network-storage": {}, - "modules/monitoring/dashboard": { - "stackdriver.googleapis.com", - }, - "modules/network/pre-existing-vpc": { - "compute.googleapis.com", - }, - "modules/network/vpc": { - "compute.googleapis.com", - }, - "modules/packer/custom-image": { - "compute.googleapis.com", - "storage.googleapis.com", - }, - "modules/scripts/startup-script": { - "storage.googleapis.com", - }, - } - - requiredAPIs, found := staticAPIMap[source] - if !found { - return []string{} - } - return requiredAPIs -} diff --git a/pkg/modulereader/resreader_test.go b/pkg/modulereader/resreader_test.go index e9b17d2240..7f0bca2eae 100644 --- a/pkg/modulereader/resreader_test.go +++ b/pkg/modulereader/resreader_test.go @@ -31,26 +31,6 @@ import ( const ( pkrKindString = "packer" tfKindString = "terraform" - testMainTf = ` -module "test_module" { - source = "testSource" -} -data "test_data" "test_data_name" { - name = "test_data_name" -} -` - testVariablesTf = ` -variable "test_variable" { - description = "This is just a test" - type = string -} -` - testOutputsTf = ` -output "test_output" { - description = "This is just a test" - value = "test_value" -} -` ) var ( @@ -96,23 +76,37 @@ func (s *MySuite) TestFactory(c *C) { func (s *MySuite) TestGetModuleInfo_Embedded(c *C) { sourcereader.ModuleFS = testModuleFS - // Success - moduleInfo, err := GetModuleInfo("modules/test_role/test_module", tfKindString) - c.Assert(err, IsNil) - c.Assert(moduleInfo.Inputs[0].Name, Equals, "test_variable") - c.Assert(moduleInfo.Outputs[0].Name, Equals, "test_output") + { // Success + mi, err := GetModuleInfo("modules/test_role/test_module", tfKindString) + c.Assert(err, IsNil) + c.Check(mi, DeepEquals, ModuleInfo{ + Inputs: []VarInfo{{ + Name: "test_variable", + Type: "string", + Description: "This is just a test", + Required: true}}, + Outputs: []OutputInfo{{ + Name: "test_output", + Description: "This is just a test", + Sensitive: false}}, + Metadata: Metadata{ + Spec: MetadataSpec{ + Requirements: MetadataRequirements{ + Services: []string{ + "room.service.vip", + "protection.service.GCPD", + }}}}}) + } - // Invalid: No embedded modules - badEmbeddedMod := "modules/does/not/exist" - moduleInfo, err = GetModuleInfo(badEmbeddedMod, tfKindString) - expectedErr := "failed to get info using tfconfig for terraform module at .*" - c.Assert(err, ErrorMatches, expectedErr) + { // Invalid: No embedded modules + _, err := GetModuleInfo("modules/does/not/exist", tfKindString) + c.Check(err, ErrorMatches, "failed to get info using tfconfig for terraform module at .*") + } - // Invalid: Unsupported Module Source - badSource := "gcs::https://www.googleapis.com/storage/v1/GoogleCloudPlatform/hpc-toolkit/modules" - moduleInfo, err = GetModuleInfo(badSource, tfKindString) - expectedErr = "source is not valid: .*" - c.Assert(err, ErrorMatches, expectedErr) + { // Invalid: Unsupported source + _, err := GetModuleInfo("wut::hpc-toolkit/modules", tfKindString) + c.Check(err, NotNil) + } } func (s *MySuite) TestGetModuleInfo_Git(c *C) { @@ -120,35 +114,46 @@ func (s *MySuite) 
TestGetModuleInfo_Git(c *C) { // Invalid git repository - path does not exists badGitRepo := "github.com:not/exist.git" _, err := GetModuleInfo(badGitRepo, tfKindString) - expectedErr := "failed to clone git module at .*" - c.Assert(err, ErrorMatches, expectedErr) + c.Check(err, NotNil) // Invalid: Unsupported Module Source badSource := "gcs::https://www.googleapis.com/storage/v1/GoogleCloudPlatform/hpc-toolkit/modules" _, err = GetModuleInfo(badSource, tfKindString) - expectedErr = "source is not valid: .*" - c.Assert(err, ErrorMatches, expectedErr) + c.Check(err, NotNil) } func (s *MySuite) TestGetModuleInfo_Local(c *C) { + { // Success + mi, err := GetModuleInfo(terraformDir, tfKindString) + c.Assert(err, IsNil) + c.Check(mi, DeepEquals, ModuleInfo{ + Inputs: []VarInfo{{ + Name: "test_variable", + Type: "string", + Description: "This is just a test", + Required: true}}, + Outputs: []OutputInfo{{ + Name: "test_output", + Description: "This is just a test", + Sensitive: false}}, + Metadata: Metadata{ + Spec: MetadataSpec{ + Requirements: MetadataRequirements{ + Services: []string{ + "room.service.vip", + "protection.service.GCPD", + }}}}}) + } - // Success - moduleInfo, err := GetModuleInfo(terraformDir, tfKindString) - c.Assert(err, IsNil) - c.Assert(moduleInfo.Inputs[0].Name, Equals, "test_variable") - c.Assert(moduleInfo.Outputs[0].Name, Equals, "test_output") - - // Invalid source path - path does not exists - badLocalMod := "./not/a/real/path" - moduleInfo, err = GetModuleInfo(badLocalMod, tfKindString) - expectedErr := "failed to get info using tfconfig for terraform module at .*" - c.Assert(err, ErrorMatches, expectedErr) + { // Invalid source path - path does not exists + _, err := GetModuleInfo("./not/a/real/path", tfKindString) + c.Assert(err, ErrorMatches, "failed to get info using tfconfig for terraform module at .*") + } - // Invalid: Unsupported Module Source - badSource := "gcs::https://www.googleapis.com/storage/v1/GoogleCloudPlatform/hpc-toolkit/modules" - moduleInfo, err = GetModuleInfo(badSource, tfKindString) - expectedErr = "source is not valid: .*" - c.Assert(err, ErrorMatches, expectedErr) + { // Invalid: Unsupported Module Source + _, err := GetModuleInfo("wut:://hpc-toolkit/modules", tfKindString) + c.Assert(err, NotNil) + } } func (s *MySuite) TestGetHCLInfo(c *C) { @@ -188,17 +193,25 @@ func (s *MySuite) TestGetInfo_TFReder(c *C) { // packerreader.go func (s *MySuite) TestGetInfo_PackerReader(c *C) { - // Didn't already exist, succeeds reader := NewPackerReader() - info, err := reader.GetInfo(packerDir) - c.Assert(err, IsNil) - c.Check(info, DeepEquals, ModuleInfo{ - Inputs: []VarInfo{{Name: "test_variable", Type: "string", Description: "This is just a test", Required: true}}}) + exp := ModuleInfo{ + Inputs: []VarInfo{{ + Name: "test_variable", + Type: "string", + Description: "This is just a test", + Required: true}}} + + { // Didn't already exist, succeeds + info, err := reader.GetInfo(packerDir) + c.Assert(err, IsNil) + c.Check(info, DeepEquals, exp) + } - // Already exists, succeeds - infoAgain, err := reader.GetInfo(packerDir) - c.Assert(err, IsNil) - c.Check(infoAgain, DeepEquals, info) + { // Already exists, succeeds + info, err := reader.GetInfo(packerDir) + c.Assert(err, IsNil) + c.Check(info, DeepEquals, exp) + } } // metareader.go @@ -250,84 +263,24 @@ func (s *MySuite) TestUnmarshalOutputInfo(c *C) { } // Util Functions -func createTmpModule() { +func copyEmbeddedModules() { var err error - tmpModuleDir, err = ioutil.TempDir("", 
"modulereader_tests_*") - if err != nil { + if tmpModuleDir, err = ioutil.TempDir("", "modulereader_tests_*"); err != nil { log.Fatalf( "Failed to create temp dir for module in modulereader_test, %v", err) } - - // Create terraform module dir - terraformDir = filepath.Join(tmpModuleDir, "terraformModule") - err = os.Mkdir(terraformDir, 0755) - if err != nil { - log.Fatalf("error creating test terraform module dir: %e", err) - } - - // main.tf file - mainFile, err := os.Create(filepath.Join(terraformDir, "main.tf")) - if err != nil { - log.Fatalf("Failed to create main.tf: %v", err) - } - _, err = mainFile.WriteString(testMainTf) - if err != nil { - log.Fatalf("modulereader_test: Failed to write main.tf test file. %v", err) - } - - // variables.tf file - varFile, err := os.Create(filepath.Join(terraformDir, "variables.tf")) - if err != nil { - log.Fatalf("Failed to create variables.tf: %v", err) - } - _, err = varFile.WriteString(testVariablesTf) - if err != nil { - log.Fatalf( - "modulereader_test: Failed to write variables.tf test file. %v", err) - } - - // outputs.tf file - outFile, err := os.Create(filepath.Join(terraformDir, "outputs.tf")) - if err != nil { - log.Fatalf("Failed to create outputs.tf: %v", err) - } - _, err = outFile.WriteString(testOutputsTf) - if err != nil { - log.Fatalf("modulereader_test: Failed to write outputs.tf test file. %v", err) - } - - // Create packer module dir - packerDir = filepath.Join(tmpModuleDir, "packerModule") - err = os.Mkdir(packerDir, 0755) - if err != nil { - log.Fatalf("error creating test packer module dir: %e", err) - } - - // main.pkr.hcl file - mainFile, err = os.Create(filepath.Join(packerDir, "main.pkr.hcl")) - if err != nil { - log.Fatalf("Failed to create main.pkr.hcl: %v", err) - } - _, err = mainFile.WriteString(testMainTf) - if err != nil { - log.Fatalf("modulereader_test: Failed to write main.pkr.hcl test file. %v", err) + sourcereader.ModuleFS = testModuleFS + rdr := sourcereader.EmbeddedSourceReader{} + if err = rdr.CopyDir("modules", tmpModuleDir); err != nil { + log.Fatalf("failed to copy embedded modules, %v", err) } - // variables.pkr.hcl file - varFile, err = os.Create(filepath.Join(packerDir, "variables.pkr.hcl")) - if err != nil { - log.Fatalf("Failed to create variables.pkr.hcl: %v", err) - } - _, err = varFile.WriteString(testVariablesTf) - if err != nil { - log.Fatalf( - "modulereader_test: Failed to write variables.pkr.hcl test file. 
%v", err) - } + terraformDir = filepath.Join(tmpModuleDir, "test_role", "test_module") + packerDir = filepath.Join(tmpModuleDir, "imaginarium", "zebra") } func teardownTmpModule() { - err := os.RemoveAll(tmpModuleDir) - if err != nil { + if err := os.RemoveAll(tmpModuleDir); err != nil { log.Fatalf( "modulereader_test: Failed to delete contents of test directory %s, %v", tmpModuleDir, err) @@ -335,7 +288,7 @@ func teardownTmpModule() { } func TestMain(m *testing.M) { - createTmpModule() + copyEmbeddedModules() code := m.Run() teardownTmpModule() os.Exit(code) diff --git a/pkg/modulewriter/modulewriter.go b/pkg/modulewriter/modulewriter.go index 06819957ac..6e2aa6c1ad 100644 --- a/pkg/modulewriter/modulewriter.go +++ b/pkg/modulewriter/modulewriter.go @@ -137,9 +137,9 @@ func createGroupDirs(deploymentPath string, deploymentGroups *[]config.Deploymen // DeploymentSource returns module source within deployment group // Rules are following: -// - git source -// - terraform => -// - packer => / +// - remote source +// = terraform => +// = packer => / // - packer // => // - embedded (source starts with "modules" or "comunity/modules") @@ -158,25 +158,23 @@ func DeploymentSource(mod config.Module) (string, error) { } func tfDeploymentSource(mod config.Module) (string, error) { - if sourcereader.IsGitPath(mod.Source) { - return mod.Source, nil - } - if sourcereader.IsEmbeddedPath(mod.Source) { + switch { + case sourcereader.IsEmbeddedPath(mod.Source): return "./modules/" + filepath.Join("embedded", mod.Source), nil + case sourcereader.IsLocalPath(mod.Source): + abs, err := filepath.Abs(mod.Source) + if err != nil { + return "", fmt.Errorf("failed to get absolute path for %#v: %v", mod.Source, err) + } + base := filepath.Base(mod.Source) + return fmt.Sprintf("./modules/%s-%s", base, shortHash(abs)), nil + default: + return mod.Source, nil } - if !sourcereader.IsLocalPath(mod.Source) { - return "", fmt.Errorf("unexpected module source %s", mod.Source) - } - abs, err := filepath.Abs(mod.Source) - if err != nil { - return "", fmt.Errorf("failed to get absolute path for %#v: %v", mod.Source, err) - } - base := filepath.Base(mod.Source) - return fmt.Sprintf("./modules/%s-%s", base, shortHash(abs)), nil } func packerDeploymentSource(mod config.Module) string { - if sourcereader.IsGitPath(mod.Source) { + if sourcereader.IsRemotePath(mod.Source) { _, subDir := getter.SourceDirSubdir(mod.Source) return filepath.Join(string(mod.ID), subDir) } @@ -217,18 +215,21 @@ func copySource(deploymentPath string, deploymentGroups *[]config.DeploymentGrou return err } - if sourcereader.IsGitPath(deplSource) { - continue // remote deployment source means that terraform will download it, no op - } - if sourcereader.IsEmbeddedPath(mod.Source) && mod.Kind == config.TerraformKind { - copyEmbedded = true - continue // all embedded terraform modules fill be copied at once + if mod.Kind == config.TerraformKind { + // some terraform modules do not require copying + if sourcereader.IsEmbeddedPath(mod.Source) { + copyEmbedded = true + continue // all embedded terraform modules fill be copied at once + } + if sourcereader.IsRemotePath(mod.Source) { + continue // will be downloaded by terraform + } } /* Copy source files */ var src, dst string - if sourcereader.IsGitPath(mod.Source) && mod.Kind == config.PackerKind { + if sourcereader.IsRemotePath(mod.Source) && mod.Kind == config.PackerKind { src, _ = getter.SourceDirSubdir(mod.Source) dst = filepath.Join(basePath, string(mod.ID)) } else { diff --git 
a/pkg/modulewriter/tfversions.go b/pkg/modulewriter/tfversions.go index fc39cb8f50..6c1992034b 100644 --- a/pkg/modulewriter/tfversions.go +++ b/pkg/modulewriter/tfversions.go @@ -21,11 +21,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.83.0" + version = "~> 4.84.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.83.0" + version = "~> 4.84.0" } } } diff --git a/pkg/shell/packer.go b/pkg/shell/packer.go index 0d51fe3baa..776ca9bbdf 100644 --- a/pkg/shell/packer.go +++ b/pkg/shell/packer.go @@ -29,7 +29,7 @@ func ConfigurePacker() error { _, err := exec.LookPath("packer") if err != nil { return &TfError{ - help: "must have a copy of packer installed in PATH", + help: "must have a copy of packer installed in PATH (obtain at https://packer.io)", err: err, } } diff --git a/pkg/shell/terraform.go b/pkg/shell/terraform.go index bf645ae81e..fef794921f 100644 --- a/pkg/shell/terraform.go +++ b/pkg/shell/terraform.go @@ -52,7 +52,7 @@ type TfError struct { } func (se *TfError) Error() string { - return fmt.Sprintf("%s (detailed error below)\n%s", se.help, se.err) + return fmt.Sprintf("%s\n%s", se.help, se.err) } type outputValue struct { @@ -67,7 +67,7 @@ func ConfigureTerraform(workingDir string) (*tfexec.Terraform, error) { path, err := exec.LookPath("terraform") if err != nil { return nil, &TfError{ - help: "must have a copy of terraform installed in PATH", + help: "must have a copy of terraform installed in PATH (obtain at https://terraform.io)", err: err, } } diff --git a/pkg/sourcereader/embedded.go b/pkg/sourcereader/embedded.go index e681755950..d89ad678fc 100644 --- a/pkg/sourcereader/embedded.go +++ b/pkg/sourcereader/embedded.go @@ -28,12 +28,11 @@ import ( // hpc-toolkit/modules are not accessible at the package level. var ModuleFS BaseFS -// BaseFS is an extension of the fs.FS interface with the functionality needed -// in CopyDirFromModules. Works with embed.FS and afero.FS +// BaseFS is an join interface with the functionality needed +// in copyDirFromModules. Works with embed.FS and afero.FS type BaseFS interface { - fs.FS - ReadDir(string) ([]fs.DirEntry, error) - ReadFile(string) ([]byte, error) + fs.ReadDirFS + fs.ReadFileFS } // EmbeddedSourceReader reads modules from a local directory diff --git a/pkg/sourcereader/git.go b/pkg/sourcereader/git.go deleted file mode 100644 index dcc1ef06a9..0000000000 --- a/pkg/sourcereader/git.go +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2022 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
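A standalone sketch (illustrative, not part of this diff) of why the slimmed-down BaseFS needs no extra methods: embed.FS already implements both fs.ReadDirFS and fs.ReadFileFS, so it satisfies the composite interface directly. The embed pattern below is an assumption for the example:

package main

import (
	"embed"
	"fmt"
	"io/fs"
)

// Mirrors the BaseFS declaration above: a union of two stdlib interfaces.
type BaseFS interface {
	fs.ReadDirFS
	fs.ReadFileFS
}

//go:embed *.go
var content embed.FS

func main() {
	var bfs BaseFS = content // compiles: embed.FS provides Open, ReadDir and ReadFile
	entries, err := bfs.ReadDir(".")
	if err != nil {
		panic(err)
	}
	for _, e := range entries {
		fmt.Println(e.Name()) // lists the embedded files
	}
}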
- -package sourcereader - -import ( - "context" - "fmt" - "io/ioutil" - "os" - "path/filepath" - "time" - - "github.com/hashicorp/go-getter" -) - -var goGetterDetectors = []getter.Detector{ - new(getter.GitHubDetector), - new(getter.GitDetector), -} - -var goGetterGetters = map[string]getter.Getter{ - "git": &getter.GitGetter{Timeout: 5 * time.Minute}, -} - -var goGetterDecompressors = map[string]getter.Decompressor{} - -// GitSourceReader reads modules from a git repository -type GitSourceReader struct{} - -func copyGitModules(srcPath string, destPath string) error { - client := getter.Client{ - Src: srcPath, - Dst: destPath, - Pwd: destPath, - - Mode: getter.ClientModeDir, - - Detectors: goGetterDetectors, - Decompressors: goGetterDecompressors, - Getters: goGetterGetters, - Ctx: context.Background(), - } - err := client.Get() - return err -} - -// GetModule copies the git source to a provided destination (the deployment directory) -func (r GitSourceReader) GetModule(modPath string, copyPath string) error { - if !IsGitPath(modPath) { - return fmt.Errorf("source is not valid: %s", modPath) - } - - modDir, err := ioutil.TempDir("", "git-module-*") - defer os.RemoveAll(modDir) - writeDir := filepath.Join(modDir, "mod") - if err != nil { - return err - } - - if err := copyGitModules(modPath, writeDir); err != nil { - return fmt.Errorf("failed to clone git module at %s to tmp dir %s: %v", - modPath, writeDir, err) - } - - return copyFromPath(writeDir, copyPath) -} diff --git a/pkg/sourcereader/goget.go b/pkg/sourcereader/goget.go new file mode 100644 index 0000000000..e5abe44a26 --- /dev/null +++ b/pkg/sourcereader/goget.go @@ -0,0 +1,72 @@ +// Copyright 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sourcereader + +import ( + "context" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "time" + + "github.com/hashicorp/go-getter" +) + +// GoGetterSourceReader reads modules from a git repository +type GoGetterSourceReader struct{} + +func getterClient(source string, dst string) getter.Client { + return getter.Client{ + Src: source, + Dst: dst, + Pwd: dst, + + //Mode: getter.ClientModeDir, + Mode: getter.ClientModeAny, + + Detectors: []getter.Detector{ + new(getter.GitHubDetector), + new(getter.GitDetector), + new(getter.GCSDetector), + }, + Getters: map[string]getter.Getter{ + "git": &getter.GitGetter{Timeout: 5 * time.Minute}, + "gcs": &getter.GCSGetter{Timeout: 5 * time.Minute}, + }, + + // Disable decompression (e.g. 
tar, zip) by supplying no decompressors + Decompressors: map[string]getter.Decompressor{}, + Ctx: context.Background(), + } +} + +// GetModule copies the git source to a provided destination (the deployment directory) +func (r GoGetterSourceReader) GetModule(source string, dst string) error { + tmp, err := ioutil.TempDir("", "get-module-*") + defer os.RemoveAll(tmp) + if err != nil { + return err + } + + writeDir := filepath.Join(tmp, "mod") + client := getterClient(source, writeDir) + + if err := client.Get(); err != nil { + return fmt.Errorf("failed to get module at %s to %s: %w", source, writeDir, err) + } + + return copyFromPath(writeDir, dst) +} diff --git a/pkg/sourcereader/git_test.go b/pkg/sourcereader/goget_test.go similarity index 78% rename from pkg/sourcereader/git_test.go rename to pkg/sourcereader/goget_test.go index 4160c6fa72..e076a6d668 100644 --- a/pkg/sourcereader/git_test.go +++ b/pkg/sourcereader/goget_test.go @@ -28,10 +28,11 @@ func (s *MySuite) TestCopyGitModules(c *C) { if err := os.Mkdir(destDir, 0755); err != nil { log.Fatal(err) } + reader := GoGetterSourceReader{} // Success via HTTPS destDirForHTTPS := filepath.Join(destDir, "https") - err := copyGitModules("github.com/terraform-google-modules/terraform-google-project-factory//helpers", destDirForHTTPS) + err := reader.GetModule("github.com/terraform-google-modules/terraform-google-project-factory//helpers", destDirForHTTPS) c.Assert(err, IsNil) fInfo, err := os.Stat(filepath.Join(destDirForHTTPS, "terraform_validate")) c.Assert(err, IsNil) @@ -41,7 +42,7 @@ func (s *MySuite) TestCopyGitModules(c *C) { // Success via HTTPS (Root directory) destDirForHTTPSRootDir := filepath.Join(destDir, "https-rootdir") - err = copyGitModules("github.com/terraform-google-modules/terraform-google-service-accounts.git?ref=v4.1.1", destDirForHTTPSRootDir) + err = reader.GetModule("github.com/terraform-google-modules/terraform-google-service-accounts.git?ref=v4.1.1", destDirForHTTPSRootDir) c.Assert(err, IsNil) fInfo, err = os.Stat(filepath.Join(destDirForHTTPSRootDir, "main.tf")) c.Assert(err, IsNil) @@ -51,17 +52,15 @@ func (s *MySuite) TestCopyGitModules(c *C) { } func (s *MySuite) TestGetModule_Git(c *C) { - reader := GitSourceReader{} + reader := GoGetterSourceReader{} // Invalid git repository - path does not exists badGitRepo := "github.com:not/exist.git" err := reader.GetModule(badGitRepo, tfKindString) - expectedErr := "failed to clone git module at .*" - c.Assert(err, ErrorMatches, expectedErr) + c.Assert(err, NotNil) // Invalid: Unsupported Module Source - badSource := "gcs::https://www.googleapis.com/storage/v1/GoogleCloudPlatform/hpc-toolkit/modules" + badSource := "wut::https://www.googleapis.com/storage/v1/GoogleCloudPlatform/hpc-toolkit/modules" err = reader.GetModule(badSource, tfKindString) - expectedErr = "source is not valid: .*" - c.Assert(err, ErrorMatches, expectedErr) + c.Assert(err, NotNil) } diff --git a/pkg/sourcereader/local.go b/pkg/sourcereader/local.go index a8744cb6f5..bb3c5a5948 100644 --- a/pkg/sourcereader/local.go +++ b/pkg/sourcereader/local.go @@ -29,7 +29,7 @@ func (r LocalSourceReader) GetModule(modPath string, copyPath string) error { } if _, err := os.Stat(modPath); os.IsNotExist(err) { - return fmt.Errorf("Local module doesn't exist at %s", modPath) + return fmt.Errorf("local module doesn't exist at %s", modPath) } return copyFromPath(modPath, copyPath) diff --git a/pkg/sourcereader/local_test.go b/pkg/sourcereader/local_test.go index 3bc1330913..c70cf91814 100644 --- 
a/pkg/sourcereader/local_test.go +++ b/pkg/sourcereader/local_test.go @@ -116,7 +116,7 @@ func (s *MySuite) TestGetModule_Local(c *C) { // Invalid: No local module badLocalMod := "./modules/does/not/exist" err = reader.GetModule(badLocalMod, dest) - expectedErr = "Local module doesn't exist at .*" + expectedErr = "local module doesn't exist at .*" c.Assert(err, ErrorMatches, expectedErr) // Invalid: Unsupported Module Source by LocalSourceReader diff --git a/pkg/sourcereader/sourcereader.go b/pkg/sourcereader/sourcereader.go index 0ec5728634..3913a80155 100644 --- a/pkg/sourcereader/sourcereader.go +++ b/pkg/sourcereader/sourcereader.go @@ -16,7 +16,6 @@ package sourcereader import ( "hpc-toolkit/pkg/deploymentio" - "log" "strings" ) @@ -38,34 +37,21 @@ func IsEmbeddedPath(source string) bool { return strings.HasPrefix(source, "modules/") || strings.HasPrefix(source, "community/modules/") } -// IsGitPath checks if a source path points to GitHub or has the git:: prefix -func IsGitPath(source string) bool { - return strings.HasPrefix(source, "github.com") || - strings.HasPrefix(source, "git@github.com") || - strings.HasPrefix(source, "git::") +// IsRemotePath checks if path neither Local nor Embedded +func IsRemotePath(source string) bool { + return !IsLocalPath(source) && !IsEmbeddedPath(source) } // Factory returns a SourceReader of module path func Factory(modPath string) SourceReader { - validPrefixes := []string{ - "/", "./", "../", - "modules/", "community/modules/", - "git@", "github.com", - } switch { case IsLocalPath(modPath): return LocalSourceReader{} case IsEmbeddedPath(modPath): return EmbeddedSourceReader{} - case IsGitPath(modPath): - return GitSourceReader{} default: - log.Fatalf( - "Source (%s) not valid, must begin with one of: %s", - modPath, strings.Join(validPrefixes, ", ")) + return GoGetterSourceReader{} } - - return nil } func copyFromPath(modPath string, copyPath string) error { diff --git a/pkg/sourcereader/sourcereader_test.go b/pkg/sourcereader/sourcereader_test.go index 489a5df983..42b51380e6 100644 --- a/pkg/sourcereader/sourcereader_test.go +++ b/pkg/sourcereader/sourcereader_test.go @@ -109,46 +109,50 @@ func (s *MySuite) TestIsLocalPath(c *C) { c.Assert(ret, Equals, false) } -func (s *MySuite) TestIsGitRepository(c *C) { +func (s *MySuite) TestIsRemotePath(c *C) { // False: Is an embedded path - ret := IsGitPath("modules/anything/else") - c.Assert(ret, Equals, false) + ret := IsRemotePath("modules/anything/else") + c.Check(ret, Equals, false) // False: Local path - ret = IsGitPath("./anything/else") - c.Assert(ret, Equals, false) + ret = IsRemotePath("./anything/else") + c.Check(ret, Equals, false) - ret = IsGitPath("./modules") - c.Assert(ret, Equals, false) + ret = IsRemotePath("./modules") + c.Check(ret, Equals, false) - ret = IsGitPath("../modules/") - c.Assert(ret, Equals, false) + ret = IsRemotePath("../modules/") + c.Check(ret, Equals, false) // True, other - ret = IsGitPath("github.com/modules") - c.Assert(ret, Equals, true) + ret = IsRemotePath("github.com/modules") + c.Check(ret, Equals, true) // True, genetic git repository - ret = IsGitPath("git::https://gitlab.com/modules") - c.Assert(ret, Equals, true) + ret = IsRemotePath("git::https://gitlab.com/modules") + c.Check(ret, Equals, true) + + // True, invalid path though nor local nor embedded + ret = IsRemotePath("wut:://modules") + c.Check(ret, Equals, true) } func (s *MySuite) TestFactory(c *C) { // Local modules locSrcReader := Factory("./modules/anything/else") - 
c.Assert(reflect.TypeOf(locSrcReader), Equals, reflect.TypeOf(LocalSourceReader{})) + c.Check(reflect.TypeOf(locSrcReader), Equals, reflect.TypeOf(LocalSourceReader{})) // Embedded modules embSrcReader := Factory("modules/anything/else") - c.Assert(reflect.TypeOf(embSrcReader), Equals, reflect.TypeOf(EmbeddedSourceReader{})) + c.Check(reflect.TypeOf(embSrcReader), Equals, reflect.TypeOf(EmbeddedSourceReader{})) // GitHub modules ghSrcString := Factory("github.com/modules") - c.Assert(reflect.TypeOf(ghSrcString), Equals, reflect.TypeOf(GitSourceReader{})) + c.Check(reflect.TypeOf(ghSrcString), Equals, reflect.TypeOf(GoGetterSourceReader{})) // Git modules gitSrcString := Factory("git::https://gitlab.com/modules") - c.Assert(reflect.TypeOf(gitSrcString), Equals, reflect.TypeOf(GitSourceReader{})) + c.Check(reflect.TypeOf(gitSrcString), Equals, reflect.TypeOf(GoGetterSourceReader{})) } func (s *MySuite) TestCopyFromPath(c *C) { diff --git a/pkg/validators/validators.go b/pkg/validators/validators.go index 26da80fba4..a798e75e74 100644 --- a/pkg/validators/validators.go +++ b/pkg/validators/validators.go @@ -311,7 +311,8 @@ func testApisEnabled(bp config.Blueprint, inputs config.Dict) error { } apis := map[string]bool{} bp.WalkModules(func(m *config.Module) error { - for _, api := range m.InfoOrDie().RequiredApis { + services := m.InfoOrDie().Metadata.Spec.Requirements.Services + for _, api := range services { apis[api] = true } return nil diff --git a/tools/clean-metadata.sh b/tools/clean-metadata.sh new file mode 100755 index 0000000000..f08c9074f0 --- /dev/null +++ b/tools/clean-metadata.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Script checks for any slurm metadata that does not have at least one +# corresponding VM instance. We can assume the metadata is orphaned at that +# point and delete it. + +set -e + +# Check for project id +PROJECT_ID=${PROJECT_ID:-$(gcloud config get-value project)} +if [ -z "$PROJECT_ID" ]; then + echo "PROJECT_ID must be defined" + exit 1 +fi +echo "Checking for left over compute metadata for $PROJECT_ID" + +# Gather gcloud data - specifically checking for keys that match the slurm metadata pattern +KEYS=$(gcloud compute --project "${PROJECT_ID}" project-info describe --flatten="commonInstanceMetadata[]" | grep "key: .*-slurm-.*" | sed 's/.*: //') +INSTANCES=$(gcloud compute instances list) + +# Loop through metadata keys and check for keys that don't have corresponding instances running +if [ "${KEYS}" != "" ]; then + while read -r key; do + CLUSTER_NAME=${key/-*/} + if [ "$(echo "${INSTANCES}" | grep "${CLUSTER_NAME}")" == "" ]; then + echo "Metadata '${key}' has no corresponding cluster, removing." 
+ if [ -z "${KEYS_TO_DELETE}" ]; then + KEYS_TO_DELETE="${key}" + else + KEYS_TO_DELETE="${KEYS_TO_DELETE},${key}" + fi + fi + done <<<"${KEYS}" +fi + +# Delete keys that are orphaned +if [ -v KEYS_TO_DELETE ]; then + echo "Running gcloud compute project-info remove-metadata --keys=\"${KEYS_TO_DELETE}\"" + gcloud compute project-info remove-metadata --keys="${KEYS_TO_DELETE}" +fi diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml index ccea2e89fa..64449e1366 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml @@ -168,7 +168,6 @@ - name: Trigger failure (rescue blocks otherwise revert failures) ansible.builtin.fail: msg: "Failed while setting up test infrastructure" - when: true - name: Run Integration Tests hosts: remote_host @@ -210,6 +209,20 @@ loop_control: loop_var: test + rescue: + - name: Check for stockout errors + ansible.builtin.include_tasks: + file: tasks/check_stockout.yml + apply: + delegate_to: localhost + vars: + deployment_name: "{{ deployment_name }}" + project: "{{ project }}" + + - name: Trigger failure (rescue blocks otherwise revert failures) + ansible.builtin.fail: + msg: "Failed during the integration tests (see above)" + ## Always cleanup, even on failure always: - name: Ensure all nodes are powered down @@ -223,6 +236,7 @@ - name: Get partition info after test ansible.builtin.command: sinfo + failed_when: False changed_when: False register: partition_post_run_output diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/check_stockout.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/check_stockout.yml new file mode 100644 index 0000000000..5b03964672 --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/check_stockout.yml @@ -0,0 +1,46 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Assert variables are defined + ansible.builtin.assert: + that: + - project is defined + - deployment_name is defined + +- name: Check logs for stockout on compute nodes + changed_when: false + register: stockout + until: stockout.rc == 0 + retries: 5 + delay: 10 + ansible.builtin.command: >- + gcloud logging --project {{ project }} read + 'protoPayload.response.error.errors.message="{{ item }}" AND protoPayload.request.instanceProperties.labels.value="{{ deployment_name }}"' + --flatten="protoPayload.request.perInstanceProperties" + --format='table(protoPayload.request.perInstanceProperties.key:label=INSTANCE_ID,protoPayload.response.error.errors.message.list():label=ERROR_MESSAGE)' + --limit=5 + with_items: + - Region does not currently have sufficient capacity for the requested resources. 
+ - No eligible zone could be found in this region for given properties + +- name: Log compute stockout error + ansible.builtin.debug: + msg: | + "Abbreviated listing of nodes that could not be created:" + "{{ item.stdout }}" + when: item.stdout != "" + with_items: "{{ stockout.results }}" + loop_control: + label: "{{ item.item }}" diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml index 43eec01885..4f6d93eca8 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml @@ -27,7 +27,7 @@ - name: Create cli flag for extra deployment variables ansible.builtin.set_fact: - deployment_vars_str: "--vars \\\"{{ cli_deployment_vars.items() | map('join', '=') | join('\\\" --vars \\\"') }}\\\"" + deployment_vars_str: "--vars \"\\\"{{ cli_deployment_vars.items() | map('join', '=') | join('\\\"\" --vars \"\\\"') }}\\\"\"" when: cli_deployment_vars is defined and cli_deployment_vars is mapping - name: Create Blueprint diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-debian.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-debian.yaml new file mode 100644 index 0000000000..1e2570a0e8 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-debian.yaml @@ -0,0 +1,54 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +timeout: 14400s # 4hr +steps: +## Test simple golang build +- id: build_ghpc + waitFor: ["-"] + name: "golang:bullseye" + entrypoint: /bin/bash + args: + - -c + - | + cd /workspace + make +- id: fetch_builder + waitFor: ["-"] + name: >- + us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + args: + - -c + - echo "done fetching builder" + +## Test Slurm v5 Debian Example +- id: slurm-gcp-v5-debian + waitFor: ["fetch_builder", "build_ghpc"] + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml" diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-rocky8.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-rocky8.yaml new file mode 100644 index 0000000000..ae2d7c38e3 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v5-rocky8.yaml @@ -0,0 +1,54 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +timeout: 14400s # 4hr +steps: +## Test simple golang build +- id: build_ghpc + waitFor: ["-"] + name: "golang:bullseye" + entrypoint: /bin/bash + args: + - -c + - | + cd /workspace + make +- id: fetch_builder + waitFor: ["-"] + name: >- + us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + args: + - -c + - echo "done fetching builder" + +## Test Slurm v5 rocky8 Example +- id: slurm-gcp-v5-rocky8 + waitFor: ["fetch_builder", "build_ghpc"] + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml" diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-rocky8.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-rocky8.yaml new file mode 100644 index 0000000000..b0037d7a06 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-rocky8.yaml @@ -0,0 +1,55 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+tags: [slurm6, rocky8]
+timeout: 14400s # 4hr
+steps:
+## Test simple golang build
+- id: build_ghpc
+  waitFor: ["-"]
+  name: "golang:bullseye"
+  entrypoint: /bin/bash
+  args:
+  - -c
+  - |
+    cd /workspace
+    make
+- id: fetch_builder
+  waitFor: ["-"]
+  name: >-
+    us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder
+  entrypoint: /bin/bash
+  args:
+  - -c
+  - echo "done fetching builder"
+
+## Test Slurm v6 rocky8 Example
+- id: slurm-gcp-v6-rocky8
+  waitFor: ["fetch_builder", "build_ghpc"]
+  name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder
+  entrypoint: /bin/bash
+  env:
+  - "ANSIBLE_HOST_KEY_CHECKING=false"
+  - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg"
+  args:
+  - -c
+  - |
+    set -x -e
+    BUILD_ID_FULL=$BUILD_ID
+    BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6}
+
+    ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \
+    --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \
+    --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml"
diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml
new file mode 100644
index 0000000000..6d4872d4ac
--- /dev/null
+++ b/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml
@@ -0,0 +1,44 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+
+test_name: hpc-slurm-debian
+deployment_name: "debi-v5-{{ build }}"
+# Manually adding the slurm_cluster_name for use in node names, which filters
+# non-alphanumeric chars and is capped at 10 chars.
+slurm_cluster_name: "debiv5{{ build[0:4] }}"
+
+cli_deployment_vars:
+  instance_image: "{family: slurm-gcp-5-9-debian-11, project: schedmd-slurm-public}"
+  region: us-west4
+  zone: us-west4-c
+
+zone: us-west4-c
+workspace: /workspace
+blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm-ubuntu2004.yaml"
+network: "{{ deployment_name }}-net"
+max_nodes: 5
+# Note: Pattern matching in gcloud only supports 1 wildcard, so debiv5*-login-* won't work.
+login_node: "{{ slurm_cluster_name }}-login-*"
+controller_node: "{{ slurm_cluster_name }}-controller"
+post_deploy_tests:
+- test-mounts.yml
+- test-partitions.yml
+custom_vars:
+  partitions:
+  - compute
+  - debug
+  mounts:
+  - /home
diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml
new file mode 100644
index 0000000000..ff237b3e10
--- /dev/null
+++ b/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml
@@ -0,0 +1,44 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+
+test_name: hpc-slurm-rocky8
+deployment_name: "rock-8-{{ build }}"
+# Manually adding the slurm_cluster_name for use in node names, which filters
+# non-alphanumeric chars and is capped at 10 chars.
+slurm_cluster_name: "rock8{{ build[0:5] }}"
+
+cli_deployment_vars:
+  instance_image: "{family: slurm-gcp-5-9-hpc-rocky-linux-8, project: schedmd-slurm-public}"
+  region: us-west4
+  zone: us-west4-c
+
+zone: us-west4-c
+workspace: /workspace
+blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm-ubuntu2004.yaml"
+network: "{{ deployment_name }}-net"
+max_nodes: 5
+# Note: Pattern matching in gcloud only supports 1 wildcard, so rock8*-login-* won't work.
+login_node: "{{ slurm_cluster_name }}-login-*"
+controller_node: "{{ slurm_cluster_name }}-controller"
+post_deploy_tests:
+- test-mounts.yml
+- test-partitions.yml
+custom_vars:
+  partitions:
+  - compute
+  - debug
+  mounts:
+  - /home
diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml
new file mode 100644
index 0000000000..fb1f31f35a
--- /dev/null
+++ b/tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml
@@ -0,0 +1,41 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+
+test_name: hpc-slurm6-rocky8
+deployment_name: "rock-8-{{ build }}"
+# Manually adding the slurm_cluster_name for use in node names, which filters
+# non-alphanumeric chars and is capped at 10 chars.
+slurm_cluster_name: "rock8{{ build[0:5] }}"
+
+cli_deployment_vars:
+  region: us-west4
+  zone: us-west4-c
+
+zone: us-west4-c
+workspace: /workspace
+blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm6.yaml"
+network: "{{ deployment_name }}-net"
+max_nodes: 5
+# Note: Pattern matching in gcloud only supports 1 wildcard, so a*-login-* won't work.
+login_node: "{{ slurm_cluster_name }}-login-*"
+controller_node: "{{ slurm_cluster_name }}-controller"
+post_deploy_tests:
+- test-partitions.yml
+custom_vars:
+  partitions:
+  - compute
+  - debug
+  mounts: []
diff --git a/tools/cloud-build/project-cleanup.yaml b/tools/cloud-build/project-cleanup.yaml
index 9df9123eab..1057d4b8fc 100644
--- a/tools/cloud-build/project-cleanup.yaml
+++ b/tools/cloud-build/project-cleanup.yaml
@@ -26,3 +26,4 @@ steps:
       set -e
       /workspace/tools/clean-resource-policies.sh
       /workspace/tools/clean-filestore-limit.sh
+      /workspace/tools/clean-metadata.sh
diff --git a/tools/cloud-build/provision/README.md b/tools/cloud-build/provision/README.md
index 0041cf6496..400782e8dd 100644
--- a/tools/cloud-build/provision/README.md
+++ b/tools/cloud-build/provision/README.md
@@ -47,7 +47,7 @@ When prompted for project, use integration test project.
 |------|------|
 | [google_cloudbuild_trigger.daily_project_cleanup](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource |
 | [google_cloudbuild_trigger.daily_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource |
-| [google_cloudbuild_trigger.pr_go_1_18_build_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource |
+| [google_cloudbuild_trigger.pr_go_build_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource |
 | [google_cloudbuild_trigger.pr_ofe_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource |
 | [google_cloudbuild_trigger.pr_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource |
 | [google_cloudbuild_trigger.pr_validation](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource |
diff --git a/tools/cloud-build/provision/pr-go1-18-build-test.tf b/tools/cloud-build/provision/pr-go-build-test.tf
similarity index 68%
rename from tools/cloud-build/provision/pr-go1-18-build-test.tf
rename to tools/cloud-build/provision/pr-go-build-test.tf
index 96c4c1f864..8b85d3d5b0 100644
--- a/tools/cloud-build/provision/pr-go1-18-build-test.tf
+++ b/tools/cloud-build/provision/pr-go-build-test.tf
@@ -12,11 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-resource "google_cloudbuild_trigger" "pr_go_1_18_build_test" {
-  name        = "PR-Go-1-18-build-test"
-  description = "Test that the PR builds with Go 1.18"
-  filename    = "tools/cloud-build/build-test-go1-18.yaml"
+resource "google_cloudbuild_trigger" "pr_go_build_test" {
+  for_each = toset(["1.20", "1.21"])
+
+  name        = "PR-Go-${replace(each.key, ".", "-")}-build-test"
+  description = "Test that the PR builds with Go ${each.key}"
+
+  build {
+    step {
+      name       = "golang:${each.key}"
+      entrypoint = "/bin/bash"
+      args       = ["-c", "make ghpc test-engine"]
+    }
+  }
 
   github {
     owner = "GoogleCloudPlatform"
diff --git a/tools/cloud-build/quota-check/README.md b/tools/cloud-build/quota-check/README.md
new file mode 100644
index 0000000000..0571fd8bec
--- /dev/null
+++ b/tools/cloud-build/quota-check/README.md
@@ -0,0 +1,13 @@
+# `quota-check` tool
+
+`quota-check` is a tool to verify that a GCP project has enough quota across multiple regions and zones.
+
+## Usage
+
+* Configure the desired resource quota requirements in `bp.yaml`;
+* Configure the set of regions and zones to check in `check.py`;
+* Run the tool:
+
+```sh
+tools/cloud-build/quota-check/check.py --project=
+```
diff --git a/tools/cloud-build/quota-check/bp.yaml b/tools/cloud-build/quota-check/bp.yaml
new file mode 100644
index 0000000000..9f34b1dd1b
--- /dev/null
+++ b/tools/cloud-build/quota-check/bp.yaml
@@ -0,0 +1,36 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+blueprint_name: quota-check
+
+vars:
+  deployment_name: quota-check
+  project_id: # Provided by check.py
+  region: # Provided by check.py
+  zone: # Provided by check.py
+
+# Need at least one module to have a valid blueprint
+deployment_groups:
+- group: noop
+  modules:
+  - id: noop
+    source: modules/network/pre-existing-vpc
+
+validators:
+- validator: test_resource_requirements
+  inputs:
+    requirements:
+    - metric: compute.googleapis.com/c2_cpus
+      required: 200
diff --git a/tools/cloud-build/quota-check/check.py b/tools/cloud-build/quota-check/check.py
new file mode 100755
index 0000000000..ba634612f6
--- /dev/null
+++ b/tools/cloud-build/quota-check/check.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import subprocess
+from subprocess import CalledProcessError
+from typing import List
+
+DESCRIPTION = """
+quota-check is a tool to verify that a GCP project has enough quota across multiple regions and zones.
+
+Usage:
+tools/cloud-build/quota-check/check.py --project=
+"""
+
+LOCATIONS = {
+    "us-central1": ["a", "c"],
+    "us-central2": ["a", "b"],
+}
+
+
+def _run_ghpc(args: List[str]) -> None:
+    subprocess.run(["./ghpc " + " ".join(args)], shell=True, check=True, capture_output=True)
+
+def _process_ghpc_output(serr: str) -> None:
+    for l in serr.splitlines():
+        if l.startswith("not enough quota"):
+            print(l)
+
+def _check_zone(project: str, region: str, zone: str) -> None:
+    print(f"Checking {region=} {zone=}", end=" ")
+    try:
+        _run_ghpc([
+            "expand", "tools/cloud-build/quota-check/bp.yaml",
+            "-l ERROR", # so failing validators cause a non-zero exit
+            "--skip-validators='test_deployment_variable_not_used'", # this validator produces a false positive here and is not relevant
+            f"--vars='project_id={project},{region=},{zone=}'",
+        ])
+    except CalledProcessError as e:
+        print("FAIL")
+        _process_ghpc_output(e.stderr.decode("utf-8"))
+    else:
+        print("OK")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=DESCRIPTION)
+    parser.add_argument("--project", help="The project ID.")
+
+    args = parser.parse_args()
+    assert args.project, DESCRIPTION
+
+
+    try:
+        _run_ghpc(["--version"]) # Smoke test
+    except CalledProcessError as e:
+        print(e.stderr.decode("utf-8"))
+        exit(e.returncode)
+
+    for region, suffixes in LOCATIONS.items():
+        for suffix in suffixes:
+            zone = f"{region}-{suffix}"
+            _check_zone(args.project, region, zone)
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py
index bfd92cb0d7..8de86cdea5 100644
--- a/tools/duplicate-diff.py
+++ b/tools/duplicate-diff.py
@@ -41,16 +41,24 @@
         "community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf",
         "community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf",
         "community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf",
+        "community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf",
+        "community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf",
+        "community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf",
     ],
     [
        "community/modules/compute/gke-node-pool/threads_per_core_calc.tf",
        "modules/compute/vm-instance/threads_per_core_calc.tf",
     ],
-    [
+    [ # Slurm V5
        "community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf",
        "community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf",
        "community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf",
     ],
+    [ # Slurm V6
+       "community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf",
+       "community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf",
+       "community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf",
+    ],
     [
        "community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl",
        "community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl",
@@ -66,4 +74,4 @@
     for second in group[1:]:
         if not filecmp.cmp(first, second): # true if files are the same
             print(f'found diff between {first} and {second}')
-            sys.exit(1)
+            sys.exit(1)
diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf
index d941c2f5c6..0041e709ee 100644
--- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf
+++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf
@@ -20,11 +20,11 @@ terraform {
   required_providers {
google = { source = "hashicorp/google" - version = "~> 4.83.0" + version = "~> 4.84.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.83.0" + version = "~> 4.84.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index d941c2f5c6..0041e709ee 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.83.0" + version = "~> 4.84.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.83.0" + version = "~> 4.84.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index d941c2f5c6..0041e709ee 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.83.0" + version = "~> 4.84.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.83.0" + version = "~> 4.84.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index d941c2f5c6..0041e709ee 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.83.0" + version = "~> 4.84.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.83.0" + version = "~> 4.84.0" } } } diff --git a/tools/validate_configs/test_configs/remote-desktop.yaml b/tools/validate_configs/test_configs/remote-desktop.yaml index 055c613e76..66e21fb5a8 100644 --- a/tools/validate_configs/test_configs/remote-desktop.yaml +++ b/tools/validate_configs/test_configs/remote-desktop.yaml @@ -34,7 +34,10 @@ deployment_groups: source: community/modules/remote-desktop/chrome-remote-desktop use: [network1] settings: - name_prefix: crd-default + name_prefix: crd-ubuntu + instance_image: + family: ubuntu-2204-lts + project: ubuntu-os-cloud # This module makes terraform wait until the remote-desktop setup is complete - id: wait-ubuntu @@ -45,10 +48,7 @@ deployment_groups: source: community/modules/remote-desktop/chrome-remote-desktop use: [network1] settings: - name_prefix: crd-debian - instance_image: - family: debian-11 - project: debian-cloud + name_prefix: crd-default # This module makes terraform wait until the remote-desktop setup is complete - id: wait-debian diff --git a/tools/validate_configs/test_configs/vm.yaml b/tools/validate_configs/test_configs/vm.yaml index b8e194d5c7..8de1ce67e2 100644 --- a/tools/validate_configs/test_configs/vm.yaml +++ b/tools/validate_configs/test_configs/vm.yaml @@ -31,10 +31,10 @@ deployment_groups: source: modules/network/pre-existing-vpc - source: ./modules/compute/vm-instance - id: compute_instances + id: compute_instances_family use: [network1] settings: - name_prefix: client-vm + name_prefix: client-vm-family instance_count: 1 machine_type: n2-standard-2 
instance_image: @@ -46,3 +46,14 @@ deployment_groups: # --source-image-project=ubuntu-os-cloud --family myubuntu --project # project: $(vars.project_id) # family: myubuntu + + - source: ./modules/compute/vm-instance + id: compute_instances_name + use: [network1] + settings: + name_prefix: client-vm-name + instance_count: 1 + machine_type: n2-standard-2 + instance_image: + project: ubuntu-os-cloud + name: ubuntu-2004-focal-v20231101
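
Two of the changes above lean on quoting that is easy to misread. The reworked `deployment_vars_str` fact in `create_deployment_directory.yml` stacks YAML escaping, a Jinja2 filter chain, and later shell expansion. Setting the escaping layers aside, the filter chain itself is simple; below is a rough Python model of it, using a hypothetical `cli_deployment_vars` mapping (the extra backslash-escaping in the actual task appears to exist so the quotes survive the later shell processing):

```python
# Rough model of the Jinja2 filter chain in deployment_vars_str, ignoring
# the backslash-escaping layers. The sample mapping is hypothetical.
cli_deployment_vars = {"region": "us-west4", "zone": "us-west4-c"}

# map('join', '=') turns each (key, value) item into "key=value"
pairs = ["=".join(item) for item in cli_deployment_vars.items()]

# join('" --vars "') glues the pairs into one repeated-flag string
flags = '--vars "' + '" --vars "'.join(pairs) + '"'

print(flags)
# --vars "region=us-west4" --vars "zone=us-west4-c"
```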
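
Similarly, `check.py` builds its `--vars` flag with the f-string `=` specifier: `{region=}` expands to `region='<value>'`, quoting the value with `repr`. Because `_run_ghpc` joins everything into a single `shell=True` command line, the shell strips those quote pairs before `ghpc` parses the flag. A minimal standalone sketch with hypothetical values:

```python
# Demonstrates the f-string "=" specifier used by _check_zone in check.py;
# the region/zone values here are hypothetical.
region, zone = "us-central1", "us-central1-a"
flag = f"--vars='project_id=my-project,{region=},{zone=}'"

print(flag)
# --vars='project_id=my-project,region='us-central1',zone='us-central1-a''
```

When that string is evaluated by the shell, as it is under `shell=True`, the paired quotes cancel out and `ghpc` receives `--vars=project_id=my-project,region=us-central1,zone=us-central1-a`; running the command without a shell would leave the literal quotes in place.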