diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 3b5ad735c3..eb9008bdaa 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -21,7 +21,7 @@ updates: labels: - dependencies - go - - chore + - release-chore schedule: interval: weekly day: monday diff --git a/.github/release.yml b/.github/release.yml index 96ba64c330..e6757b0bd2 100644 --- a/.github/release.yml +++ b/.github/release.yml @@ -22,23 +22,18 @@ changelog: authors: [] categories: - title: Key New Features 🎉 - labels: - - release-key-new-features + labels: [release-key-new-features] - title: New Modules 🧱 - labels: - - release-new-modules - - title: Module Improvements 🛠 - labels: - - release-module-improvements - - title: Improvements - labels: - - release-improvements - - title: Deprecations - labels: - - release-deprecations - - title: Version Updates - labels: - - release-version-updates + labels: [release-new-modules] + - title: Module Improvements 🔨 + labels: [release-module-improvements] + - title: Improvements 🛠 + labels: [release-improvements] + - title: Deprecations 💤 + labels: [release-deprecations] + - title: Version Updates ⏫ + labels: [release-version-updates] + - title: Bug fixes 🐞 + labels: [release-bugfix] - title: Other changes - labels: - - "*" + labels: ["*"] diff --git a/.github/workflows/close-inactive-issues.yml b/.github/workflows/close-inactive-issues.yml new file mode 100644 index 0000000000..2c29346546 --- /dev/null +++ b/.github/workflows/close-inactive-issues.yml @@ -0,0 +1,36 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Close inactive issues +on: + schedule: + - cron: "30 1 * * *" + +jobs: + close-issues: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v5 + with: + days-before-issue-stale: 30 + days-before-issue-close: 14 + stale-issue-label: "stale" + stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." + close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." 
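+      # The -1 values below disable stale marking and auto-close for pull requests; this workflow acts on issues only.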
+ days-before-pr-stale: -1 + days-before-pr-close: -1 + repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 63cffedf1f..becedb8358 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ --- repos: - repo: https://github.com/antonbabenko/pre-commit-terraform - rev: v1.77.1 + rev: v1.82.0 hooks: - id: terraform_fmt - id: terraform_tflint @@ -75,12 +75,12 @@ repos: - id: go-critic args: [-disable, "#experimental,sloppyTypeAssert"] - repo: https://github.com/adrienverge/yamllint - rev: v1.29.0 + rev: v1.32.0 hooks: - id: yamllint args: [-c=.yamllint] - repo: https://github.com/jackdewinter/pymarkdown - rev: v0.9.9 + rev: v0.9.12 hooks: - id: pymarkdown # Rules at https://github.com/jackdewinter/pymarkdown/tree/main/docs/rules diff --git a/Makefile b/Makefile index 51079dad71..3a307f3118 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ ifneq (,$(wildcard .git)) ## GIT DIRECTORY EXISTS GIT_TAG_VERSION=$(shell git tag --points-at HEAD) GIT_BRANCH=$(shell git branch --show-current) -GIT_COMMIT_INFO=$(shell git describe --tags --dirty --long) +GIT_COMMIT_INFO=$(shell git describe --tags --dirty --long --always) GIT_COMMIT_HASH=$(shell git rev-parse HEAD) GIT_INITIAL_HASH=$(shell git rev-list --max-parents=0 HEAD) endif diff --git a/cmd/create.go b/cmd/create.go index eb4af24771..4f1afd7b6f 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -22,6 +22,7 @@ import ( "fmt" "hpc-toolkit/pkg/config" "hpc-toolkit/pkg/modulewriter" + "hpc-toolkit/pkg/validators" "log" "path/filepath" "strings" @@ -124,9 +125,43 @@ func expandOrDie(path string) config.DeploymentConfig { log.Fatal(renderError(err, ctx)) } + validateMaybeDie(dc.Config, ctx) return dc } +func validateMaybeDie(bp config.Blueprint, ctx config.YamlCtx) { + err := validators.Execute(bp) + if err == nil { + return + } + log.Println(renderError(err, ctx)) + + log.Println("One or more blueprint validators have failed. See messages above for suggested") + log.Println("actions. 
General troubleshooting guidance and instructions for configuring") + log.Println("validators are shown below.") + log.Println("") + log.Println("- https://goo.gle/hpc-toolkit-troubleshooting") + log.Println("- https://goo.gle/hpc-toolkit-validation") + log.Println("") + log.Println("Validators can be silenced or treated as warnings or errors:") + log.Println("") + log.Println("- https://goo.gle/hpc-toolkit-validation-levels") + log.Println("") + + switch bp.ValidationLevel { + case config.ValidationWarning: + { + log.Println("Validation failures were treated as a warning, continuing to create blueprint.") + log.Println("") + } + case config.ValidationError: + { + log.Fatal("validation failed due to the issues listed above") + } + } + +} + func findPos(path config.Path, ctx config.YamlCtx) (config.Pos, bool) { pos, ok := ctx.Pos(path) for !ok && path.Parent() != nil { @@ -137,30 +172,30 @@ func findPos(path config.Path, ctx config.YamlCtx) (config.Pos, bool) { } func renderError(err error, ctx config.YamlCtx) string { - var me config.Errors - if errors.As(err, &me) { + switch te := err.(type) { + case config.Errors: var sb strings.Builder - for _, e := range me.Errors { + for _, e := range te.Errors { sb.WriteString(renderError(e, ctx)) sb.WriteString("\n") } return sb.String() - } - - var be config.BpError - if errors.As(err, &be) { - if pos, ok := findPos(be.Path, ctx); ok { - return renderRichError(be.Err, pos, ctx) + case validators.ValidatorError: + return fmt.Sprintf("validator %q failed:\n%v\n", te.Validator, renderError(te.Err, ctx)) + case config.BpError: + if pos, ok := findPos(te.Path, ctx); ok { + return renderRichError(te.Err, pos, ctx) } + return renderError(te.Err, ctx) + default: + return err.Error() } - return err.Error() } func renderRichError(err error, pos config.Pos, ctx config.YamlCtx) string { pref := fmt.Sprintf("%d: ", pos.Line) arrow := strings.Repeat(" ", len(pref)+pos.Column-1) + "^" - return fmt.Sprintf(` -Error: %s + return fmt.Sprintf(`Error: %s %s%s %s`, err, pref, ctx.Lines[pos.Line-1], arrow) } diff --git a/cmd/create_test.go b/cmd/create_test.go index 528a0eb15f..b1a2e02e06 100644 --- a/cmd/create_test.go +++ b/cmd/create_test.go @@ -136,12 +136,12 @@ func (s *MySuite) TestRenderError(c *C) { got := renderError(err, config.YamlCtx{}) c.Check(got, Equals, "arbuz") } - { // has pos, but context is missing + { // has pos, but context doesn't contain it ctx := config.NewYamlCtx([]byte(``)) pth := config.Root.Vars.Dot("kale") err := config.BpError{Path: pth, Err: errors.New("arbuz")} got := renderError(err, ctx) - c.Check(got, Equals, "vars.kale: arbuz") + c.Check(got, Equals, "arbuz") } { // has pos, has context ctx := config.NewYamlCtx([]byte(` @@ -150,9 +150,16 @@ vars: pth := config.Root.Vars.Dot("kale") err := config.BpError{Path: pth, Err: errors.New("arbuz")} got := renderError(err, ctx) - c.Check(got, Equals, ` -Error: arbuz + c.Check(got, Equals, `Error: arbuz 3: kale: dos ^`) } } + +func (s *MySuite) TestValidateMaybeDie(c *C) { + bp := config.Blueprint{ + Validators: []config.Validator{{Validator: "invalid"}}, + ValidationLevel: config.ValidationWarning, + } + validateMaybeDie(bp, config.NewYamlCtx([]byte{})) // smoke test +} diff --git a/cmd/root.go b/cmd/root.go index dc2fe589eb..b15b347ba2 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -50,7 +50,7 @@ HPC deployments on the Google Cloud Platform.`, log.Fatalf("cmd.Help function failed: %s", err) } }, - Version: "v1.22.1", + Version: "v1.23.0", Annotations: annotation, } ) diff --git 
a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index 397ba3f600..186e73b113 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -22,6 +22,7 @@ vars: zone: us-central1-c disk_size_gb: 100 new_image_family: htcondor-10x + enable_shielded_vm: true # Documentation for each of the modules used below can be found at # https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml new file mode 100644 index 0000000000..db18855352 --- /dev/null +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -0,0 +1,114 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: starccm-on-slurm + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: starccm-slurm + region: us-central1 + zone: us-central1-c + +# Documentation for each of the modules used below can be found at +# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md + +deployment_groups: +- group: primary + modules: + # Source is an embedded resource, denoted by "resources/*" without ./, ../, / + # as a prefix. To refer to a local resource, prefix with ./, ../ or / + # Example - ./resources/network/vpc + - id: network1 + source: modules/network/vpc + + - id: homefs + source: modules/file-system/filestore + use: [network1] + settings: + local_mount: /home + + - id: login-script + kind: terraform + source: modules/scripts/startup-script + settings: + configure_ssh_host_patterns: ["star*"] + + - id: compute-script + source: modules/scripts/startup-script + settings: + configure_ssh_host_patterns: ["star*"] + runners: + - type: shell + content: | + #!/bin/bash + google_mpi_tuning --hpcthroughput + google_mpi_tuning --nomitigation + destination: /tmp/tune-mpi.sh + + - id: debug_node_group + source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + settings: + node_count_dynamic_max: 4 + machine_type: n2-standard-2 + + - id: debug_partition + source: community/modules/compute/schedmd-slurm-gcp-v5-partition + use: + - network1 + - homefs + - debug_node_group + - compute-script + settings: + partition_name: debug + is_default: true + + - id: compute_node_group + source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + settings: + bandwidth_tier: "gvnic_enabled" + disable_public_ips: false + machine_type: c2-standard-60 + node_count_dynamic_max: 20 + + - id: compute_partition + source: community/modules/compute/schedmd-slurm-gcp-v5-partition + use: + - network1 + - homefs + - compute_node_group + - compute-script + settings: + partition_name: compute + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller + use: + - network1 + - debug_partition + - compute_partition + - homefs + settings: + disable_controller_public_ips: true + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v5-login 
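+ # `use` wires matching outputs of the listed modules (network, controller registration, login startup script) into this module's settings.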
+ use: + - network1 + - slurm_controller + - login-script + settings: + machine_type: n2-standard-4 + disable_login_public_ips: true diff --git a/community/examples/tutorial-starccm.yaml b/community/examples/tutorial-starccm.yaml index 3ceb3a7fed..db86f35518 100644 --- a/community/examples/tutorial-starccm.yaml +++ b/community/examples/tutorial-starccm.yaml @@ -29,6 +29,7 @@ deployment_groups: kind: terraform id: startup settings: + configure_ssh_host_patterns: ["star*"] runners: - type: shell content: | diff --git a/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/versions.tf b/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/versions.tf index c0e669f897..c5d78daa00 100644 --- a/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/versions.tf +++ b/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/versions.tf @@ -21,6 +21,7 @@ terraform { version = ">= 3.54" } random = { + source = "hashicorp/random" version = ">= 3.0" } } @@ -28,7 +29,6 @@ terraform { required_version = ">= 0.12.31" } - provider "google" { project = var.project region = var.region diff --git a/community/front-end/ofe/infrastructure_files/workbench_tf/google/versions.tf b/community/front-end/ofe/infrastructure_files/workbench_tf/google/versions.tf index 5bdbad9b22..e45c71e961 100644 --- a/community/front-end/ofe/infrastructure_files/workbench_tf/google/versions.tf +++ b/community/front-end/ofe/infrastructure_files/workbench_tf/google/versions.tf @@ -18,8 +18,17 @@ terraform { required_version = "~> 1.0" required_providers { - google = ">= 3.87.0" - random = ">= 2.0" - time = ">= 0.7.2" + google = { + source = "hashicorp/google" + version = ">= 3.87.0" + } + random = { + source = "hashicorp/random" + version = ">= 2.0" + } + time = { + source = "hashicorp/time" + version = ">= 0.7.2" + } } } diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 42f400bf52..f3bb16d205 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -2,7 +2,8 @@ archspec==0.2.1 argcomplete==3.1.1 asgiref==3.7.2 astroid==2.15.5 -backports.zoneinfo==0.2.1 +# This should be supported by zoneinfo in Python 3.9+ +backports.zoneinfo==0.2.1;python_version<"3.9" cachetools==5.3.1 certifi==2023.07.22 cffi==1.15.1 diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py index 96d9624610..05f603fb3a 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py @@ -56,8 +56,16 @@ def _get_arch_for_node_type_gcp(instance): - (family, group, _) = instance.split("-") - return gcp_machine_table[family][group] + try: + logger.info(instance.split("-")) + family, group, _ = instance.split("-", maxsplit=2) + return gcp_machine_table[family][group] + except ValueError: + logger.error(f"Invalid instance format: {instance}") + return None + except KeyError: + logger.error(f"Keys not found in gcp_machine_table: {instance}") + return None def _get_gcp_client(credentials, service="compute", api_version="v1"): diff --git a/community/front-end/ofe/website/ghpcfe/templates/image/list.html b/community/front-end/ofe/website/ghpcfe/templates/image/list.html index 1b1b25e1fb..3a23e4dce2 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/image/list.html +++ b/community/front-end/ofe/website/ghpcfe/templates/image/list.html @@ -106,7 +106,7 @@

Images
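{# The edit below fixes a template syntax error: a bare '=' is not a valid operator in a Django if tag; '==' performs the equality test. #}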

View - {% if admin_view == 1 and image.status == "r" or image.status = "e" %} + {% if admin_view == 1 and image.status == "r" or image.status == "e" %} {% endif %} diff --git a/community/modules/compute/gke-node-pool/versions.tf b/community/modules/compute/gke-node-pool/versions.tf index c84178ca2e..e19ab6165c 100644 --- a/community/modules/compute/gke-node-pool/versions.tf +++ b/community/modules/compute/gke-node-pool/versions.tf @@ -26,6 +26,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.23.0" } } diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 5bfd4dfcf9..b7052bb071 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -196,6 +196,7 @@ limitations under the License. | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `100` | no | | [distribution\_policy\_target\_shape](#input\_distribution\_policy\_target\_shape) | Target shape across zones for instance group managing execute points | `string` | `"ANY"` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | +| [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration (var.shielded\_instance\_config). | `bool` | `false` | no | | [execute\_point\_runner](#input\_execute\_point\_runner) | A list of Toolkit runners for configuring an HTCondor execute point | `list(map(string))` | `[]` | no | | [execute\_point\_service\_account\_email](#input\_execute\_point\_service\_account\_email) | Service account for HTCondor execute point (e-mail format) | `string` | n/a | yes | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
<pre>list(object({<br>  type = string,<br>  count = number<br>}))</pre>
| `[]` | no | @@ -212,6 +213,7 @@ limitations under the License. | [project\_id](#input\_project\_id) | Project in which the HTCondor execute points will be created | `string` | n/a | yes | | [region](#input\_region) | The region in which HTCondor execute points will be created | `string` | n/a | yes | | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes by which to limit service account attached to central manager. | `set(string)` |
<pre>[<br>  "https://www.googleapis.com/auth/cloud-platform"<br>]</pre>
| no | +| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance (must set var.enable\_shielded\_vm) |
<pre>object({<br>  enable_secure_boot = bool<br>  enable_vtpm = bool<br>  enable_integrity_monitoring = bool<br>})</pre> |
{ "enable_integrity_monitoring": true, "enable_secure_boot": true, "enable_vtpm": true
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork HTCondor execute points will join | `string` | `null` | no | | [target\_size](#input\_target\_size) | Initial size of the HTCondor execute point pool; set to null (default) to avoid Terraform management of size. | `number` | `null` | no | diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 45f3b4891b..c1ac1a81f1 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -69,6 +69,27 @@ locals { ]) } + native_fstype = [] + startup_script_network_storage = [ + for ns in var.network_storage : + ns if !contains(local.native_fstype, ns.fs_type) + ] + storage_client_install_runners = [ + for ns in local.startup_script_network_storage : + ns.client_install_runner if ns.client_install_runner != null + ] + mount_runners = [ + for ns in local.startup_script_network_storage : + ns.mount_runner if ns.mount_runner != null + ] + + all_runners = concat( + local.storage_client_install_runners, + local.mount_runners, + var.execute_point_runner, + [local.execute_runner], + ) + execute_config_windows_startup_ps1 = templatefile( "${path.module}/templates/download-condor-config.ps1.tftpl", { @@ -110,7 +131,7 @@ module "startup_script" { labels = local.labels deployment_name = var.deployment_name - runners = flatten([var.execute_point_runner, local.execute_runner]) + runners = local.all_runners } module "execute_point_instance_template" { @@ -134,6 +155,10 @@ module "execute_point_instance_template" { startup_script = local.is_windows_image ? null : module.startup_script.startup_script metadata = local.metadata source_image = data.google_compute_image.htcondor.self_link + + # secure boot + enable_shielded_vm = var.enable_shielded_vm + shielded_instance_config = var.shielded_instance_config } module "mig" { diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf index b21dbf5424..d28d77eadf 100644 --- a/community/modules/compute/htcondor-execute-point/variables.tf +++ b/community/modules/compute/htcondor-execute-point/variables.tf @@ -201,3 +201,24 @@ variable "name_prefix" { error_message = "var.name_prefix must be a set to a non-empty string and must also be unique across all instances of htcondor-execute-point" } } + +variable "enable_shielded_vm" { + type = bool + default = false + description = "Enable the Shielded VM configuration (var.shielded_instance_config)." 
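+  # Gates var.shielded_instance_config: the execute point instance template applies that object only when this flag is true.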
+} + +variable "shielded_instance_config" { + description = "Shielded VM configuration for the instance (must set var.enable_shielded_vm)" + type = object({ + enable_secure_boot = bool + enable_vtpm = bool + enable_integrity_monitoring = bool + }) + + default = { + enable_secure_boot = true + enable_vtpm = true + enable_integrity_monitoring = true + } +} diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index 1cd0afb55f..510ca260c9 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.23.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf index 0e93805a3b..a1070949d8 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 +# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.6 variable "project_id" { description = "Project in which the HPC deployment will be created." diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index 37df912274..6dd182b7d2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.23.0" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md index 92ceb8e686..a6c3c787df 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md @@ -69,7 +69,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.7.5 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.7.6 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf index 889cfb4899..ae5ece8192 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf @@ -29,7 +29,7 @@ locals { } module "slurm_partition" { - source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.7.5" + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.7.6" slurm_cluster_name = local.slurm_cluster_name enable_job_exclusive = var.exclusive diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf index 0733a68d96..167a67663c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 +# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.6 variable "deployment_name" { description = "Name of the deployment." diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index c952ee209e..0e5985d9cc 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -146,7 +146,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.7.5 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.7.6 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf index e3a2179ba6..fa5fb35659 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf @@ -38,7 +38,7 @@ data "google_compute_zones" "available" { } module "slurm_partition" { - source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.7.5" + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.7.6" slurm_cluster_name = local.slurm_cluster_name partition_nodes = var.node_groups diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf index 628d2e7f79..ce55637560 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 +# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.6 variable "deployment_name" { description = "Name of the deployment." diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index 1f51ba0b9a..8385a3e851 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.23.0" } required_version = ">= 0.13.0" } diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 8d35b0317d..c9ce4c81d2 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -30,10 +30,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.23.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.23.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index cf2059427f..28b94f4bd9 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = 
"blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.23.0" } required_version = ">= 0.14.0" } diff --git a/community/modules/file-system/gke-persistent-volume/versions.tf b/community/modules/file-system/gke-persistent-volume/versions.tf index 4f80217e08..e1eff55c38 100644 --- a/community/modules/file-system/gke-persistent-volume/versions.tf +++ b/community/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.23.0" } } diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index 10f3adc7b1..33d42a9a1f 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.23.0" } required_version = ">= 0.14.0" diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index 5f8c9f18b6..06384fe78d 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.23.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf index 0a9090ac16..5f6b92545d 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-controller/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-controller/v1.23.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf index d68381c6a4..090a7e3bff 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-login-node/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-login-node/v1.23.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/gke-cluster/versions.tf b/community/modules/scheduler/gke-cluster/versions.tf index 6659641f75..f8144de3b1 100644 --- a/community/modules/scheduler/gke-cluster/versions.tf +++ b/community/modules/scheduler/gke-cluster/versions.tf @@ -26,6 +26,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.23.0" } } diff --git 
a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index f4458ace78..c63decb33f 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -91,6 +91,7 @@ limitations under the License. | [enable\_high\_availability](#input\_enable\_high\_availability) | Provision HTCondor access point in high availability mode | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | | [enable\_public\_ips](#input\_enable\_public\_ips) | Enable Public IPs on the access points | `bool` | `false` | no | +| [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration (var.shielded\_instance\_config). | `bool` | `false` | no | | [htcondor\_bucket\_name](#input\_htcondor\_bucket\_name) | Name of HTCondor configuration bucket | `string` | n/a | yes | | [instance\_image](#input\_instance\_image) | Custom VM image with HTCondor and Toolkit support installed. |
<pre>object({<br>  family = string,<br>  project = string<br>})</pre>
| n/a | yes | | [labels](#input\_labels) | Labels to add to resources. List key, value pairs. | `map(string)` | n/a | yes | @@ -102,6 +103,7 @@ limitations under the License. | [project\_id](#input\_project\_id) | Project in which HTCondor pool will be created | `string` | n/a | yes | | [region](#input\_region) | Default region for creating resources | `string` | n/a | yes | | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes by which to limit service account attached to central manager. | `set(string)` |
<pre>[<br>  "https://www.googleapis.com/auth/cloud-platform"<br>]</pre>
| no | +| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance (must set var.enable\_shielded\_vm) |
<pre>object({<br>  enable_secure_boot = bool<br>  enable_vtpm = bool<br>  enable_integrity_monitoring = bool<br>})</pre> | <pre>{<br>  "enable_integrity_monitoring": true,<br>  "enable_secure_boot": true,<br>  "enable_vtpm": true<br>}</pre>
| no | | [spool\_parent\_dir](#input\_spool\_parent\_dir) | HTCondor access point configuration SPOOL will be set to subdirectory named "spool" | `string` | `"/var/lib/condor"` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork in which the HTCondor central manager will be created. | `string` | `null` | no | | [zones](#input\_zones) | Zone(s) in which access point may be created. If not supplied, will default to all zones in var.region. | `list(string)` | `[]` | no | diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 562113a136..dc1d7318bf 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -47,7 +47,23 @@ locals { EOT } + native_fstype = [] + startup_script_network_storage = [ + for ns in var.network_storage : + ns if !contains(local.native_fstype, ns.fs_type) + ] + storage_client_install_runners = [ + for ns in local.startup_script_network_storage : + ns.client_install_runner if ns.client_install_runner != null + ] + mount_runners = [ + for ns in local.startup_script_network_storage : + ns.mount_runner if ns.mount_runner != null + ] + all_runners = concat( + local.storage_client_install_runners, + local.mount_runners, var.access_point_runner, [local.schedd_runner], var.autoscaler_runner, @@ -156,6 +172,10 @@ module "access_point_instance_template" { startup_script = module.startup_script.startup_script metadata = local.metadata source_image = data.google_compute_image.htcondor.self_link + + # secure boot + enable_shielded_vm = var.enable_shielded_vm + shielded_instance_config = var.shielded_instance_config } module "htcondor_ap" { diff --git a/community/modules/scheduler/htcondor-access-point/variables.tf b/community/modules/scheduler/htcondor-access-point/variables.tf index 2ba01eefbe..3d9710459b 100644 --- a/community/modules/scheduler/htcondor-access-point/variables.tf +++ b/community/modules/scheduler/htcondor-access-point/variables.tf @@ -181,3 +181,24 @@ variable "default_mig_id" { default = "" nullable = false } + +variable "enable_shielded_vm" { + type = bool + default = false + description = "Enable the Shielded VM configuration (var.shielded_instance_config)." 
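+  # Gates var.shielded_instance_config: the access point instance template applies that object only when this flag is true.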
+} + +variable "shielded_instance_config" { + description = "Shielded VM configuration for the instance (must set var.enable_shielded_vm)" + type = object({ + enable_secure_boot = bool + enable_vtpm = bool + enable_integrity_monitoring = bool + }) + + default = { + enable_secure_boot = true + enable_vtpm = true + enable_integrity_monitoring = true + } +} diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index ff86b0360b..4475102005 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.23.0" } required_version = ">= 1.1 diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index d138ed2b98..a946b0c985 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -122,6 +122,7 @@ limitations under the License. | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `20` | no | | [distribution\_policy\_target\_shape](#input\_distribution\_policy\_target\_shape) | Target shape for instance group managing high availability of central manager | `string` | `"BALANCED"` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | +| [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration (var.shielded\_instance\_config). | `bool` | `false` | no | | [htcondor\_bucket\_name](#input\_htcondor\_bucket\_name) | Name of HTCondor configuration bucket | `string` | n/a | yes | | [instance\_image](#input\_instance\_image) | Custom VM image with HTCondor installed using the htcondor-install module. |
<pre>object({<br>  family = string,<br>  project = string<br>})</pre>
| n/a | yes | | [labels](#input\_labels) | Labels to add to resources. List key, value pairs. | `map(string)` | n/a | yes | @@ -132,6 +133,7 @@ limitations under the License. | [project\_id](#input\_project\_id) | Project in which HTCondor central manager will be created | `string` | n/a | yes | | [region](#input\_region) | Default region for creating resources | `string` | n/a | yes | | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes by which to limit service account attached to central manager. | `set(string)` |
<pre>[<br>  "https://www.googleapis.com/auth/cloud-platform"<br>]</pre>
| no | +| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance (must set var.enable\_shielded\_vm) |
<pre>object({<br>  enable_secure_boot = bool<br>  enable_vtpm = bool<br>  enable_integrity_monitoring = bool<br>})</pre> | <pre>{<br>  "enable_integrity_monitoring": true,<br>  "enable_secure_boot": true,<br>  "enable_vtpm": true<br>}</pre>
| no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork in which the HTCondor central manager will be created. | `string` | `null` | no | | [zones](#input\_zones) | Zone(s) in which central manager may be created. If not supplied, will default to all zones in var.region. | `list(string)` | `[]` | no | diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index f9e07535be..f34890416b 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -41,7 +41,27 @@ locals { "-e config_object=${local.cm_object}", ]) } - all_runners = flatten([var.central_manager_runner, local.schedd_runner]) + + native_fstype = [] + startup_script_network_storage = [ + for ns in var.network_storage : + ns if !contains(local.native_fstype, ns.fs_type) + ] + storage_client_install_runners = [ + for ns in local.startup_script_network_storage : + ns.client_install_runner if ns.client_install_runner != null + ] + mount_runners = [ + for ns in local.startup_script_network_storage : + ns.mount_runner if ns.mount_runner != null + ] + + all_runners = concat( + local.storage_client_install_runners, + local.mount_runners, + var.central_manager_runner, + [local.schedd_runner] + ) central_manager_ips = [data.google_compute_instance.cm.network_interface[0].network_ip] central_manager_name = data.google_compute_instance.cm.name @@ -119,6 +139,10 @@ module "central_manager_instance_template" { startup_script = module.startup_script.startup_script metadata = local.metadata source_image = data.google_compute_image.htcondor.self_link + + # secure boot + enable_shielded_vm = var.enable_shielded_vm + shielded_instance_config = var.shielded_instance_config } module "htcondor_cm" { diff --git a/community/modules/scheduler/htcondor-central-manager/variables.tf b/community/modules/scheduler/htcondor-central-manager/variables.tf index 8ea3f605fb..6750f97eba 100644 --- a/community/modules/scheduler/htcondor-central-manager/variables.tf +++ b/community/modules/scheduler/htcondor-central-manager/variables.tf @@ -134,3 +134,24 @@ variable "htcondor_bucket_name" { description = "Name of HTCondor configuration bucket" type = string } + +variable "enable_shielded_vm" { + type = bool + default = false + description = "Enable the Shielded VM configuration (var.shielded_instance_config)." 
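+  # Gates var.shielded_instance_config: the central manager instance template applies that object only when this flag is true.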
+} + +variable "shielded_instance_config" { + description = "Shielded VM configuration for the instance (must set var.enable_shielded_vm)" + type = object({ + enable_secure_boot = bool + enable_vtpm = bool + enable_integrity_monitoring = bool + }) + + default = { + enable_secure_boot = true + enable_vtpm = true + enable_integrity_monitoring = true + } +} diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index 2be15d5a9a..16b6d0ecb8 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.23.0" } required_version = ">= 1.1.0 diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 9e86b7175c..9f05d8ba4c 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.23.0" } required_version = ">= 0.13.0 diff --git a/community/modules/scheduler/htcondor-setup/versions.tf b/community/modules/scheduler/htcondor-setup/versions.tf index 0d39b2386c..b31c017ffe 100644 --- a/community/modules/scheduler/htcondor-setup/versions.tf +++ b/community/modules/scheduler/htcondor-setup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-setup/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-setup/v1.23.0" } required_version = ">= 0.13.0 diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 0b06026a76..c760befa1b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -17,14 +17,14 @@ controller for optimal performance at different scales. > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.5/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.6/scripts/requirements.txt > ``` -[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 -[slurm\_controller\_instance]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6 +[slurm\_controller\_instance]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. 
-[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/scripts/requirements.txt +[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/5.7.6/scripts/requirements.txt [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions [enable\_reconfigure]: #input\_enable\_reconfigure @@ -94,12 +94,12 @@ This option has some additional requirements: development environment deploying the cluster. One can use following commands: ```bash - pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.5/scripts/requirements.txt + pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.6/scripts/requirements.txt ``` For more information, see the [description][optdeps] of this module. -[optdeps]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/terraform/slurm_cluster#optional +[optdeps]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6/terraform/slurm_cluster#optional ## Custom Images @@ -163,8 +163,8 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.7.5 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.7.5 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.7.6 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.7.6 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index ce5b4e6af8..0df07924e7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -54,7 +54,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_controller_instance" { - source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.7.5" + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.7.6" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name @@ -90,7 +90,7 @@ module "slurm_controller_instance" { } module "slurm_controller_template" { - source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.7.5" + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.7.6" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index 9ee056ea84..36bcfc3c2e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 +# github repository: 
https://github.com/SchedMD/slurm-gcp/tree/5.7.6 variable "access_config" { description = "Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet." diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index 934f813440..d06d410a1a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.23.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index 22b8f0ab82..1660c4cea7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -38,7 +38,7 @@ manually. This will require addition configuration and verification of permissions. For more information see the [hybrid.md] documentation on [slurm-gcp]. -[slurm-controller-hybrid]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/terraform/slurm_cluster/modules/slurm_controller_hybrid +[slurm-controller-hybrid]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6/terraform/slurm_cluster/modules/slurm_controller_hybrid > **_NOTE:_** The hybrid module requires the following dependencies to be > installed on the system deploying the module: @@ -58,15 +58,15 @@ permissions. For more information see the [hybrid.md] documentation on [pyyaml]: https://pypi.org/project/PyYAML/ [google-api-python-client]: https://pypi.org/project/google-api-python-client/ [google-cloud-pubsub]: https://pypi.org/project/google-cloud-pubsub/ -[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/scripts/requirements.txt +[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/5.7.6/scripts/requirements.txt ### Manual Configuration This module *does not* complete the installation of hybrid partitions on your slurm cluster. After deploying, you must follow the steps listed out in the [hybrid.md] documentation under [manual steps]. -[hybrid.md]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/docs/hybrid.md -[manual steps]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/docs/hybrid.md#manual-configurations +[hybrid.md]: https://github.com/SchedMD/slurm-gcp/blob/5.7.6/docs/hybrid.md +[manual steps]: https://github.com/SchedMD/slurm-gcp/blob/5.7.6/docs/hybrid.md#manual-configurations ### Example Usage The hybrid module can be added to a blueprint as follows: @@ -146,10 +146,10 @@ strongly advise only using versions 21 or 22 when using this module. Attempting to use this module with any version older than 21 may lead to unexpected results. -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 +[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6 [pre-existing-network-storage]: ../../../../modules/file-system/pre-existing-network-storage/ [schedmd-slurm-gcp-v5-partition]: ../../compute/schedmd-slurm-gcp-v5-partition/ -[packer templates]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/packer +[packer templates]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6/packer ## License @@ -181,7 +181,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.7.5 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.7.6 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf index 6d73c71fe9..fcfcdcf25e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf @@ -28,7 +28,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.7.5" + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.7.6" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 9b72d416ad..88acea06ca 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). -[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 -[slurm\_login\_instance]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6 +[slurm\_login\_instance]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -49,8 +49,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6 +[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6#slurm-on-google-cloud-platform ## License @@ -85,8 +85,8 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.7.5 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.7.5 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.7.6 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.7.6 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf index d4d9e849c1..b7e2f5e208 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf @@ -50,7 +50,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_login_template" { - source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.7.5" + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.7.6" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward @@ -88,7 +88,7 @@ module "slurm_login_template" { } module "slurm_login_instance" { - source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.7.5" + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.7.6" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf index fa7cfb8135..e062a210cd 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 +# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.6 variable "project_id" { type = string diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index c506d414fe..4d83e1e6c8 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.23.0" } required_version = ">= 1.1" } diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index 5dd60e2b62..09ba703f44 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.23.0" } required_version = ">= 
0.14.0" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index b8a6d4d1a2..65f5a12a12 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.23.0" } required_version = ">= 0.14.0" diff --git a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md index 8d4dae6ca9..8c84279450 100644 --- a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md +++ b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md @@ -22,7 +22,7 @@ for use with an on-premise slurm-cluster. > further testing is done, documentation on applying the hybrid module to > on-premise slurm clusters will be added and expanded. -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 +[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6 ## Definitions diff --git a/docs/hybrid-slurm-cluster/deploy-instructions.md b/docs/hybrid-slurm-cluster/deploy-instructions.md index 4426b24433..8f08adfab7 100644 --- a/docs/hybrid-slurm-cluster/deploy-instructions.md +++ b/docs/hybrid-slurm-cluster/deploy-instructions.md @@ -260,8 +260,8 @@ sudo systemctl restart slurmctld If the restart did not succeed, the logs at `/var/log/slurm/slurmctld.log` should point you in the right direction. -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 -[slurm-gcp-hybrid]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/docs/hybrid.md +[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6 +[slurm-gcp-hybrid]: https://github.com/SchedMD/slurm-gcp/blob/5.7.6/docs/hybrid.md [demo-with-cloud-controller-instructions.md]: ./demo-with-cloud-controller-instructions.md ## Validate the Hybrid Cluster diff --git a/docs/hybrid-slurm-cluster/on-prem-instructions.md b/docs/hybrid-slurm-cluster/on-prem-instructions.md index a07ef8fc49..7ed917b810 100644 --- a/docs/hybrid-slurm-cluster/on-prem-instructions.md +++ b/docs/hybrid-slurm-cluster/on-prem-instructions.md @@ -39,9 +39,9 @@ detail, as well as how to customize many of these assumptions to fit your needs. deployments in their [hybrid.md] documentation. 
[hybridmodule]: ../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 +[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6 [slurm\_controller\_hybrid]: https://github.com/SchedMD/slurm-gcp/tree/master/terraform/slurm_cluster/modules/slurm_controller_hybrid -[hybrid.md]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/docs/hybrid.md +[hybrid.md]: https://github.com/SchedMD/slurm-gcp/blob/5.7.6/docs/hybrid.md ### NFS Mounts @@ -235,12 +235,12 @@ image created with slurm 21.08.8: partition_name: compute ``` -[slurmgcppacker]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/packer -[example.pkrvars.hcl]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/packer/example.pkrvars.hcl -[slurmversion]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/packer/variables.pkr.hcl#L97 -[`service_account_scopes`]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/packer/variables.pkr.hcl#L166 -[`munge_user`]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/ansible/roles/munge/defaults/main.yml#L17 -[`slurm_user`]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/ansible/roles/slurm/defaults/main.yml#L31 +[slurmgcppacker]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6/packer +[example.pkrvars.hcl]: https://github.com/SchedMD/slurm-gcp/tree/5.7.6/packer/example.pkrvars.hcl +[slurmversion]: https://github.com/SchedMD/slurm-gcp/blob/5.7.6/packer/variables.pkr.hcl#L97 +[`service_account_scopes`]: https://github.com/SchedMD/slurm-gcp/blob/5.7.6/packer/variables.pkr.hcl#L166 +[`munge_user`]: https://github.com/SchedMD/slurm-gcp/blob/5.7.6/ansible/roles/munge/defaults/main.yml#L17 +[`slurm_user`]: https://github.com/SchedMD/slurm-gcp/blob/5.7.6/ansible/roles/slurm/defaults/main.yml#L31 ## On Premise Setup diff --git a/docs/image-building.md b/docs/image-building.md index 1d80b7c9a3..a3d8ed5a1a 100644 --- a/docs/image-building.md +++ b/docs/image-building.md @@ -168,7 +168,7 @@ deployment_groups: - group: packer modules: - id: custom-image - source: github.com/SchedMD/slurm-gcp//packer?ref=5.7.5&depth=1 + source: github.com/SchedMD/slurm-gcp//packer?ref=5.7.6&depth=1 kind: packer settings: use_iap: true diff --git a/docs/tutorials/README.md b/docs/tutorials/README.md index 8b7cf5d34d..2705112de3 100644 --- a/docs/tutorials/README.md +++ b/docs/tutorials/README.md @@ -72,3 +72,9 @@ graph TB A --> G F --> G ``` + +### Qwiklabs Tutorial + +The [hpc-slurm-qwiklabs.yaml](hpc-slurm-qwiklabs.yaml) blueprint is meant for +use in Qwiklabs tutorials and uses machine types that are compatible with +Qwiklabs. diff --git a/docs/tutorials/hpc-slurm-qwiklabs.yaml b/docs/tutorials/hpc-slurm-qwiklabs.yaml new file mode 100644 index 0000000000..6034345636 --- /dev/null +++ b/docs/tutorials/hpc-slurm-qwiklabs.yaml @@ -0,0 +1,85 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+---
+
+blueprint_name: hpc-slurm-qwiklab
+
+vars:
+  project_id: ## Set GCP Project ID Here ##
+  deployment_name: hpc-qwiklab
+  region: us-central1
+  zone: us-central1-a
+
+# Documentation for each of the modules used below can be found at
+# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md
+
+deployment_groups:
+- group: primary
+  modules:
+  # Source is an embedded module, denoted by "modules/*" without ./, ../, /
+  # as a prefix. To refer to a local module, prefix with ./, ../ or /
+  # Example - ./modules/network/vpc
+  - id: network1
+    source: modules/network/vpc
+
+  - id: debug_node_group
+    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
+    settings:
+      node_count_dynamic_max: 4
+      machine_type: n2d-standard-2
+
+  - id: debug_partition
+    source: community/modules/compute/schedmd-slurm-gcp-v5-partition
+    use:
+    - network1
+    - debug_node_group
+    settings:
+      partition_name: debug
+      exclusive: false  # allows nodes to stay up after jobs are done
+      enable_placement: false  # the default is: true
+      is_default: true
+
+  - id: compute_node_group
+    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
+    settings:
+      node_count_dynamic_max: 20
+      machine_type: c2-standard-30
+
+  - id: compute_partition
+    source: community/modules/compute/schedmd-slurm-gcp-v5-partition
+    use:
+    - network1
+    - compute_node_group
+    settings:
+      partition_name: compute
+
+  - id: slurm_controller
+    source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller
+    use:
+    - network1
+    - debug_partition
+    - compute_partition
+    settings:
+      machine_type: n2d-standard-4
+      disable_controller_public_ips: false
+
+  - id: slurm_login
+    source: community/modules/scheduler/schedmd-slurm-gcp-v5-login
+    use:
+    - network1
+    - slurm_controller
+    settings:
+      machine_type: n2d-standard-4
+      disable_login_public_ips: false
diff --git a/examples/README.md b/examples/README.md
index 9fdb0004a4..9e026cfd77 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -33,6 +33,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /"
   * [storage-gke](#storage-gkeyaml--) ![community-badge] ![experimental-badge]
   * [htc-slurm.yaml](#htc-slurmyaml--) ![community-badge] ![experimental-badge]
   * [htc-htcondor.yaml](#htc-htcondoryaml--) ![community-badge] ![experimental-badge]
+  * [tutorial-starccm-slurm.yaml](#tutorial-starccm-slurmyaml--) ![community-badge] ![experimental-badge]
   * [tutorial-starccm.yaml](#tutorial-starccmyaml--) ![community-badge] ![experimental-badge]
   * [hpc-slurm-ramble-gromacs.yaml](#hpc-slurm-ramble-gromacsyaml--) ![community-badge] ![experimental-badge]
   * [hpc-slurm-chromedesktop.yaml](#hpc-slurm-chromedesktopyaml--) ![community-badge] ![experimental-badge]
@@ -120,7 +121,7 @@ the experimental badge (![experimental-badge]).
 >
 > ```shell
 > # Install Python3 and run
-> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.5/scripts/requirements.txt
+> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.6/scripts/requirements.txt
 > ```
 
 Creates a basic auto-scaling Slurm cluster with mostly default settings. The
@@ -257,7 +258,7 @@ to 256
 ### [ml-slurm.yaml] ![core-badge]
 
 This blueprint provisions an HPC cluster running the Slurm scheduler with the
-machine learning frameworks [PyTorch] and [TensorFlow] pre-installed on every
+machine learning frameworks PyTorch and TensorFlow pre-installed on every
The cluster has 2 partitions: * [A2 family VMs][a2] with the NVIDIA A100 GPU accelerator @@ -294,8 +295,7 @@ sbatch -N 1 torch_test.sh When you are done, clean up the resources in reverse order of creation: ```text -terraform -chdir=ml-example/cluster destroy -terraform -chdir=ml-example/primary destroy +./ghpc destroy ml-example ``` Finally, browse to the [Cloud Console][console-images] to delete your custom @@ -535,7 +535,7 @@ For this example the following is needed in the selected region: > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.5/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.6/scripts/requirements.txt > ``` Similar to the [hpc-slurm.yaml] example, but using Ubuntu 20.04 instead of CentOS 7. @@ -864,6 +864,15 @@ For more information see: [htc-slurm.yaml]: ../community/examples/htc-slurm.yaml +### [tutorial-starccm-slurm.yaml] ![community-badge] ![experimental-badge] + +This blueprint provisions an HPC cluster running Slurm for use with a Simcenter StarCCM+ +tutorial. + +> The main tutorial is described on the [HPC Toolkit website](https://cloud.google.com/hpc-toolkit/docs/simcenter-starccm-slurm/run-workload). + +[tutorial-starccm-slurm.yaml]: ../community/examples/tutorial-starccm-slurm.yaml + ### [tutorial-starccm.yaml] ![community-badge] ![experimental-badge] This blueprint provisions a simple cluster for use with a Simcenter StarCCM+ diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index bf0a90f3e7..a54e498d15 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -89,7 +89,7 @@ deployment_groups: use: [network1] settings: filestore_tier: BASIC_SSD - size_gb: 2560 # smallest size for BASIC_SSD + size_gb: 2560 # smallest size for BASIC_SSD local_mount: /home - id: projectsfs @@ -97,7 +97,7 @@ deployment_groups: use: [network1] settings: filestore_tier: HIGH_SCALE_SSD - size_gb: 10240 # smallest size for HIGH_SCALE_SSD + size_gb: 10240 # smallest size for HIGH_SCALE_SSD local_mount: /projects # This file system has an associated license cost. 
@@ -126,15 +126,17 @@ deployment_groups: use: [n2_node_group, network1, homefs, projectsfs, scratchfs] settings: partition_name: n2 - exclusive: false # allows nodes to stay up after jobs are done - enable_placement: false # the default is: true + exclusive: false # allows nodes to stay up after jobs are done + enable_placement: false # the default is: true is_default: true + partition_conf: + SuspendTime: 300 # time (in secs) the nodes in this partition stay active after their tasks have completed - id: c2_node_group source: community/modules/compute/schedmd-slurm-gcp-v5-node-group settings: node_count_dynamic_max: 20 - machine_type: c2-standard-60 # this is the default + machine_type: c2-standard-60 # this is the default instance_image: family: $(vars.family) project: $(vars.project) @@ -154,7 +156,7 @@ deployment_groups: settings: partition_name: c2 # the following two are true by default - exclusive: true # this must be true if enable_placement is true + exclusive: true # this must be true if enable_placement is true enable_placement: true - id: c2d_node_group @@ -212,6 +214,9 @@ deployment_groups: project: $(vars.project) disk_type: pd-ssd disk_size_gb: 100 + node_conf: + Sockets: 2 + CoresPerSocket: 24 service_account: email: $(compute_sa.service_account_email) scopes: @@ -227,6 +232,10 @@ deployment_groups: # This makes this partition look for machines in any of the following zones # https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/develop/community/modules/compute/schedmd-slurm-gcp-v5-partition#compute-vm-zone-policies zones: $(vars.gpu_zones) + # The following allows users to use more host memory without specifying cpus on a job + partition_conf: + DefMemPerGPU: 160000 + DefMemPerCPU: null - id: a2_16_node_group source: community/modules/compute/schedmd-slurm-gcp-v5-node-group @@ -239,6 +248,9 @@ deployment_groups: project: $(vars.project) disk_type: pd-ssd disk_size_gb: 100 + node_conf: + Sockets: 2 + CoresPerSocket: 24 service_account: email: $(compute_sa.service_account_email) scopes: @@ -254,13 +266,17 @@ deployment_groups: # This makes this partition look for machines in any of the following zones # https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/develop/community/modules/compute/schedmd-slurm-gcp-v5-partition#compute-vm-zone-policies zones: $(vars.gpu_zones) + # The following allows users to use more host memory without specifying cpus on a job + partition_conf: + DefMemPerGPU: 160000 + DefMemPerCPU: null - id: h3_node_group source: community/modules/compute/schedmd-slurm-gcp-v5-node-group settings: node_count_dynamic_max: 16 machine_type: h3-standard-88 - bandwidth_tier: gvnic_enabled # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_network + bandwidth_tier: gvnic_enabled # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_network instance_image: family: $(vars.family) project: $(vars.project) diff --git a/examples/ml-slurm.yaml b/examples/ml-slurm.yaml index f2dfe6ca9a..23f732832e 100644 --- a/examples/ml-slurm.yaml +++ b/examples/ml-slurm.yaml @@ -134,7 +134,7 @@ deployment_groups: omit_external_ip: false source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: schedmd-v5-slurm-22-05-9-debian-11 + source_image_family: slurm-gcp-5-7-debian-11 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: 
$(vars.disk_size_gb) diff --git a/go.mod b/go.mod index 490658af95..9171c555ff 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module hpc-toolkit go 1.18 require ( - cloud.google.com/go/compute v1.20.1 // indirect + cloud.google.com/go/compute v1.23.0 // indirect cloud.google.com/go/storage v1.30.1 // indirect github.com/go-git/go-git/v5 v5.8.1 github.com/hashicorp/go-getter v1.7.2 @@ -16,7 +16,7 @@ require ( github.com/spf13/cobra v1.7.0 github.com/zclconf/go-cty v1.13.2 golang.org/x/exp v0.0.0-20230108222341-4b8118a2686a - google.golang.org/genproto v0.0.0-20230706204954-ccb25ca9f130 // indirect + google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) @@ -26,7 +26,7 @@ require ( github.com/google/go-cmp v0.5.9 github.com/hashicorp/terraform-exec v0.18.1 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b - google.golang.org/api v0.134.0 + google.golang.org/api v0.138.0 ) require ( @@ -35,19 +35,20 @@ require ( github.com/hashicorp/terraform-json v0.15.0 // indirect github.com/rogpeppe/go-internal v1.9.0 // indirect golang.org/x/mod v0.9.0 // indirect + golang.org/x/sync v0.3.0 // indirect golang.org/x/tools v0.6.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20230706204954-ccb25ca9f130 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20230720185612-659f7aaaa771 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20230807174057-1744710a1577 // indirect ) require ( - cloud.google.com/go v0.110.4 // indirect + cloud.google.com/go v0.110.6 // indirect cloud.google.com/go/compute/metadata v0.2.3 // indirect - cloud.google.com/go/iam v1.1.0 // indirect + cloud.google.com/go/iam v1.1.1 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect github.com/ProtonMail/go-crypto v0.0.0-20230717121422-5aa5874ade95 // indirect github.com/acomagu/bufpipe v1.0.4 // indirect - github.com/agext/levenshtein v1.2.2 // indirect + github.com/agext/levenshtein v1.2.3 github.com/apparentlymart/go-textseg/v13 v13.0.0 // indirect github.com/aws/aws-sdk-go v1.44.122 // indirect github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d // indirect @@ -56,7 +57,7 @@ require ( github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.3 // indirect - github.com/google/s2a-go v0.1.4 // indirect + github.com/google/s2a-go v0.1.5 // indirect github.com/google/uuid v1.3.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.2.5 // indirect github.com/hashicorp/go-cleanhttp v0.5.2 // indirect @@ -80,14 +81,14 @@ require ( github.com/ulikunitz/xz v0.5.10 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.11.0 // indirect - golang.org/x/net v0.12.0 // indirect - golang.org/x/oauth2 v0.10.0 // indirect + golang.org/x/crypto v0.12.0 // indirect + golang.org/x/net v0.14.0 // indirect + golang.org/x/oauth2 v0.11.0 // indirect golang.org/x/sys v0.11.0 - golang.org/x/text v0.11.0 // indirect + golang.org/x/text v0.12.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.7 // indirect - google.golang.org/grpc v1.56.2 // indirect + google.golang.org/grpc v1.57.0 // indirect google.golang.org/protobuf v1.31.0 // indirect 
gopkg.in/warnings.v0 v0.1.2 // indirect ) diff --git a/go.sum b/go.sum index 808e6a9910..fa14e164c3 100644 --- a/go.sum +++ b/go.sum @@ -32,8 +32,8 @@ cloud.google.com/go v0.100.2/go.mod h1:4Xra9TjzAeYHrl5+oeLlzbM2k3mjVhZh4UqTZ//w9 cloud.google.com/go v0.102.0/go.mod h1:oWcCzKlqJ5zgHQt9YsaeTY9KzIvjyy0ArmiBUgpQ+nc= cloud.google.com/go v0.102.1/go.mod h1:XZ77E9qnTEnrgEOvr4xzfdX5TRo7fB4T2F4O6+34hIU= cloud.google.com/go v0.104.0/go.mod h1:OO6xxXdJyvuJPcEPBLN9BJPD+jep5G1+2U5B5gkRYtA= -cloud.google.com/go v0.110.4 h1:1JYyxKMN9hd5dR2MYTPWkGUgcoxVVhg0LKNKEo0qvmk= -cloud.google.com/go v0.110.4/go.mod h1:+EYjdK8e5RME/VY/qLCAtuyALQ9q67dvuum8i+H5xsI= +cloud.google.com/go v0.110.6 h1:8uYAkj3YHTP/1iwReuHPxLSbdcyc+dSBbzFMrVwDR6Q= +cloud.google.com/go v0.110.6/go.mod h1:+EYjdK8e5RME/VY/qLCAtuyALQ9q67dvuum8i+H5xsI= cloud.google.com/go/aiplatform v1.22.0/go.mod h1:ig5Nct50bZlzV6NvKaTwmplLLddFx0YReh9WfTO5jKw= cloud.google.com/go/aiplatform v1.24.0/go.mod h1:67UUvRBKG6GTayHKV8DBv2RtR1t93YRu5B1P3x99mYY= cloud.google.com/go/analytics v0.11.0/go.mod h1:DjEWCu41bVbYcKyvlws9Er60YE4a//bK6mnhWvQeFNI= @@ -70,8 +70,8 @@ cloud.google.com/go/compute v1.6.0/go.mod h1:T29tfhtVbq1wvAPo0E3+7vhgmkOYeXjhFvz cloud.google.com/go/compute v1.6.1/go.mod h1:g85FgpzFvNULZ+S8AYq87axRKuf2Kh7deLqV/jJ3thU= cloud.google.com/go/compute v1.7.0/go.mod h1:435lt8av5oL9P3fv1OEzSbSUe+ybHXGMPQHHZWZxy9U= cloud.google.com/go/compute v1.10.0/go.mod h1:ER5CLbMxl90o2jtNbGSbtfOpQKR0t15FOtRsugnLrlU= -cloud.google.com/go/compute v1.20.1 h1:6aKEtlUiwEpJzM001l0yFkpXmUVXaN8W+fbkb2AZNbg= -cloud.google.com/go/compute v1.20.1/go.mod h1:4tCnrn48xsqlwSAiLf1HXMQk8CONslYbdiEZc9FEIbM= +cloud.google.com/go/compute v1.23.0 h1:tP41Zoavr8ptEqaW6j+LQOnyBBhO7OkOMAGrgLopTwY= +cloud.google.com/go/compute v1.23.0/go.mod h1:4tCnrn48xsqlwSAiLf1HXMQk8CONslYbdiEZc9FEIbM= cloud.google.com/go/compute/metadata v0.2.3 h1:mg4jlk7mCAj6xXp9UJ4fjI9VUI5rubuGBW5aJ7UnBMY= cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2AawlZn8kiOGuCv6gTkwuA= cloud.google.com/go/containeranalysis v0.5.1/go.mod h1:1D92jd8gRR/c0fGMlymRgxWD3Qw9C1ff6/T7mLgVL8I= @@ -111,8 +111,8 @@ cloud.google.com/go/gkehub v0.10.0/go.mod h1:UIPwxI0DsrpsVoWpLB0stwKCP+WFVG9+y97 cloud.google.com/go/grafeas v0.2.0/go.mod h1:KhxgtF2hb0P191HlY5besjYm6MqTSTj3LSI+M+ByZHc= cloud.google.com/go/iam v0.3.0/go.mod h1:XzJPvDayI+9zsASAFO68Hk07u3z+f+JrT2xXNdp4bnY= cloud.google.com/go/iam v0.5.0/go.mod h1:wPU9Vt0P4UmCux7mqtRu6jcpPAb74cP1fh50J3QpkUc= -cloud.google.com/go/iam v1.1.0 h1:67gSqaPukx7O8WLLHMa0PNs3EBGd2eE4d+psbO/CO94= -cloud.google.com/go/iam v1.1.0/go.mod h1:nxdHjaKfCr7fNYx/HJMM8LgiMugmveWlkatear5gVyk= +cloud.google.com/go/iam v1.1.1 h1:lW7fzj15aVIXYHREOqjRBV9PsH0Z6u8Y46a1YGvQP4Y= +cloud.google.com/go/iam v1.1.1/go.mod h1:A5avdyVL2tCppe4unb0951eI9jreack+RJ0/d+KUZOU= cloud.google.com/go/language v1.4.0/go.mod h1:F9dRpNFQmJbkaop6g0JhSBXCNlO90e1KWx5iDdxbWic= cloud.google.com/go/language v1.6.0/go.mod h1:6dJ8t3B+lUYfStgls25GusK04NLh3eDLQnWM3mdEbhI= cloud.google.com/go/lifesciences v0.5.0/go.mod h1:3oIKy8ycWGPUyZDR/8RNnTOYevhaMLqh5vLUXs9zvT8= @@ -200,8 +200,8 @@ github.com/ProtonMail/go-crypto v0.0.0-20230717121422-5aa5874ade95 h1:KLq8BE0KwC github.com/ProtonMail/go-crypto v0.0.0-20230717121422-5aa5874ade95/go.mod h1:EjAoLdwvbIOoOQr3ihjnSoLZRtE8azugULFRteWMNc0= github.com/acomagu/bufpipe v1.0.4 h1:e3H4WUzM3npvo5uv95QuJM3cQspFNtFBzvJ2oNjKIDQ= github.com/acomagu/bufpipe v1.0.4/go.mod h1:mxdxdup/WdsKVreO5GpW4+M/1CE2sMG4jeGJ2sYmHc4= -github.com/agext/levenshtein v1.2.2 
h1:0S/Yg6LYmFJ5stwQeRp6EeOcCbj7xiqQSdNelsXvaqE= -github.com/agext/levenshtein v1.2.2/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki2W0IB5558= +github.com/agext/levenshtein v1.2.3 h1:YB2fHEn0UJagG8T1rrWknE3ZQzWM06O8AMAatNn7lmo= +github.com/agext/levenshtein v1.2.3/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki2W0IB5558= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/apparentlymart/go-textseg v1.0.0/go.mod h1:z96Txxhf3xSFMPmb5X/1W05FF/Nj9VFpLOpjS5yuumk= @@ -340,8 +340,8 @@ github.com/google/pprof v0.0.0-20210601050228-01bbb1931b22/go.mod h1:kpwsk12EmLe github.com/google/pprof v0.0.0-20210609004039-a478d1d731e9/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= -github.com/google/s2a-go v0.1.4 h1:1kZ/sQM3srePvKs3tXAvQzo66XfcReoqFpIpIccE7Oc= -github.com/google/s2a-go v0.1.4/go.mod h1:Ej+mSEMGRnqRzjc7VtF+jdBwYG5fuJfiZ8ELkjEwM0A= +github.com/google/s2a-go v0.1.5 h1:8IYp3w9nysqv3JH+NJgXJzGbDHzLOTj43BmSkp+O7qg= +github.com/google/s2a-go v0.1.5/go.mod h1:Ej+mSEMGRnqRzjc7VtF+jdBwYG5fuJfiZ8ELkjEwM0A= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= @@ -512,8 +512,8 @@ golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0 golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.3.1-0.20221117191849-2c476679df9a/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= -golang.org/x/crypto v0.11.0 h1:6Ewdq3tDic1mg5xRO4milcWCfMVQhI4NkqWWvqejpuA= -golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio= +golang.org/x/crypto v0.12.0 h1:tFM/ta59kqch6LlvYnPa0yx5a83cL2nHflFhYKvv9Yk= +golang.org/x/crypto v0.12.0/go.mod h1:NF0Gs7EO5K4qLn+Ylc+fih8BSTeIjAP05siRnAh98yw= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -608,8 +608,8 @@ golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= -golang.org/x/net v0.12.0 h1:cfawfvKITfUsFCeJIHJrbSxpeu/E81khclypR0GVT50= -golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= +golang.org/x/net v0.14.0 h1:BONx9s002vGdD9umnlX1Po8vOZmrgH34qlHcD1MfK14= +golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 
v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -635,8 +635,8 @@ golang.org/x/oauth2 v0.0.0-20220822191816-0ebed06d0094/go.mod h1:h4gKUeWbJ4rQPri golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.1.0/go.mod h1:G9FE4dLTsbXUu90h/Pf85g4w1D+SSAgR+q46nJZ8M4A= -golang.org/x/oauth2 v0.10.0 h1:zHCpF2Khkwy4mMB4bv0U37YtJdTGW8jI0glAApi0Kh8= -golang.org/x/oauth2 v0.10.0/go.mod h1:kTpgurOux7LqtuxjuyZa4Gj2gdezIt/jQtGnNFfypQI= +golang.org/x/oauth2 v0.11.0 h1:vPL4xzxBM4niKCW6g9whtaWVXTJf1U5e4aZxxFx/gbU= +golang.org/x/oauth2 v0.11.0/go.mod h1:LdF7O/8bLR/qWK9DrpXmbHLTouvRHK0SgJl0GmDBchk= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -653,6 +653,7 @@ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -733,7 +734,7 @@ golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= -golang.org/x/term v0.10.0 h1:3R7pNqamzBraeqj/Tj8qt1aQ2HpmlC+Cx/qL/7hn4/c= +golang.org/x/term v0.11.0 h1:F9tnn/DA/Im8nCwm+fX+1/eBwi4qFjRT++MhtVC4ZX0= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -747,8 +748,8 @@ golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.11.0 h1:LAntKIrcmeSKERyiOh0XMV39LXS8IE9UL2yP7+f5ij4= -golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.12.0 h1:k+n5B8goJNdU7hSvEtMUz3d1Q6D/XW4COJSJR6fN0mc= +golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod 
h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -864,8 +865,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.134.0 h1:ktL4Goua+UBgoP1eL1/60LwZJqa1sIzkLmvoR3hR6Gw= -google.golang.org/api v0.134.0/go.mod h1:sjRL3UnjTx5UqNQS9EWr9N8p7xbHpy1k0XGRLCf3Spk= +google.golang.org/api v0.138.0 h1:K/tVp05MxNVbHShRw9m7e9VJGdagNeTdMzqPH7AUqr0= +google.golang.org/api v0.138.0/go.mod h1:4xyob8CxC+0GChNBvEUAk8VBKNvYOTWM9T3v3UfRxuY= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -977,12 +978,12 @@ google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqw google.golang.org/genproto v0.0.0-20221014173430-6e2ab493f96b/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221014213838-99cd37c6964a/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71/go.mod h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s= -google.golang.org/genproto v0.0.0-20230706204954-ccb25ca9f130 h1:Au6te5hbKUV8pIYWHqOUZ1pva5qK/rwbIhoXEUB9Lu8= -google.golang.org/genproto v0.0.0-20230706204954-ccb25ca9f130/go.mod h1:O9kGHb51iE/nOGvQaDUuadVYqovW56s5emA88lQnj6Y= -google.golang.org/genproto/googleapis/api v0.0.0-20230706204954-ccb25ca9f130 h1:XVeBY8d/FaK4848myy41HBqnDwvxeV3zMZhwN1TvAMU= -google.golang.org/genproto/googleapis/api v0.0.0-20230706204954-ccb25ca9f130/go.mod h1:mPBs5jNgx2GuQGvFwUvVKqtn6HsUw9nP64BedgvqEsQ= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230720185612-659f7aaaa771 h1:Z8qdAF9GFsmcUuWQ5KVYIpP3PCKydn/YKORnghIalu4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230720185612-659f7aaaa771/go.mod h1:TUfxEVdsvPg18p6AslUXFoLdpED4oBnGwyqk3dV1XzM= +google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 h1:L6iMMGrtzgHsWofoFcihmDEMYeDR9KN/ThbPWGrh++g= +google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5/go.mod h1:oH/ZOT02u4kWEp7oYBGYFFkCdKS/uYR9Z7+0/xuuFp8= +google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5 h1:nIgk/EEq3/YlnmVVXVnm14rC2oxgs1o0ong4sD/rd44= +google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5/go.mod h1:5DZzOUPCLYL3mNkQ0ms0F3EuUNZ7py1Bqeq6sxzI7/Q= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230807174057-1744710a1577 h1:wukfNtZmZUurLN/atp2hiIeTKn7QJWIQdHzqmsOnAOk= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230807174057-1744710a1577/go.mod h1:+Bk1OCOj40wS2hwAMA+aCW9ypzm63QTBBHp6lQ3p+9M= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -1018,8 +1019,8 @@ google.golang.org/grpc v1.48.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACu google.golang.org/grpc v1.49.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.0/go.mod 
h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.1/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= -google.golang.org/grpc v1.56.2 h1:fVRFRnXvU+x6C4IlHZewvJOVHoOv1TUuQyoRsYnB4bI= -google.golang.org/grpc v1.56.2/go.mod h1:I9bI3vqKfayGqPUAwGdOSu7kt6oIJLixfffKrpXqQ9s= +google.golang.org/grpc v1.57.0 h1:kfzNeI/klCGD2YPMUlaGNT3pxvYfga7smW3Vth8Zsiw= +google.golang.org/grpc v1.57.0/go.mod h1:Sd+9RMTACXwmub0zcNY2c4arhtrbBYD1AUHI/dt16Mo= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index f2765f67af..9eb352de18 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -237,6 +237,13 @@ resource "google_compute_instance" "compute_vm" { condition = (length(var.network_interfaces) == 0) != (var.network_self_link == null && var.subnetwork_self_link == null) error_message = "Exactly one of network_interfaces or network_self_link/subnetwork_self_link must be specified." } + precondition { + condition = alltrue([for interface in var.network_interfaces : interface.network_ip == null]) || var.instance_count == 1 + error_message = <<-EOT + The network_ip cannot be statically set on vm-instance when the VM instance_count is greater than 1. + Either set the network_ip to null to allow it to be set dynamically for all instances, or create modules for each VM instance with its own network interface. + EOT + } precondition { condition = !contains([ "c3-:pd-standard", diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index f1b8b42396..f6d1e52825 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.23.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.23.0" } required_version = ">= 1.2.0" diff --git a/modules/file-system/filestore/README.md b/modules/file-system/filestore/README.md index 74383a5252..482858df8b 100644 --- a/modules/file-system/filestore/README.md +++ b/modules/file-system/filestore/README.md @@ -106,7 +106,7 @@ install the client and mount the file system. See the following example: ``` -[matrix]: ../../../../docs/network_storage.md#compatibility-matrix +[matrix]: ../../../docs/network_storage.md#compatibility-matrix ## License @@ -156,7 +156,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [connect\_mode](#input\_connect\_mode) | Used to select mode - supported values DIRECT\_PEERING and PRIVATE\_SERVICE\_ACCESS. | `string` | `"DIRECT_PEERING"` | no | -| [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment, used as name of the filestore instace if no name is specified. | `string` | n/a | yes | +| [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment, used as name of the filestore instance if no name is specified. 
| `string` | n/a | yes | | [filestore\_share\_name](#input\_filestore\_share\_name) | Name of the file system share on the instance. | `string` | `"nfsshare"` | no | | [filestore\_tier](#input\_filestore\_tier) | The service tier of the instance. | `string` | `"BASIC_HDD"` | no | | [labels](#input\_labels) | Labels to add to the filestore instance. Key-value pairs. | `map(string)` | n/a | yes | diff --git a/modules/file-system/filestore/variables.tf b/modules/file-system/filestore/variables.tf index 1d877cbb06..e54847f1af 100644 --- a/modules/file-system/filestore/variables.tf +++ b/modules/file-system/filestore/variables.tf @@ -20,7 +20,7 @@ variable "project_id" { } variable "deployment_name" { - description = "Name of the HPC deployment, used as name of the filestore instace if no name is specified." + description = "Name of the HPC deployment, used as name of the filestore instance if no name is specified." type = string } diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index c055e49fd6..9fc00e961b 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.23.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.23.0" } required_version = ">= 0.14.0" diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index 47832e38f5..7ae0e755c2 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.23.0" } required_version = ">= 0.14.0" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index 35d23ca4d9..a6ecb3daed 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.22.1" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.23.0" } required_version = ">= 0.14.0" diff --git a/modules/packer/custom-image/README.md b/modules/packer/custom-image/README.md index dbaf1323f7..86f1251e8b 100644 --- a/modules/packer/custom-image/README.md +++ b/modules/packer/custom-image/README.md @@ -245,6 +245,7 @@ No resources. | [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name | `string` | n/a | yes | | [disk\_size](#input\_disk\_size) | Size of disk image in GB | `number` | `null` | no | | [disk\_type](#input\_disk\_type) | Type of persistent disk to provision | `string` | `"pd-balanced"` | no | +| [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration (var.shielded\_instance\_config). | `bool` | `false` | no | | [image\_family](#input\_image\_family) | The family name of the image to be built. Defaults to `deployment_name` | `string` | `null` | no | | [image\_name](#input\_image\_name) | The name of the image to be built. 
If not supplied, it will be set to image\_family-$ISO\_TIMESTAMP | `string` | `null` | no |
 | [image\_storage\_locations](#input\_image\_storage\_locations) | Storage location, either regional or multi-regional, where snapshot content is to be stored and only accepts 1 value.<br>See https://developer.hashicorp.com/packer/plugins/builders/googlecompute#image_storage_locations | `list(string)` | `null` | no |
@@ -259,6 +260,7 @@ No resources.
 | [scopes](#input\_scopes) | Service account scopes to attach to the instance. See<br>https://cloud.google.com/compute/docs/access/service-accounts. | `list(string)` | <pre>[<br>  "https://www.googleapis.com/auth/cloud-platform"<br>]</pre> | no |
 | [service\_account\_email](#input\_service\_account\_email) | The service account email to use. If null or 'default', then the default Compute Engine service account will be used. | `string` | `null` | no |
 | [shell\_scripts](#input\_shell\_scripts) | A list of paths to local shell scripts which will be uploaded to customize the VM image | `list(string)` | `[]` | no |
+| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance (must set var.enable\_shielded\_vm) | <pre>object({<br>  enable_secure_boot          = bool<br>  enable_vtpm                 = bool<br>  enable_integrity_monitoring = bool<br>})</pre> | <pre>{<br>  "enable_integrity_monitoring": true,<br>  "enable_secure_boot": true,<br>  "enable_vtpm": true<br>}</pre> | no |
 | [source\_image](#input\_source\_image) | Source OS image to build from | `string` | `null` | no |
 | [source\_image\_family](#input\_source\_image\_family) | Alternative to source\_image. Specify image family to build from latest image in family | `string` | `"hpc-centos-7"` | no |
 | [source\_image\_project\_id](#input\_source\_image\_project\_id) | A list of project IDs to search for the source image. Packer will search the<br>first project ID in the list first, and fall back to the next in the list,
until it finds the source image. | `list(string)` | `null` | no | diff --git a/modules/packer/custom-image/image.pkr.hcl b/modules/packer/custom-image/image.pkr.hcl index c37ff0b941..d2930b08b9 100644 --- a/modules/packer/custom-image/image.pkr.hcl +++ b/modules/packer/custom-image/image.pkr.hcl @@ -78,42 +78,49 @@ locals { winrm_username = local.communicator == "winrm" ? "packer_user" : null winrm_insecure = local.communicator == "winrm" ? true : null winrm_use_ssl = local.communicator == "winrm" ? true : null + + enable_integrity_monitoring = var.enable_shielded_vm && var.shielded_instance_config.enable_integrity_monitoring + enable_secure_boot = var.enable_shielded_vm && var.shielded_instance_config.enable_secure_boot + enable_vtpm = var.enable_shielded_vm && var.shielded_instance_config.enable_vtpm } source "googlecompute" "toolkit_image" { - communicator = local.communicator - project_id = var.project_id - image_name = local.image_name - image_family = local.image_family - image_labels = var.labels - machine_type = var.machine_type - accelerator_type = local.accelerator_type - accelerator_count = var.accelerator_count - on_host_maintenance = local.on_host_maintenance - disk_size = var.disk_size - disk_type = var.disk_type - omit_external_ip = var.omit_external_ip - use_internal_ip = var.omit_external_ip - subnetwork = var.subnetwork_name - network_project_id = var.network_project_id - scopes = var.scopes - source_image = var.source_image - source_image_family = var.source_image_family - source_image_project_id = var.source_image_project_id - ssh_username = var.ssh_username - tags = var.tags - use_iap = local.use_iap - use_os_login = var.use_os_login - winrm_username = local.winrm_username - winrm_insecure = local.winrm_insecure - winrm_use_ssl = local.winrm_use_ssl - zone = var.zone - labels = var.labels - metadata = local.metadata - startup_script_file = var.startup_script_file - wrap_startup_script = var.wrap_startup_script - state_timeout = var.state_timeout - image_storage_locations = var.image_storage_locations + communicator = local.communicator + project_id = var.project_id + image_name = local.image_name + image_family = local.image_family + image_labels = var.labels + machine_type = var.machine_type + accelerator_type = local.accelerator_type + accelerator_count = var.accelerator_count + on_host_maintenance = local.on_host_maintenance + disk_size = var.disk_size + disk_type = var.disk_type + omit_external_ip = var.omit_external_ip + use_internal_ip = var.omit_external_ip + subnetwork = var.subnetwork_name + network_project_id = var.network_project_id + scopes = var.scopes + source_image = var.source_image + source_image_family = var.source_image_family + source_image_project_id = var.source_image_project_id + ssh_username = var.ssh_username + tags = var.tags + use_iap = local.use_iap + use_os_login = var.use_os_login + winrm_username = local.winrm_username + winrm_insecure = local.winrm_insecure + winrm_use_ssl = local.winrm_use_ssl + zone = var.zone + labels = var.labels + metadata = local.metadata + startup_script_file = var.startup_script_file + wrap_startup_script = var.wrap_startup_script + state_timeout = var.state_timeout + image_storage_locations = var.image_storage_locations + enable_secure_boot = local.enable_secure_boot + enable_vtpm = local.enable_vtpm + enable_integrity_monitoring = local.enable_integrity_monitoring } build { diff --git a/modules/packer/custom-image/variables.pkr.hcl b/modules/packer/custom-image/variables.pkr.hcl index ed4350387c..6e455ef46a 
100644
--- a/modules/packer/custom-image/variables.pkr.hcl
+++ b/modules/packer/custom-image/variables.pkr.hcl
@@ -242,3 +242,24 @@ EOD
   type        = list(string)
   default     = null
 }
+
+variable "enable_shielded_vm" {
+  type        = bool
+  default     = false
+  description = "Enable the Shielded VM configuration (var.shielded_instance_config)."
+}
+
+variable "shielded_instance_config" {
+  description = "Shielded VM configuration for the instance (must set var.enable_shielded_vm)"
+  type = object({
+    enable_secure_boot          = bool
+    enable_vtpm                 = bool
+    enable_integrity_monitoring = bool
+  })
+
+  default = {
+    enable_secure_boot          = true
+    enable_vtpm                 = true
+    enable_integrity_monitoring = true
+  }
+}
diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf
index fbbbac20c5..6ad0858540 100644
--- a/modules/scheduler/batch-login-node/versions.tf
+++ b/modules/scheduler/batch-login-node/versions.tf
@@ -22,7 +22,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.22.1"
+    module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.23.0"
   }
   required_version = ">= 0.14.0"
diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf
index 2bc09fb5b5..679190c10d 100644
--- a/modules/scripts/startup-script/versions.tf
+++ b/modules/scripts/startup-script/versions.tf
@@ -30,7 +30,7 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.22.1"
+    module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.23.0"
   }
   required_version = ">= 0.14.0"
diff --git a/pkg/config/config.go b/pkg/config/config.go
index 874094fce5..a8e3dd48a0 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -19,11 +19,11 @@ import (
 	"bytes"
 	"fmt"
 	"io/ioutil"
-	"log"
 	"regexp"
 	"sort"
 	"strings"
 
+	"github.com/agext/levenshtein"
 	"github.com/pkg/errors"
 	"github.com/zclconf/go-cty/cty"
 	"gopkg.in/yaml.v3"
@@ -35,25 +35,22 @@ const (
 	expectedVarFormat        string = "$(vars.var_name) or $(module_id.output_name)"
 	expectedModFormat        string = "$(module_id) or $(group_id.module_id)"
 	unexpectedConnectionKind string = "connectionKind must be useConnection or deploymentConnection"
+	maxHintDist              int    = 3 // Maximum levenshtein distance where we suggest a hint
 )
 
 var errorMessages = map[string]string{
-	// general
-	"appendToNonList": "cannot append to a setting whose type is not a list",
 	// config
 	"fileLoadError":      "failed to read the input yaml",
 	"yamlUnmarshalError": "failed to parse the blueprint in %s, check YAML syntax for errors, err=%w",
 	"yamlMarshalError":   "failed to export the configuration to a blueprint yaml file",
 	"fileSaveError":      "failed to write the expanded yaml",
 	// expand
-	"missingSetting":    "a required setting is missing from a module",
-	"settingsLabelType": "labels in module settings are not a map",
-	"invalidVar":        "invalid variable definition in",
-	"invalidMod":        "invalid module reference",
-	"varNotFound":       "Could not find source of variable",
-	"intergroupOrder":   "References to outputs from other groups must be to earlier groups",
-	"noOutput":          "Output not found for a variable",
-	"cannotUsePacker":   "Packer modules cannot be used by other modules",
+	"missingSetting":  "a required setting is missing from a module",
+	"invalidVar":      "invalid variable definition in",
+	"varNotFound":     "Could not find source of variable",
+	"intergroupOrder": "References to outputs from other groups must be to earlier groups",
+	"noOutput":        "Output
not found for a variable", + "cannotUsePacker": "Packer modules cannot be used by other modules", // validator "emptyID": "a module id cannot be empty", "emptySource": "a module source cannot be empty", @@ -129,11 +126,32 @@ func (bp *Blueprint) Module(id ModuleID) (*Module, error) { return nil }) if mod == nil { - return nil, fmt.Errorf("%s: %s", errorMessages["invalidMod"], id) + return nil, UnknownModuleError{id} } return mod, nil } +// SuggestModuleIDHint return a correct spelling of given ModuleID id if one +// is close enough (based on maxHintDist) +func (bp Blueprint) SuggestModuleIDHint(id ModuleID) (string, bool) { + clMod := "" + minDist := -1 + bp.WalkModules(func(m *Module) error { + dist := levenshtein.Distance(string(m.ID), string(id), nil) + if minDist == -1.0 || dist < minDist { + minDist = dist + clMod = string(m.ID) + } + return nil + }) + + if clMod != "" && minDist <= maxHintDist { + return clMod, true + } + + return "", false +} + // ModuleGroup returns the group containing the module func (bp Blueprint) ModuleGroup(mod ModuleID) (DeploymentGroup, error) { for _, g := range bp.DeploymentGroups { @@ -143,7 +161,7 @@ func (bp Blueprint) ModuleGroup(mod ModuleID) (DeploymentGroup, error) { } } } - return DeploymentGroup{}, fmt.Errorf("%s: %s", errorMessages["invalidMod"], mod) + return DeploymentGroup{}, UnknownModuleError{mod} } // ModuleGroupOrDie returns the group containing the module; panics if unfound @@ -205,20 +223,6 @@ func (mk ModuleKind) String() string { return mk.kind } -type validatorName uint8 - -const ( - // Undefined will be default and potentially throw errors if used - Undefined validatorName = iota - testProjectExistsName - testRegionExistsName - testZoneExistsName - testModuleNotUsedName - testZoneInRegionName - testApisEnabledName - testDeploymentVariableNotUsedName -) - // this enum will be used to control how fatal validator failures will be // treated during blueprint creation const ( @@ -231,59 +235,13 @@ func isValidValidationLevel(level int) bool { return !(level > ValidationIgnore || level < ValidationError) } -func (v validatorName) String() string { - switch v { - case testProjectExistsName: - return "test_project_exists" - case testRegionExistsName: - return "test_region_exists" - case testZoneExistsName: - return "test_zone_exists" - case testZoneInRegionName: - return "test_zone_in_region" - case testApisEnabledName: - return "test_apis_enabled" - case testModuleNotUsedName: - return "test_module_not_used" - case testDeploymentVariableNotUsedName: - return "test_deployment_variable_not_used" - default: - return "unknown_validator" - } -} - -type validatorConfig struct { +// Validator defines a validation step to be run on a blueprint +type Validator struct { Validator string Inputs Dict `yaml:"inputs,omitempty"` Skip bool `yaml:"skip,omitempty"` } -func (v *validatorConfig) check(name validatorName, requiredInputs []string) error { - if v.Validator != name.String() { - return fmt.Errorf("passed wrong validator to %s implementation", name.String()) - } - - var errored bool - for _, inp := range requiredInputs { - if !v.Inputs.Has(inp) { - log.Printf("a required input %s was not provided to %s!", inp, v.Validator) - errored = true - } - } - - if errored { - return fmt.Errorf("at least one required input was not provided to %s", v.Validator) - } - - // ensure that no extra inputs were provided by comparing length - if len(requiredInputs) != len(v.Inputs.Items()) { - errStr := "only %v inputs %s should be provided to %s" - return 
fmt.Errorf(errStr, len(requiredInputs), requiredInputs, v.Validator) - } - - return nil -} - // ModuleID is a unique identifier for a module in a blueprint type ModuleID string @@ -317,10 +275,10 @@ func (m Module) InfoOrDie() modulereader.ModuleInfo { // unless it has been set to a non-default value; the implementation as an // integer is primarily for internal purposes even if it can be set in blueprint type Blueprint struct { - BlueprintName string `yaml:"blueprint_name"` - GhpcVersion string `yaml:"ghpc_version,omitempty"` - Validators []validatorConfig `yaml:"validators,omitempty"` - ValidationLevel int `yaml:"validation_level,omitempty"` + BlueprintName string `yaml:"blueprint_name"` + GhpcVersion string `yaml:"ghpc_version,omitempty"` + Validators []Validator `yaml:"validators,omitempty"` + ValidationLevel int `yaml:"validation_level,omitempty"` Vars Dict DeploymentGroups []DeploymentGroup `yaml:"deployment_groups"` TerraformBackendDefaults TerraformBackend `yaml:"terraform_backend_defaults,omitempty"` @@ -340,13 +298,7 @@ func (dc *DeploymentConfig) ExpandConfig() error { if err := validateBlueprint(dc.Config); err != nil { return err } - if err := dc.expand(); err != nil { - return err - } - if err := dc.executeValidators(); err != nil { - return err - } - return nil + return dc.expand() } func (bp *Blueprint) setGlobalLabels() { @@ -355,9 +307,9 @@ func (bp *Blueprint) setGlobalLabels() { } } -// listUnusedModules provides a list modules that are in the +// ListUnusedModules provides a list modules that are in the // "use" field, but not actually used. -func (m Module) listUnusedModules() ModuleIDs { +func (m Module) ListUnusedModules() ModuleIDs { used := map[ModuleID]bool{} // Recurse through objects/maps/lists checking each element for having `ProductOfModuleUse` mark. cty.Walk(m.Settings.AsObject(), func(p cty.Path, v cty.Value) (bool, error) { @@ -387,29 +339,36 @@ func GetUsedDeploymentVars(val cty.Value) []string { return res } -func (dc *DeploymentConfig) listUnusedDeploymentVariables() []string { +// ListUnusedVariables returns a list of variables that are defined but not used +func (bp Blueprint) ListUnusedVariables() []string { // these variables are required or automatically constructed and applied; // these should not be listed unused otherwise no blueprints are valid - var usedVars = map[string]bool{ + var used = map[string]bool{ "labels": true, "deployment_name": true, } - dc.Config.WalkModules(func(m *Module) error { + bp.WalkModules(func(m *Module) error { for _, v := range GetUsedDeploymentVars(m.Settings.AsObject()) { - usedVars[v] = true + used[v] = true } return nil }) - unusedVars := []string{} - for k := range dc.Config.Vars.Items() { - if _, ok := usedVars[k]; !ok { - unusedVars = append(unusedVars, k) + for _, v := range bp.Validators { + for _, v := range GetUsedDeploymentVars(v.Inputs.AsObject()) { + used[v] = true } } - return unusedVars + unused := []string{} + for k := range bp.Vars.Items() { + if _, ok := used[k]; !ok { + unused = append(unused, k) + } + } + + return unused } func checkMovedModule(source string) error { @@ -555,7 +514,7 @@ func validateBlueprint(bp Blueprint) error { // if no validator is present, adds one, marked as skipped. 
func (dc *DeploymentConfig) SkipValidator(name string) error { if dc.Config.Validators == nil { - dc.Config.Validators = []validatorConfig{} + dc.Config.Validators = []Validator{} } skipped := false for i, v := range dc.Config.Validators { @@ -565,7 +524,7 @@ func (dc *DeploymentConfig) SkipValidator(name string) error { } } if !skipped { - dc.Config.Validators = append(dc.Config.Validators, validatorConfig{Validator: name, Skip: true}) + dc.Config.Validators = append(dc.Config.Validators, Validator{Validator: name, Skip: true}) } return nil } @@ -634,6 +593,19 @@ func (bp *Blueprint) DeploymentName() (string, error) { return s, nil } +// ProjectID returns the project_id +func (bp Blueprint) ProjectID() (string, error) { + pid := "project_id" + if !bp.Vars.Has(pid) { + return "", BpError{Root.Vars, fmt.Errorf("%q variable is not specified", pid)} + } + v := bp.Vars.Get(pid) + if v.Type() != cty.String { + return "", BpError{Root.Vars.Dot(pid), fmt.Errorf("%q variable is not a string", pid)} + } + return v.AsString(), nil +} + // checkBlueprintName returns an error if blueprint_name does not comply with // requirements for correct GCP label values. func (bp *Blueprint) checkBlueprintName() error { @@ -767,7 +739,6 @@ func (bp *Blueprint) evalVars() error { } } } - bp.Vars = res return nil } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index bd1a0728d9..ee43e66464 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -189,10 +189,6 @@ func getDeploymentConfigForTest() DeploymentConfig { dc := DeploymentConfig{Config: testBlueprint} setTestModuleInfo(testModule, testModuleInfo) setTestModuleInfo(testModuleWithLabels, testModuleInfo) - - // the next step simulates relevant step in ghpc expand - dc.addDefaultValidators() - return dc } @@ -341,8 +337,6 @@ func getMultiGroupDeploymentConfig() DeploymentConfig { DeploymentGroups: []DeploymentGroup{grp0, grp1}, }, } - - dc.addDefaultValidators() return dc } @@ -378,11 +372,6 @@ func getDeploymentConfigWithTestModuleEmptyKind() DeploymentConfig { // config.go func (s *MySuite) TestExpandConfig(c *C) { dc := getBasicDeploymentConfigWithTestModule() - for v := range dc.getValidators() { // skip all validators - dc.Config.Validators = append( - dc.Config.Validators, - validatorConfig{Validator: v, Skip: true}) - } c.Check(dc.ExpandConfig(), IsNil) } @@ -425,7 +414,7 @@ func (s *MySuite) TestCheckModulesAndGroups(c *C) { func (s *MySuite) TestListUnusedModules(c *C) { { // No modules in "use" m := Module{ID: "m"} - c.Check(m.listUnusedModules(), DeepEquals, ModuleIDs{}) + c.Check(m.ListUnusedModules(), DeepEquals, ModuleIDs{}) } { // Useful @@ -434,7 +423,7 @@ func (s *MySuite) TestListUnusedModules(c *C) { Use: ModuleIDs{"w"}, Settings: NewDict(map[string]cty.Value{ "x": AsProductOfModuleUse(cty.True, "w")})} - c.Check(m.listUnusedModules(), DeepEquals, ModuleIDs{}) + c.Check(m.ListUnusedModules(), DeepEquals, ModuleIDs{}) } { // Unused @@ -443,21 +432,21 @@ func (s *MySuite) TestListUnusedModules(c *C) { Use: ModuleIDs{"w", "u"}, Settings: NewDict(map[string]cty.Value{ "x": AsProductOfModuleUse(cty.True, "w")})} - c.Check(m.listUnusedModules(), DeepEquals, ModuleIDs{"u"}) + c.Check(m.ListUnusedModules(), DeepEquals, ModuleIDs{"u"}) } } -func (s *MySuite) TestListUnusedDeploymentVariables(c *C) { +func (s *MySuite) TestListUnusedVariables(c *C) { dc := getDeploymentConfigForTest() dc.applyGlobalVariables() - unusedVars := dc.listUnusedDeploymentVariables() + unusedVars := dc.Config.ListUnusedVariables() 
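For reference, the exported `Validator` struct above maps field-for-field onto the `validators:` block a blueprint can carry (`Validator` → `validator`, `Inputs` → `inputs`, `Skip` → `skip`). A hedged sketch of that YAML shape, reusing validator names from the enum removed above; the exact inputs each validator accepts are not shown in this diff:

```yaml
validators:
- validator: test_project_exists
  inputs:
    project_id: $(vars.project_id)  # assumed input for illustration
- validator: test_module_not_used
  skip: true
```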
c.Assert(unusedVars, DeepEquals, []string{"project_id"}) dc = getMultiGroupDeploymentConfig() dc.applyGlobalVariables() - unusedVars = dc.listUnusedDeploymentVariables() + unusedVars = dc.Config.ListUnusedVariables() c.Assert(unusedVars, DeepEquals, []string{"unused_key"}) } @@ -788,64 +777,6 @@ func (s *MySuite) TestCheckMovedModules(c *C) { c.Assert(checkMovedModule("./community/modules/scheduler/cloud-batch-job"), NotNil) } -func (s *MySuite) TestValidatorConfigCheck(c *C) { - const vn = testProjectExistsName // some valid name - - { // FAIL: names mismatch - v := validatorConfig{Validator: "who_is_this"} - err := v.check(vn, []string{}) - c.Check(err, ErrorMatches, "passed wrong validator to test_project_exists implementation") - } - - { // OK: names match - v := validatorConfig{Validator: vn.String()} - c.Check(v.check(vn, []string{}), IsNil) - } - - { // OK: Inputs is equal to required inputs without regard to ordering - v := validatorConfig{ - Validator: vn.String(), - Inputs: NewDict(map[string]cty.Value{ - "in0": cty.NilVal, - "in1": cty.NilVal})} - c.Check(v.check(vn, []string{"in0", "in1"}), IsNil) - c.Check(v.check(vn, []string{"in1", "in0"}), IsNil) - } - - { // FAIL: inputs are a proper subset of required inputs - v := validatorConfig{ - Validator: vn.String(), - Inputs: NewDict(map[string]cty.Value{ - "in0": cty.NilVal, - "in1": cty.NilVal})} - err := v.check(vn, []string{"in0", "in1", "in2"}) - c.Check(err, ErrorMatches, missingRequiredInputRegex) - } - - { // FAIL: inputs intersect with required inputs but are not a proper subset - v := validatorConfig{ - Validator: vn.String(), - Inputs: NewDict(map[string]cty.Value{ - "in0": cty.NilVal, - "in1": cty.NilVal, - "in3": cty.NilVal})} - err := v.check(vn, []string{"in0", "in1", "in2"}) - c.Check(err, ErrorMatches, missingRequiredInputRegex) - } - - { // FAIL inputs are a proper superset of required inputs - v := validatorConfig{ - Validator: vn.String(), - Inputs: NewDict(map[string]cty.Value{ - "in0": cty.NilVal, - "in1": cty.NilVal, - "in2": cty.NilVal, - "in3": cty.NilVal})} - err := v.check(vn, []string{"in0", "in1", "in2"}) - c.Check(err, ErrorMatches, "only 3 inputs \\[in0 in1 in2\\] should be provided to test_project_exists") - } -} - func (s *MySuite) TestCheckBackends(c *C) { // Helper to create blueprint with backend blocks only (first one is defaults) // and run checkBackends. 
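To make the behavior under test concrete: SkipValidator marks every existing entry whose name matches as skipped, and appends a new skipped entry only when the name is absent, as the cases in the next hunk verify. A minimal standalone sketch of that logic (the Validator struct is copied in for illustration; skipValidator is a hypothetical stand-in for the DeploymentConfig method):

package main

import "fmt"

// Validator mirrors the renamed blueprint validator entry.
type Validator struct {
	Validator string
	Skip      bool
}

// skipValidator marks every matching entry and appends a skipped
// entry only if the name was never present.
func skipValidator(vs []Validator, name string) []Validator {
	found := false
	for i := range vs {
		if vs[i].Validator == name {
			vs[i].Skip = true
			found = true
		}
	}
	if !found {
		vs = append(vs, Validator{Validator: name, Skip: true})
	}
	return vs
}

func main() {
	vs := []Validator{{Validator: "zebra"}, {Validator: "pony"}, {Validator: "zebra"}}
	// Both "zebra" entries are marked; nothing is appended.
	fmt.Println(skipValidator(vs, "zebra")) // [{zebra true} {pony false} {zebra true}]
}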
@@ -930,42 +861,42 @@ func (s *MySuite) TestSkipValidator(c *C) { { dc := DeploymentConfig{Config: Blueprint{Validators: nil}} c.Check(dc.SkipValidator("zebra"), IsNil) - c.Check(dc.Config.Validators, DeepEquals, []validatorConfig{ + c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "zebra", Skip: true}}) } { - dc := DeploymentConfig{Config: Blueprint{Validators: []validatorConfig{ + dc := DeploymentConfig{Config: Blueprint{Validators: []Validator{ {Validator: "pony"}}}} c.Check(dc.SkipValidator("zebra"), IsNil) - c.Check(dc.Config.Validators, DeepEquals, []validatorConfig{ + c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "pony"}, {Validator: "zebra", Skip: true}}) } { - dc := DeploymentConfig{Config: Blueprint{Validators: []validatorConfig{ + dc := DeploymentConfig{Config: Blueprint{Validators: []Validator{ {Validator: "pony"}, {Validator: "zebra"}}}} c.Check(dc.SkipValidator("zebra"), IsNil) - c.Check(dc.Config.Validators, DeepEquals, []validatorConfig{ + c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "pony"}, {Validator: "zebra", Skip: true}}) } { - dc := DeploymentConfig{Config: Blueprint{Validators: []validatorConfig{ + dc := DeploymentConfig{Config: Blueprint{Validators: []Validator{ {Validator: "pony"}, {Validator: "zebra", Skip: true}}}} c.Check(dc.SkipValidator("zebra"), IsNil) - c.Check(dc.Config.Validators, DeepEquals, []validatorConfig{ + c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "pony"}, {Validator: "zebra", Skip: true}}) } { - dc := DeploymentConfig{Config: Blueprint{Validators: []validatorConfig{ + dc := DeploymentConfig{Config: Blueprint{Validators: []Validator{ {Validator: "zebra"}, {Validator: "pony"}, {Validator: "zebra"}}}} c.Check(dc.SkipValidator("zebra"), IsNil) - c.Check(dc.Config.Validators, DeepEquals, []validatorConfig{ + c.Check(dc.Config.Validators, DeepEquals, []Validator{ {Validator: "zebra", Skip: true}, {Validator: "pony"}, {Validator: "zebra", Skip: true}}) @@ -1033,8 +964,24 @@ func (s *MySuite) TestValidateModuleSettingReference(c *C) { // FAIL. missing output c.Check(vld(bp, mod22, ModuleRef("mod21", "kale")), NotNil) - // Fail. packer module + // FAIL. packer module c.Check(vld(bp, mod21, ModuleRef("pkr", "outPkr")), NotNil) + + // FAIL. get global hint + mod := ModuleID("var") + unkModErr := UnknownModuleError{mod} + c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{"Did you mean \"vars\"?", unkModErr}), Equals, true) + + // FAIL. get module ID hint + mod = ModuleID("pkp") + unkModErr = UnknownModuleError{mod} + c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{fmt.Sprintf("Did you mean \"%s\"?", string(pkr.ID)), unkModErr}), Equals, true) + + // FAIL. 
get no hint + mod = ModuleID("test") + unkModErr = UnknownModuleError{mod} + c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), HintError{fmt.Sprintf("Did you mean \"%s\"?", string(pkr.ID)), unkModErr}), Equals, false) + c.Check(errors.Is(vld(bp, mod11, ModuleRef(mod, "kale")), unkModErr), Equals, true) } func (s *MySuite) TestValidateModuleSettingReferences(c *C) { diff --git a/pkg/config/errors.go b/pkg/config/errors.go index 35775dad42..ea6d34cf55 100644 --- a/pkg/config/errors.go +++ b/pkg/config/errors.go @@ -33,6 +33,23 @@ func (e BpError) Unwrap() error { return e.Err } +// HintError wraps another error to suggest other values +type HintError struct { + Hint string + Err error +} + +func (e HintError) Error() string { + if len(e.Hint) > 0 { + return fmt.Sprintf("%s - %s", e.Err, e.Hint) + } + return e.Err.Error() +} + +func (e HintError) Unwrap() error { + return e.Err +} + // InvalidSettingError signifies a problem with the supplied setting name in a // module definition. type InvalidSettingError struct { @@ -43,6 +60,15 @@ func (err *InvalidSettingError) Error() string { return fmt.Sprintf("invalid setting provided to a module, cause: %v", err.cause) } +// UnknownModuleError signifies a problem with the supplied module name. +type UnknownModuleError struct { + ID ModuleID +} + +func (e UnknownModuleError) Error() string { + return fmt.Sprintf("invalid module id: \"%s\"", e.ID) +} + // Errors is an error wrapper to combine multiple errors type Errors struct { Errors []error @@ -101,3 +127,8 @@ func (e *Errors) At(path Path, err error) *Errors { } return e.Add(BpError{Path: path, Err: err}) } + +// Any returns true if there are any errors +func (e *Errors) Any() bool { + return len(e.Errors) > 0 +} diff --git a/pkg/config/expand.go b/pkg/config/expand.go index c82069db09..3a149446e9 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -15,6 +15,7 @@ package config import ( + "errors" "fmt" "regexp" "strings" @@ -23,6 +24,7 @@ import ( "hpc-toolkit/pkg/modulereader" + "github.com/agext/levenshtein" "github.com/zclconf/go-cty/cty" "golang.org/x/exp/maps" "golang.org/x/exp/slices" @@ -49,7 +51,6 @@ func (dc *DeploymentConfig) expand() error { return err } dc.expandBackends() - dc.addDefaultValidators() dc.combineLabels() if err := dc.applyUseModules(); err != nil { @@ -308,6 +309,9 @@ func AutomaticOutputName(outputName string, moduleID ModuleID) string { func validateModuleReference(bp Blueprint, from Module, toID ModuleID) error { to, err := bp.Module(toID) if err != nil { + if hint, ok := bp.SuggestModuleIDHint(toID); ok { + return HintError{fmt.Sprintf("Did you mean \"%s\"?", hint), err} + } return err } @@ -339,6 +343,10 @@ func validateModuleSettingReference(bp Blueprint, mod Module, r Reference) error } if err := validateModuleReference(bp, mod, r.Module); err != nil { + var unkModErr UnknownModuleError + if errors.As(err, &unkModErr) && levenshtein.Distance(string(unkModErr.ID), "vars", nil) <= 2 { + return HintError{"Did you mean \"vars\"?", unkModErr} + } return err } tm, _ := bp.Module(r.Module) // Shouldn't error if validateModuleReference didn't @@ -363,85 +371,6 @@ func hasVariable(str string) bool { return anyVariableExp.MatchString(str) } -// this function adds default validators to the blueprint. 
-// default validators are only added for global variables that exist -func (dc *DeploymentConfig) addDefaultValidators() { - if dc.Config.Validators == nil { - dc.Config.Validators = []validatorConfig{} - } - - projectIDExists := dc.Config.Vars.Has("project_id") - projectRef := GlobalRef("project_id").AsExpression().AsValue() - - regionExists := dc.Config.Vars.Has("region") - regionRef := GlobalRef("region").AsExpression().AsValue() - - zoneExists := dc.Config.Vars.Has("zone") - zoneRef := GlobalRef("zone").AsExpression().AsValue() - - defaults := []validatorConfig{ - {Validator: testModuleNotUsedName.String()}, - {Validator: testDeploymentVariableNotUsedName.String()}} - - // always add the project ID validator before subsequent validators that can - // only succeed if credentials can access the project. If the project ID - // validator fails, all remaining validators are not executed. - if projectIDExists { - defaults = append(defaults, validatorConfig{ - Validator: testProjectExistsName.String(), - Inputs: NewDict(map[string]cty.Value{"project_id": projectRef}), - }) - } - - // it is safe to run this validator even if vars.project_id is undefined; - // it will likely fail but will do so helpfully to the user - defaults = append(defaults, - validatorConfig{Validator: "test_apis_enabled"}) - - if projectIDExists && regionExists { - defaults = append(defaults, validatorConfig{ - Validator: testRegionExistsName.String(), - Inputs: NewDict(map[string]cty.Value{ - "project_id": projectRef, - "region": regionRef, - }, - )}) - } - - if projectIDExists && zoneExists { - defaults = append(defaults, validatorConfig{ - Validator: testZoneExistsName.String(), - Inputs: NewDict(map[string]cty.Value{ - "project_id": projectRef, - "zone": zoneRef, - }), - }) - } - - if projectIDExists && regionExists && zoneExists { - defaults = append(defaults, validatorConfig{ - Validator: testZoneInRegionName.String(), - Inputs: NewDict(map[string]cty.Value{ - "project_id": projectRef, - "region": regionRef, - "zone": zoneRef, - }), - }) - } - - used := map[string]bool{} - for _, v := range dc.Config.Validators { - used[v.Validator] = true - } - - for _, v := range defaults { - if used[v.Validator] { - continue - } - dc.Config.Validators = append(dc.Config.Validators, v) - } -} - // FindAllIntergroupReferences finds all intergroup references within the group func (dg DeploymentGroup) FindAllIntergroupReferences(bp Blueprint) []Reference { igcRefs := map[Reference]bool{} diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 12e96ed563..db413f1527 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -15,6 +15,7 @@ package config import ( + "errors" "fmt" "hpc-toolkit/pkg/modulereader" @@ -230,7 +231,10 @@ func (s *MySuite) TestApplyUseModules(c *C) { // Use ID doesn't exists (fail) g.Modules[len(g.Modules)-1].ID = "wrongID" - c.Assert(dc.applyUseModules(), ErrorMatches, fmt.Sprintf("%s: %s", errorMessages["invalidMod"], used.ID)) + err := dc.applyUseModules() + unkModErr := UnknownModuleError{used.ID} + c.Check(errors.Is(err, unkModErr), Equals, true) + c.Check(errors.Is(err, HintError{string(using.ID), unkModErr}), Equals, false) } { // test multigroup deployment with config that has a known good match diff --git a/pkg/config/validate.go b/pkg/config/validate.go index 7acca9f788..b675472f7b 100644 --- a/pkg/config/validate.go +++ b/pkg/config/validate.go @@ -18,91 +18,16 @@ package config import ( "fmt" - "log" "regexp" "strings" "hpc-toolkit/pkg/modulereader" - 
"hpc-toolkit/pkg/validators" "github.com/pkg/errors" "github.com/zclconf/go-cty/cty" - "golang.org/x/exp/maps" ) -const ( - validationWarningMsg = "Validation failures were treated as a warning, continuing to create blueprint." - validationErrorMsg = "validation failed due to the issues listed above" - funcErrorMsgTemplate = "validator %s failed" - maxLabels = 64 -) - -// performs validation of global variables -func (dc DeploymentConfig) executeValidators() error { - var errored, warned bool - implementedValidators := dc.getValidators() - - if dc.Config.ValidationLevel == ValidationIgnore { - return nil - } - - for _, validator := range dc.Config.Validators { - if validator.Skip { - continue - } - - f, ok := implementedValidators[validator.Validator] - if !ok { - errored = true - log.Printf("%s is not an implemented validator", validator.Validator) - continue - } - - if err := f(validator); err != nil { - var prefix string - switch dc.Config.ValidationLevel { - case ValidationWarning: - warned = true - prefix = "warning: " - default: - errored = true - prefix = "error: " - } - log.Print(prefix, err) - log.Println() - - // do not bother running further validators if project ID could not be found - if validator.Validator == testProjectExistsName.String() { - break - } - } - - } - - if warned || errored { - log.Println("One or more blueprint validators has failed. See messages above for suggested") - log.Println("actions. General troubleshooting guidance and instructions for configuring") - log.Println("validators are shown below.") - log.Println("") - log.Println("- https://goo.gle/hpc-toolkit-troubleshooting") - log.Println("- https://goo.gle/hpc-toolkit-validation") - log.Println("") - log.Println("Validators can be silenced or treated as warnings or errors:") - log.Println("") - log.Println("- https://goo.gle/hpc-toolkit-validation-levels") - log.Println("") - } - - if warned { - log.Println(validationWarningMsg) - log.Println("") - } - - if errored { - return fmt.Errorf(validationErrorMsg) - } - return nil -} +const maxLabels = 64 func validateGlobalLabels(vars Dict) error { if !vars.Has("labels") { @@ -175,6 +100,9 @@ func validateModule(p modulePath, m Module, bp Blueprint) error { if m.ID == "" { errs.At(p.ID, fmt.Errorf(errorMessages["emptyID"])) } + if m.ID == "vars" { // invalid module ID + errs.At(p.ID, errors.New("module id cannot be 'vars'")) + } return errs. Add(validateSettings(p, m, info)). Add(validateOutputs(p, m, info)). 
@@ -240,172 +168,3 @@ func validateSettings( } return errs.OrNil() } - -func (dc *DeploymentConfig) getValidators() map[string]func(validatorConfig) error { - allValidators := map[string]func(validatorConfig) error{ - testApisEnabledName.String(): dc.testApisEnabled, - testProjectExistsName.String(): dc.testProjectExists, - testRegionExistsName.String(): dc.testRegionExists, - testZoneExistsName.String(): dc.testZoneExists, - testZoneInRegionName.String(): dc.testZoneInRegion, - testModuleNotUsedName.String(): dc.testModuleNotUsed, - testDeploymentVariableNotUsedName.String(): dc.testDeploymentVariableNotUsed, - } - return allValidators -} - -func (dc *DeploymentConfig) testApisEnabled(c validatorConfig) error { - if err := c.check(testApisEnabledName, []string{}); err != nil { - return err - } - - pv := dc.Config.Vars.Get("project_id") - if pv.Type() != cty.String { - return fmt.Errorf("the deployment variable `project_id` is either not set or is not a string") - } - - apis := map[string]bool{} - dc.Config.WalkModules(func(m *Module) error { - for _, api := range m.InfoOrDie().RequiredApis { - apis[api] = true - } - return nil - }) - - if err := validators.TestApisEnabled(pv.AsString(), maps.Keys(apis)); err != nil { - log.Println(err) - return fmt.Errorf(funcErrorMsgTemplate, testApisEnabledName.String()) - } - return nil -} - -func (dc *DeploymentConfig) testProjectExists(c validatorConfig) error { - funcName := testProjectExistsName.String() - funcErrorMsg := fmt.Sprintf(funcErrorMsgTemplate, funcName) - - if err := c.check(testProjectExistsName, []string{"project_id"}); err != nil { - return err - } - m, err := evalValidatorInputsAsStrings(c.Inputs, dc.Config) - if err != nil { - log.Print(funcErrorMsg) - return err - } - - if err = validators.TestProjectExists(m["project_id"]); err != nil { - log.Print(err) - return fmt.Errorf(funcErrorMsg) - } - return nil -} - -func (dc *DeploymentConfig) testRegionExists(c validatorConfig) error { - funcName := testRegionExistsName.String() - funcErrorMsg := fmt.Sprintf(funcErrorMsgTemplate, funcName) - - if err := c.check(testRegionExistsName, []string{"project_id", "region"}); err != nil { - return err - } - m, err := evalValidatorInputsAsStrings(c.Inputs, dc.Config) - if err != nil { - log.Print(funcErrorMsg) - return err - } - - if err = validators.TestRegionExists(m["project_id"], m["region"]); err != nil { - log.Print(err) - return fmt.Errorf(funcErrorMsg) - } - return nil -} - -func (dc *DeploymentConfig) testZoneExists(c validatorConfig) error { - funcName := testZoneExistsName.String() - funcErrorMsg := fmt.Sprintf(funcErrorMsgTemplate, funcName) - - if err := c.check(testZoneExistsName, []string{"project_id", "zone"}); err != nil { - return err - } - m, err := evalValidatorInputsAsStrings(c.Inputs, dc.Config) - if err != nil { - log.Print(funcErrorMsg) - return err - } - - if err = validators.TestZoneExists(m["project_id"], m["zone"]); err != nil { - log.Print(err) - return fmt.Errorf(funcErrorMsg) - } - return nil -} - -func (dc *DeploymentConfig) testZoneInRegion(c validatorConfig) error { - funcName := testZoneInRegionName.String() - funcErrorMsg := fmt.Sprintf(funcErrorMsgTemplate, funcName) - - if err := c.check(testZoneInRegionName, []string{"project_id", "region", "zone"}); err != nil { - return err - } - m, err := evalValidatorInputsAsStrings(c.Inputs, dc.Config) - if err != nil { - log.Print(funcErrorMsg) - return err - } - - if err = validators.TestZoneInRegion(m["project_id"], m["zone"], m["region"]); err != nil { - 
log.Print(err) - return fmt.Errorf(funcErrorMsg) - } - return nil -} - -func (dc *DeploymentConfig) testModuleNotUsed(c validatorConfig) error { - if err := c.check(testModuleNotUsedName, []string{}); err != nil { - return err - } - - acc := map[string][]string{} - dc.Config.WalkModules(func(m *Module) error { - ids := m.listUnusedModules() - sids := make([]string, len(ids)) - for i, id := range ids { - sids[i] = string(id) - } - acc[string(m.ID)] = sids - return nil - }) - - if err := validators.TestModuleNotUsed(acc); err != nil { - log.Print(err) - return fmt.Errorf(funcErrorMsgTemplate, testModuleNotUsedName.String()) - } - return nil -} - -func (dc *DeploymentConfig) testDeploymentVariableNotUsed(c validatorConfig) error { - if err := c.check(testDeploymentVariableNotUsedName, []string{}); err != nil { - return err - } - - if err := validators.TestDeploymentVariablesNotUsed(dc.listUnusedDeploymentVariables()); err != nil { - log.Print(err) - return fmt.Errorf(funcErrorMsgTemplate, testDeploymentVariableNotUsedName.String()) - } - return nil -} - -// Helper function to evaluate validator inputs and make sure that all values are strings. -func evalValidatorInputsAsStrings(inputs Dict, bp Blueprint) (map[string]string, error) { - ev, err := inputs.Eval(bp) - if err != nil { - return nil, err - } - ms := map[string]string{} - for k, v := range ev.Items() { - if v.Type() != cty.String { - return nil, fmt.Errorf("validator inputs must be strings, %s is a %s", k, v.Type()) - } - ms[k] = v.AsString() - } - return ms, nil -} diff --git a/pkg/config/validator_test.go b/pkg/config/validator_test.go index 7355e9ca93..f85ba8a38c 100644 --- a/pkg/config/validator_test.go +++ b/pkg/config/validator_test.go @@ -21,13 +21,6 @@ import ( . "gopkg.in/check.v1" ) -const ( - tooManyInputRegex = "only [0-9]+ inputs \\[.*\\] should be provided to .*" - missingRequiredInputRegex = "at least one required input was not provided to .*" - passedWrongValidatorRegex = "passed wrong validator to .*" - undefinedGlobalVariableRegex = ".* was not defined$" -) - func (s *MySuite) TestValidateVars(c *C) { { // Success vars := Dict{} @@ -100,6 +93,15 @@ func (s *MySuite) TestValidateModule(c *C) { c.Check(err, NotNil) } + { // Catch invalid ID + err := validateModule(p, Module{ + ID: "vars", + Source: "green", + Kind: TerraformKind, + }, dummyBp) + c.Check(err, NotNil) + } + { // Catch no Source err := validateModule(p, Module{ID: "bond"}, dummyBp) c.Check(err, NotNil) @@ -162,166 +164,3 @@ func (s *MySuite) TestValidateOutputs(c *C) { c.Check(validateOutputs(p, mod, info), NotNil) } } - -func (s *MySuite) TestAddDefaultValidators(c *C) { - dc := getDeploymentConfigForTest() - dc.addDefaultValidators() - c.Assert(dc.Config.Validators, HasLen, 4) - - dc.Config.Validators = nil - dc.Config.Vars.Set("region", cty.StringVal("us-central1")) - dc.addDefaultValidators() - c.Assert(dc.Config.Validators, HasLen, 5) - - dc.Config.Validators = nil - dc.Config.Vars.Set("zone", cty.StringVal("us-central1-c")) - dc.addDefaultValidators() - c.Assert(dc.Config.Validators, HasLen, 7) -} - -func (s *MySuite) TestExecuteValidators(c *C) { - dc := getDeploymentConfigForTest() - dc.Config.Validators = []validatorConfig{ - {Validator: "unimplemented-validator"}} - - err := dc.executeValidators() - c.Assert(err, ErrorMatches, validationErrorMsg) - - dc.Config.Validators = []validatorConfig{ - {Validator: testProjectExistsName.String()}} - - err = dc.executeValidators() - c.Assert(err, ErrorMatches, validationErrorMsg) -} - -func (s *MySuite) 
TestApisEnabledValidator(c *C) { - var err error - dc := getDeploymentConfigForTest() - emptyValidator := validatorConfig{} - - // test validator fails for config without validator id - err = dc.testApisEnabled(emptyValidator) - c.Assert(err, ErrorMatches, passedWrongValidatorRegex) - - apisEnabledValidator := validatorConfig{ - Validator: testApisEnabledName.String()} - - // this test succeeds because the list of required APIs for the test - // Deployment Config is empty; no actual API calls get made in this case. - // When full automation of required API detection is implemented, we may - // need to modify this test - err = dc.testApisEnabled(apisEnabledValidator) - c.Assert(err, IsNil) - - // this validator reads blueprint directly so 1 inputs should fail - apisEnabledValidator.Inputs.Set("foo", cty.StringVal("bar")) - err = dc.testApisEnabled(apisEnabledValidator) - c.Assert(err, ErrorMatches, tooManyInputRegex) -} - -// this function tests that the "gateway" functions in this package for our -// validators fail under various conditions; it does not test the actual Cloud -// API calls in the validators package; we will defer success testing until the -// development of mock functions for Cloud API calls -func (s *MySuite) TestProjectExistsValidator(c *C) { - var err error - dc := getDeploymentConfigForTest() - emptyValidator := validatorConfig{} - - // test validator fails for config without validator id - err = dc.testProjectExists(emptyValidator) - c.Assert(err, ErrorMatches, passedWrongValidatorRegex) - - // test validator fails for config without any inputs - projectValidator := validatorConfig{Validator: testProjectExistsName.String()} - err = dc.testProjectExists(projectValidator) - c.Assert(err, ErrorMatches, missingRequiredInputRegex) - - // test validators fail when input global variables are undefined - projectValidator.Inputs.Set("project_id", MustParseExpression("var.undefined").AsValue()) - c.Assert(dc.testProjectExists(projectValidator), NotNil) - - // TODO: implement a mock client to test success of test_project_exists -} - -func (s *MySuite) TestRegionExistsValidator(c *C) { - var err error - dc := getDeploymentConfigForTest() - emptyValidator := validatorConfig{} - - // test validator fails for config without validator id - err = dc.testRegionExists(emptyValidator) - c.Assert(err, ErrorMatches, passedWrongValidatorRegex) - - // test validator fails for config without any inputs - regionValidator := validatorConfig{Validator: testRegionExistsName.String()} - err = dc.testRegionExists(regionValidator) - c.Assert(err, ErrorMatches, missingRequiredInputRegex) - - // test validators fail when input global variables are undefined - regionValidator.Inputs. - Set("project_id", MustParseExpression("var.project_id").AsValue()). 
- Set("region", MustParseExpression("var.region").AsValue()) - c.Assert(dc.testRegionExists(regionValidator), NotNil) - - dc.Config.Vars.Set("project_id", cty.StringVal("invalid-project")) - c.Assert(dc.testRegionExists(regionValidator), NotNil) - - // TODO: implement a mock client to test success of test_region_exists -} - -func (s *MySuite) TestZoneExistsValidator(c *C) { - var err error - dc := getDeploymentConfigForTest() - emptyValidator := validatorConfig{} - - // test validator fails for config without validator id - err = dc.testZoneExists(emptyValidator) - c.Assert(err, ErrorMatches, passedWrongValidatorRegex) - - // test validator fails for config without any inputs - zoneValidator := validatorConfig{Validator: testZoneExistsName.String()} - err = dc.testZoneExists(zoneValidator) - c.Assert(err, ErrorMatches, missingRequiredInputRegex) - - // test validators fail when input global variables are undefined - zoneValidator.Inputs. - Set("project_id", MustParseExpression("var.project_id").AsValue()). - Set("zone", MustParseExpression("var.zone").AsValue()) - c.Assert(dc.testZoneExists(zoneValidator), NotNil) - - dc.Config.Vars.Set("project_id", cty.StringVal("invalid-project")) - c.Assert(dc.testZoneExists(zoneValidator), NotNil) - - // TODO: implement a mock client to test success of test_zone_exists -} - -func (s *MySuite) TestZoneInRegionValidator(c *C) { - var err error - dc := getDeploymentConfigForTest() - emptyValidator := validatorConfig{} - - // test validator fails for config without validator id - err = dc.testZoneInRegion(emptyValidator) - c.Assert(err, ErrorMatches, passedWrongValidatorRegex) - - // test validator fails for config without any inputs - zoneInRegionValidator := validatorConfig{Validator: testZoneInRegionName.String()} - err = dc.testZoneInRegion(zoneInRegionValidator) - c.Assert(err, ErrorMatches, missingRequiredInputRegex) - - // test validators fail when input global variables are undefined - zoneInRegionValidator.Inputs. - Set("project_id", MustParseExpression("var.project_id").AsValue()). - Set("region", MustParseExpression("var.region").AsValue()). 
- Set("zone", MustParseExpression("var.zone").AsValue()) - c.Assert(dc.testZoneInRegion(zoneInRegionValidator), NotNil) - - dc.Config.Vars.Set("project_id", cty.StringVal("invalid-project")) - c.Assert(dc.testZoneInRegion(zoneInRegionValidator), NotNil) - - dc.Config.Vars.Set("zone", cty.StringVal("invalid-zone")) - c.Assert(dc.testZoneInRegion(zoneInRegionValidator), NotNil) - - // TODO: implement a mock client to test success of test_zone_in_region -} diff --git a/pkg/modulewriter/tfversions.go b/pkg/modulewriter/tfversions.go index 4efa6f8275..f825b264dc 100644 --- a/pkg/modulewriter/tfversions.go +++ b/pkg/modulewriter/tfversions.go @@ -21,11 +21,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.76.0" + version = "~> 4.78.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.76.0" + version = "~> 4.78.0" } } } diff --git a/pkg/validators/quota.go b/pkg/validators/quota.go index 5e490428ef..f742287745 100644 --- a/pkg/validators/quota.go +++ b/pkg/validators/quota.go @@ -17,21 +17,26 @@ package validators import ( "context" "fmt" + "hpc-toolkit/pkg/config" + "strings" "time" + "github.com/zclconf/go-cty/cty" + "github.com/zclconf/go-cty/cty/convert" + "github.com/zclconf/go-cty/cty/gocty" cm "google.golang.org/api/monitoring/v3" sub "google.golang.org/api/serviceusage/v1beta1" ) // ResourceRequirement represents an amount of desired resource. type ResourceRequirement struct { - Consumer string // e.g. "projects/myprojectid"" - Service string // e.g. "compute.googleapis.com" - Metric string // e.g. "compute.googleapis.com/disks_total_storage" - Required int64 - Dimensions map[string]string // e.g. {"region": "us-central1"} + Consumer string `cty:"consumer"` // e.g. "projects/myprojectid"" + Service string `cty:"service"` // e.g. "compute.googleapis.com" + Metric string `cty:"metric"` // e.g. "compute.googleapis.com/disks_total_storage" + Required int64 `cty:"required"` + Dimensions map[string]string `cty:"dimensions"` // e.g. {"region": "us-central1"} // How this requirement should be aggregated with other requirements in the same bucket. - Aggregation string + Aggregation string `cty:"aggregation"` } // InBucket returns true if the quota is in the QuotaBucket. @@ -51,15 +56,23 @@ type QuotaError struct { Metric string Dimensions map[string]string EffectiveLimit int64 + Usage int64 Requested int64 } func (e QuotaError) Error() string { - return fmt.Sprintf("QuotaError: %#v", e) + loc := "" + if len(e.Dimensions) > 0 { + loc = fmt.Sprintf(" in %v", e.Dimensions) + } + rhs := fmt.Sprintf("requested=%d", e.Requested) + if e.Usage > 0 { + rhs = fmt.Sprintf("requested=%d + usage=%d", e.Requested, e.Usage) + } + return fmt.Sprintf("not enough quota for resource %q%s, limit=%d < %s", e.Metric, loc, e.EffectiveLimit, rhs) } -// ValidateQuotas validates the resource requirements. 
-func ValidateQuotas(rs []ResourceRequirement) ([]QuotaError, error) { +func validateResourceRequirements(rs []ResourceRequirement, up *usageProvider) ([]QuotaError, error) { qe := []QuotaError{} // Group by Consumer and Service type gk struct { @@ -78,7 +91,7 @@ func ValidateQuotas(rs []ResourceRequirement) ([]QuotaError, error) { if err != nil { return qe, err } - qse, err := validateServiceLimits(g, ls) + qse, err := validateServiceLimits(g, ls, up) if err != nil { return qe, err } @@ -88,7 +101,7 @@ func ValidateQuotas(rs []ResourceRequirement) ([]QuotaError, error) { return qe, nil } -func validateServiceLimits(rs []ResourceRequirement, ls []*sub.ConsumerQuotaMetric) ([]QuotaError, error) { +func validateServiceLimits(rs []ResourceRequirement, ls []*sub.ConsumerQuotaMetric, up *usageProvider) ([]QuotaError, error) { // Group by Metric and Aggregation type gk struct { Metric string @@ -119,14 +132,14 @@ func validateServiceLimits(rs []ResourceRequirement, ls []*sub.ConsumerQuotaMetr } for _, limit := range ml { - qle := validateLimit(g, limit, agg) + qle := validateLimit(g, limit, up, agg) qe = append(qe, qle...) } } return qe, nil } -func validateLimit(rs []ResourceRequirement, limit *sub.ConsumerQuotaLimit, agg aggFn) []QuotaError { +func validateLimit(rs []ResourceRequirement, limit *sub.ConsumerQuotaLimit, up *usageProvider, agg aggFn) []QuotaError { qe := []QuotaError{} for _, bucket := range limit.QuotaBuckets { vs := []int64{} @@ -138,9 +151,10 @@ func validateLimit(rs []ResourceRequirement, limit *sub.ConsumerQuotaLimit, agg if len(vs) == 0 { continue } + usage := up.Usage(limit.Metric, bucket.Dimensions["region"], bucket.Dimensions["zone"]) required := agg(vs) for _, r := range required { - if !satisfied(r, bucket.EffectiveLimit) { + if !satisfied(r+usage, bucket.EffectiveLimit) { r0 := rs[0] // all should have the same consumer, service and metric qe = append(qe, QuotaError{ Consumer: r0.Consumer, @@ -148,6 +162,7 @@ func validateLimit(rs []ResourceRequirement, limit *sub.ConsumerQuotaLimit, agg Metric: r0.Metric, Dimensions: bucket.Dimensions, EffectiveLimit: bucket.EffectiveLimit, + Usage: usage, Requested: r, }) } @@ -250,7 +265,7 @@ func newUsageProvider(projectID string) (usageProvider, error) { u := map[usageKey]int64{} err = s.Projects.TimeSeries.List("projects/"+projectID). - Filter(`metric.type="serviceruntime.googleapis.com/quota/allocation/usage" resource.type="consumer_quota"`). + Filter(`metric.type="serviceruntime.googleapis.com/quota/allocation/usage"`). IntervalEndTime(time.Now().Format(time.RFC3339)). // Quota usage metrics get duplicated once a day IntervalStartTime(time.Now().Add(-24*time.Hour).Format(time.RFC3339)). 
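The key change in validateLimit is that headroom is now computed against current consumption, not just the new request: the usage reported by the usageProvider is added to the aggregated requirement before comparing with bucket.EffectiveLimit. A minimal sketch of that arithmetic; the satisfied body is an assumption here (the diff does not show it), with -1 treated as unlimited per the Service Usage API convention:

package main

import "fmt"

// satisfied is an assumed stand-in for the unexported helper used above;
// a limit of -1 is taken to mean "unlimited".
func satisfied(requested, limit int64) bool {
	if limit == -1 {
		return true
	}
	return requested <= limit
}

func main() {
	var (
		usage          int64 = 8  // prior consumption from the usageProvider
		requested      int64 = 20 // what the blueprint would additionally consume
		effectiveLimit int64 = 24 // bucket.EffectiveLimit
	)
	if !satisfied(requested+usage, effectiveLimit) {
		// mirrors the new QuotaError.Error formatting
		fmt.Printf("not enough quota: limit=%d < requested=%d + usage=%d\n",
			effectiveLimit, requested, usage)
	}
}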
@@ -271,3 +286,121 @@ func newUsageProvider(projectID string) (usageProvider, error) { } return usageProvider{u}, nil } + +type rrInputs struct { + Requirements []ResourceRequirement `cty:"requirements"` + IgnoreUsage bool `cty:"ignore_usage"` +} + +func ifNull(v cty.Value, d cty.Value) cty.Value { + if v.IsNull() { + return d + } + return v +} + +func extractServiceName(metric string) (string, error) { + // metric is in the form of "service.googleapis.com/metric" + // we want to extract the "service.googleapis.com" part + parts := strings.Split(metric, "/") + if len(parts) < 2 { + return "", fmt.Errorf("can not deduce service from metric %q", metric) + } + return parts[0], nil +} + +func parseResourceRequirementsInputs(bp config.Blueprint, inputs config.Dict) (rrInputs, error) { + // sanitize inputs dict by matching with type + rty := cty.ObjectWithOptionalAttrs(map[string]cty.Type{ + "metric": cty.String, + "service": cty.String, + "consumer": cty.String, + "required": cty.Number, + "aggregation": cty.String, + "dimensions": cty.Map(cty.String), + }, + /*optional=*/ []string{"service", "consumer", "aggregation", "dimensions"}) + ity := cty.ObjectWithOptionalAttrs(map[string]cty.Type{ + "requirements": cty.List(rty), + "ignore_usage": cty.Bool, + }, + /*optional=*/ []string{"ignore_usage"}) + clean, err := convert.Convert(inputs.AsObject(), ity) + if err != nil { + return rrInputs{}, err + } + + // fill in default values + ignoreUsage := ifNull(clean.GetAttr("ignore_usage"), cty.False) + projectID, err := bp.ProjectID() + if err != nil { + return rrInputs{}, err + } + reqs := []cty.Value{} + rit := clean.GetAttr("requirements").ElementIterator() + for rit.Next() { + _, r := rit.Element() + defConsumer := fmt.Sprintf("projects/%s", projectID) + defService, err := extractServiceName(r.GetAttr("metric").AsString()) + if err != nil { + return rrInputs{}, err + } + defDims := map[string]cty.Value{} + if bp.Vars.Has("region") { + defDims["region"] = bp.Vars.Get("region") + } + if bp.Vars.Has("zone") { + defDims["zone"] = bp.Vars.Get("zone") + } + defDimsVal := cty.MapValEmpty(cty.String) + if len(defDims) > 0 { + defDimsVal = cty.MapVal(defDims) + } + + reqs = append(reqs, cty.ObjectVal(map[string]cty.Value{ + "metric": r.GetAttr("metric"), + "service": ifNull(r.GetAttr("service"), cty.StringVal(defService)), + "consumer": ifNull(r.GetAttr("consumer"), cty.StringVal(defConsumer)), + "required": r.GetAttr("required"), + "aggregation": ifNull(r.GetAttr("aggregation"), cty.StringVal("SUM")), + "dimensions": ifNull(r.GetAttr("dimensions"), defDimsVal), + })) + } + + reqsVal := cty.ListValEmpty(rty) + if len(reqs) > 0 { + reqsVal = cty.ListVal(reqs) + } + + full := cty.ObjectVal(map[string]cty.Value{ + "requirements": reqsVal, + "ignore_usage": ignoreUsage, + }) + + var s rrInputs + return s, gocty.FromCtyValue(full, &s) +} + +func testResourceRequirements(bp config.Blueprint, inputs config.Dict) error { + in, err := parseResourceRequirementsInputs(bp, inputs) + if err != nil { + return err + } + errs := config.Errors{} + up := usageProvider{} + if !in.IgnoreUsage { + p, err := bp.ProjectID() + errs.Add(err) + if p != "" { + up, err = newUsageProvider(p) + errs.Add(err) // don't terminate fallback to ignore usage + } + } + + qerrs, err := validateResourceRequirements(in.Requirements, &up) + for _, qe := range qerrs { + errs.Add(qe) + } + errs.Add(err) + return errs.OrNil() +} diff --git a/pkg/validators/quota_test.go b/pkg/validators/quota_test.go index 9704f495d4..5bdeb61b02 100644 --- 
a/pkg/validators/quota_test.go +++ b/pkg/validators/quota_test.go @@ -16,11 +16,14 @@ package validators import ( "fmt" + "hpc-toolkit/pkg/config" "sort" "testing" "github.com/google/go-cmp/cmp" + "github.com/zclconf/go-cty/cty" sub "google.golang.org/api/serviceusage/v1beta1" + "gopkg.in/yaml.v3" ) func TestAggregation(t *testing.T) { @@ -146,10 +149,11 @@ func TestValidateServiceLimits(t *testing.T) { Aggregation: "SUM", }, } + up := usageProvider{} want := []QuotaError{ - {Metric: "pony", Dimensions: nil, EffectiveLimit: 5, Requested: 11}, {Metric: "pony", Dimensions: map[string]string{"green": "eggs"}, EffectiveLimit: 3, Requested: 4}, + {Metric: "pony", Dimensions: nil, EffectiveLimit: 5, Requested: 11}, } got, err := validateServiceLimits(quotas, []*sub.ConsumerQuotaMetric{ { @@ -157,7 +161,7 @@ func TestValidateServiceLimits(t *testing.T) { ConsumerQuotaLimits: []*sub.ConsumerQuotaLimit{ {Metric: "pony", QuotaBuckets: buckets}}, }, - }) + }, &up) if err != nil { t.Errorf("got unexpected error: %s", err) @@ -201,3 +205,78 @@ func TestUsageProviderGet(t *testing.T) { }) } } + +func TestParseResourceRequirementsInputs(t *testing.T) { + type test struct { + yml string + want rrInputs + err bool + } + tests := []test{ + {`# empty +requirements: []`, rrInputs{Requirements: []ResourceRequirement{}}, false}, + {`# complete +ignore_usage: true +requirements: +- metric: pony.api/friendship + consumer: redhat + service: zebra.api + required: 22 + dimensions: {"x": "y", "left": "right"} + aggregation: "SUM"`, rrInputs{ + IgnoreUsage: true, + Requirements: []ResourceRequirement{ + { + Metric: "pony.api/friendship", + Consumer: "redhat", + Service: "zebra.api", + Required: 22, + Dimensions: map[string]string{ + "x": "y", + "left": "right", + }, + Aggregation: "SUM", + }, + }, + }, false}, + {`# fill in +requirements: +- metric: pony.api/friendship + required: 33`, rrInputs{ + IgnoreUsage: false, + Requirements: []ResourceRequirement{ + { + Metric: "pony.api/friendship", + Service: "pony.api", + Consumer: "projects/apple", + Required: 33, + Dimensions: map[string]string{ + "region": "narnia", + "zone": "narnia-51", + }, + Aggregation: "SUM", + }, + }, + }, false}, + } + for _, tc := range tests { + t.Run(fmt.Sprintf("%s", tc.yml), func(t *testing.T) { + var in config.Dict + bp := config.Blueprint{} + bp.Vars. + Set("project_id", cty.StringVal("apple")). + Set("region", cty.StringVal("narnia")). 
+ Set("zone", cty.StringVal("narnia-51")) + if err := yaml.Unmarshal([]byte(tc.yml), &in); err != nil { + t.Fatal("failed to unmarshal yaml") + } + rr, err := parseResourceRequirementsInputs(bp, in) + if (err == nil) == tc.err { + t.Fatalf("unexpected error: %v", err) + } + if diff := cmp.Diff(tc.want, rr); diff != "" { + t.Errorf("diff (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/validators/validators.go b/pkg/validators/validators.go index 597e9732c4..26da80fba4 100644 --- a/pkg/validators/validators.go +++ b/pkg/validators/validators.go @@ -18,9 +18,11 @@ import ( "context" "errors" "fmt" - "log" + "hpc-toolkit/pkg/config" "strings" + "github.com/zclconf/go-cty/cty" + "golang.org/x/exp/maps" compute "google.golang.org/api/compute/v1" "google.golang.org/api/googleapi" "google.golang.org/api/option" @@ -35,51 +37,20 @@ const zoneInRegionError = "zone %s is not in region %s in project ID %s or your const computeDisabledError = "Compute Engine API has not been used in project" const computeDisabledMsg = "the Compute Engine API must be enabled in project %s to validate blueprint global variables" const serviceDisabledMsg = "the Service Usage API must be enabled in project %s to validate that all APIs needed by the blueprint are enabled" -const unusedModuleMsg = "module %s uses module %s, but matching setting and outputs were not found. This may be because the value is set explicitly or set by a prior used module" -const unusedModuleError = "One or more used modules could not have their settings and outputs linked." -const unusedDeploymentVariableMsg = "the deployment variable \"%s\" was not used in this blueprint" -const unusedDeploymentVariableError = "one or more deployment variables was not used by any modules" +const unusedModuleMsg = "module %q uses module %q, but matching setting and outputs were not found. This may be because the value is set explicitly or set by a prior used module" func handleClientError(e error) error { if strings.Contains(e.Error(), "could not find default credentials") { - log.Println("load application default credentials following instructions at https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/README.md#supplying-cloud-credentials-to-terraform") - return fmt.Errorf("could not find application default credentials") - + return hint( + fmt.Errorf("could not find application default credentials"), + "load application default credentials following instructions at https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/README.md#supplying-cloud-credentials-to-terraform") } return e } -// TestDeploymentVariablesNotUsed errors if there are any unused deployment -// variables and prints any to the output for the user -func TestDeploymentVariablesNotUsed(unusedVariables []string) error { - for _, v := range unusedVariables { - log.Printf(unusedDeploymentVariableMsg, v) - } - - if len(unusedVariables) > 0 { - return fmt.Errorf(unusedDeploymentVariableError) - } - - return nil -} - -// TestModuleNotUsed validates that all modules referenced in the "use" field -// of the blueprint are actually used, i.e. the outputs and settings are -// connected. 
-func TestModuleNotUsed(unusedModules map[string][]string) error {
-	any := false
-	for mod, unusedMods := range unusedModules {
-		for _, unusedMod := range unusedMods {
-			log.Printf(unusedModuleMsg, mod, unusedMod)
-			any = true
-		}
-	}
-
-	if any {
-		return fmt.Errorf(unusedModuleError)
-	}
-
-	return nil
+// TODO: use the HintError type once it's implemented
+func hint(err error, h string) error {
+	return fmt.Errorf("%w\n%s", err, h)
 }
 
 // TestApisEnabled tests whether APIs are enabled in given project
@@ -93,8 +64,7 @@ func TestApisEnabled(projectID string, requiredAPIs []string) error {
 
 	s, err := serviceusage.NewService(ctx, option.WithQuotaProject(projectID))
 	if err != nil {
-		err = handleClientError(err)
-		return err
+		return handleClientError(err)
 	}
 
 	prefix := "projects/" + projectID
@@ -115,8 +85,9 @@ func TestApisEnabled(projectID string, requiredAPIs []string) error {
 		}
 		switch reason {
 		case "SERVICE_DISABLED":
-			log.Printf(enableAPImsg, "serviceusage.googleapis.com", projectID)
-			return fmt.Errorf(serviceDisabledMsg, projectID)
+			return hint(
+				fmt.Errorf(serviceDisabledMsg, projectID),
+				fmt.Sprintf(enableAPImsg, "serviceusage.googleapis.com", projectID))
 		case "SERVICE_CONFIG_NOT_FOUND_OR_PERMISSION_DENIED":
 			return fmt.Errorf("service %s does not exist in project %s", metadata["services"], projectID)
 		case "USER_PROJECT_DENIED":
@@ -129,18 +100,15 @@ func TestApisEnabled(projectID string, requiredAPIs []string) error {
 		}
 	}
 
-	var errored bool
+	errs := config.Errors{}
 	for _, service := range resp.Services {
 		if service.State == "DISABLED" {
-			errored = true
-			log.Printf("%s: service is disabled in project %s", service.Config.Name, projectID)
-			log.Printf(enableAPImsg, service.Config.Name, projectID)
+			errs.Add(hint(
+				fmt.Errorf("%s: service is disabled in project %s", service.Config.Name, projectID),
+				fmt.Sprintf(enableAPImsg, service.Config.Name, projectID)))
 		}
 	}
-	if errored {
-		return fmt.Errorf("one or more required APIs are disabled in project %s, please enable them as instructed above", projectID)
-	}
-	return nil
+	return errs.OrNil()
 }
 
 // TestProjectExists whether projectID exists / is accessible with credentials
@@ -154,10 +122,14 @@ func TestProjectExists(projectID string) error {
 	_, err = s.Projects.Get(projectID).Fields().Do()
 	if err != nil {
 		if strings.Contains(err.Error(), computeDisabledError) {
-			log.Printf(computeDisabledMsg, projectID)
-			log.Printf(serviceDisabledMsg, projectID)
-			log.Printf(enableAPImsg, "serviceusage.googleapis.com", projectID)
-			return fmt.Errorf(enableAPImsg, "compute.googleapis.com", projectID)
+			errs := config.Errors{}
+			return errs.
+				Add(hint(
+					fmt.Errorf(computeDisabledMsg, projectID),
+					fmt.Sprintf(enableAPImsg, "serviceusage.googleapis.com", projectID))).
+ Add(hint( + fmt.Errorf(serviceDisabledMsg, projectID), + fmt.Sprintf(enableAPImsg, "serviceusage.googleapis.com", projectID))) } return fmt.Errorf(projectError, projectID) } @@ -233,3 +205,279 @@ func TestZoneInRegion(projectID string, zone string, region string) error { return nil } + +const ( + testApisEnabledName = "test_apis_enabled" + testProjectExistsName = "test_project_exists" + testRegionExistsName = "test_region_exists" + testZoneExistsName = "test_zone_exists" + testZoneInRegionName = "test_zone_in_region" + testModuleNotUsedName = "test_module_not_used" + testDeploymentVariableNotUsedName = "test_deployment_variable_not_used" + testResourceRequirementsName = "test_resource_requirements" +) + +func implementations() map[string]func(config.Blueprint, config.Dict) error { + return map[string]func(config.Blueprint, config.Dict) error{ + testApisEnabledName: testApisEnabled, + testProjectExistsName: testProjectExists, + testRegionExistsName: testRegionExists, + testZoneExistsName: testZoneExists, + testZoneInRegionName: testZoneInRegion, + testModuleNotUsedName: testModuleNotUsed, + testDeploymentVariableNotUsedName: testDeploymentVariableNotUsed, + testResourceRequirementsName: testResourceRequirements, + } +} + +// ValidatorError is an error wrapper for errors that occurred during validation +type ValidatorError struct { + Validator string + Err error +} + +func (e ValidatorError) Unwrap() error { + return e.Err +} + +func (e ValidatorError) Error() string { + return fmt.Sprintf("validator %q failed:\n%v", e.Validator, e.Err) +} + +// Execute runs all validators on the blueprint +func Execute(bp config.Blueprint) error { + if bp.ValidationLevel == config.ValidationIgnore { + return nil + } + impl := implementations() + errs := config.Errors{} + for iv, v := range validators(bp) { + p := config.Root.Validators.At(iv) + if v.Skip { + continue + } + + f, ok := impl[v.Validator] + if !ok { + errs.At(p.Validator, fmt.Errorf("unknown validator %q", v.Validator)) + continue + } + + inp, err := v.Inputs.Eval(bp) + if err != nil { + errs.At(p.Inputs, err) + continue + } + + if err := f(bp, inp); err != nil { + errs.Add(ValidatorError{v.Validator, err}) + // do not bother running further validators if project ID could not be found + if v.Validator == "test_project_exists" { + break + } + } + } + return errs.OrNil() +} + +func checkInputs(inputs config.Dict, required []string) error { + errs := config.Errors{} + for _, inp := range required { + if !inputs.Has(inp) { + errs.Add(fmt.Errorf("a required input %q was not provided", inp)) + } + } + + if errs.Any() { + return errs + } + + // ensure that no extra inputs were provided by comparing length + if len(required) != len(inputs.Items()) { + errStr := "only %v inputs %s should be provided" + return fmt.Errorf(errStr, len(required), required) + } + + return nil +} + +func testApisEnabled(bp config.Blueprint, inputs config.Dict) error { + if err := checkInputs(inputs, []string{}); err != nil { + return err + } + p, err := bp.ProjectID() + if err != nil { + return err + } + apis := map[string]bool{} + bp.WalkModules(func(m *config.Module) error { + for _, api := range m.InfoOrDie().RequiredApis { + apis[api] = true + } + return nil + }) + return TestApisEnabled(p, maps.Keys(apis)) +} + +func testProjectExists(bp config.Blueprint, inputs config.Dict) error { + if err := checkInputs(inputs, []string{"project_id"}); err != nil { + return err + } + m, err := inputsAsStrings(inputs) + if err != nil { + return err + } + return 
TestProjectExists(m["project_id"])
+}
+
+func testRegionExists(bp config.Blueprint, inputs config.Dict) error {
+	if err := checkInputs(inputs, []string{"project_id", "region"}); err != nil {
+		return err
+	}
+	m, err := inputsAsStrings(inputs)
+	if err != nil {
+		return err
+	}
+	return TestRegionExists(m["project_id"], m["region"])
+}
+
+func testZoneExists(bp config.Blueprint, inputs config.Dict) error {
+	if err := checkInputs(inputs, []string{"project_id", "zone"}); err != nil {
+		return err
+	}
+	m, err := inputsAsStrings(inputs)
+	if err != nil {
+		return err
+	}
+	return TestZoneExists(m["project_id"], m["zone"])
+}
+
+func testZoneInRegion(bp config.Blueprint, inputs config.Dict) error {
+	if err := checkInputs(inputs, []string{"project_id", "region", "zone"}); err != nil {
+		return err
+	}
+	m, err := inputsAsStrings(inputs)
+	if err != nil {
+		return err
+	}
+	return TestZoneInRegion(m["project_id"], m["zone"], m["region"])
+}
+
+func testModuleNotUsed(bp config.Blueprint, inputs config.Dict) error {
+	if err := checkInputs(inputs, []string{}); err != nil {
+		return err
+	}
+	errs := config.Errors{}
+	bp.WalkModules(func(m *config.Module) error {
+		for _, u := range m.ListUnusedModules() {
+			// TODO: add yaml position to the error
+			errs.Add(fmt.Errorf(unusedModuleMsg, m.ID, u))
+		}
+		return nil
+	})
+	return errs.OrNil()
+}
+
+func testDeploymentVariableNotUsed(bp config.Blueprint, inputs config.Dict) error {
+	if err := checkInputs(inputs, []string{}); err != nil {
+		return err
+	}
+	errs := config.Errors{}
+	for _, v := range bp.ListUnusedVariables() {
+		errs.At(
+			config.Root.Vars.Dot(v),
+			fmt.Errorf("the variable %q was not used in this blueprint", v))
+	}
+	return errs.OrNil()
+}
+
+// Helper function to ensure that all input values are strings.
+func inputsAsStrings(inputs config.Dict) (map[string]string, error) {
+	ms := map[string]string{}
+	for k, v := range inputs.Items() {
+		if v.Type() != cty.String {
+			return nil, fmt.Errorf("validator inputs must be strings, %s is a %s", k, v.Type())
+		}
+		ms[k] = v.AsString()
+	}
+	return ms, nil
+}
+
+// Creates a list of default validators for the given blueprint by
+// inspecting the blueprint for global variables that exist and adding the appropriate validators.
+func defaults(bp config.Blueprint) []config.Validator {
+	projectIDExists := bp.Vars.Has("project_id")
+	projectRef := config.GlobalRef("project_id").AsExpression().AsValue()
+
+	regionExists := bp.Vars.Has("region")
+	regionRef := config.GlobalRef("region").AsExpression().AsValue()
+
+	zoneExists := bp.Vars.Has("zone")
+	zoneRef := config.GlobalRef("zone").AsExpression().AsValue()
+
+	defaults := []config.Validator{
+		{Validator: testModuleNotUsedName},
+		{Validator: testDeploymentVariableNotUsedName}}
+
+	// always add the project ID validator before subsequent validators that can
+	// only succeed if credentials can access the project. If the project ID
+	// validator fails, all remaining validators are not executed.
+ if projectIDExists { + defaults = append(defaults, config.Validator{ + Validator: testProjectExistsName, + Inputs: config.NewDict(map[string]cty.Value{"project_id": projectRef}), + }) + } + + // it is safe to run this validator even if vars.project_id is undefined; + // it will likely fail but will do so helpfully to the user + defaults = append(defaults, + config.Validator{Validator: testApisEnabledName}) + + if projectIDExists && regionExists { + defaults = append(defaults, config.Validator{ + Validator: testRegionExistsName, + Inputs: config.NewDict(map[string]cty.Value{ + "project_id": projectRef, + "region": regionRef, + }, + )}) + } + + if projectIDExists && zoneExists { + defaults = append(defaults, config.Validator{ + Validator: testZoneExistsName, + Inputs: config.NewDict(map[string]cty.Value{ + "project_id": projectRef, + "zone": zoneRef, + }), + }) + } + + if projectIDExists && regionExists && zoneExists { + defaults = append(defaults, config.Validator{ + Validator: testZoneInRegionName, + Inputs: config.NewDict(map[string]cty.Value{ + "project_id": projectRef, + "region": regionRef, + "zone": zoneRef, + }), + }) + } + return defaults +} + +// Returns a list of validators for the given blueprint with any default validators appended. +func validators(bp config.Blueprint) []config.Validator { + used := map[string]bool{} + for _, v := range bp.Validators { + used[v.Validator] = true + } + vs := append([]config.Validator{}, bp.Validators...) // clone + for _, v := range defaults(bp) { + if !used[v.Validator] { + vs = append(vs, v) + } + } + return vs +} diff --git a/pkg/validators/validators_test.go b/pkg/validators/validators_test.go new file mode 100644 index 0000000000..d9389f19a6 --- /dev/null +++ b/pkg/validators/validators_test.go @@ -0,0 +1,143 @@ +// Copyright 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package validators + +import ( + "hpc-toolkit/pkg/config" + "testing" + + "github.com/zclconf/go-cty/cty" + . 
"gopkg.in/check.v1" +) + +// Setup GoCheck +type MySuite struct{} + +var _ = Suite(&MySuite{}) + +func Test(t *testing.T) { + TestingT(t) +} + +func (s *MySuite) TestCheckInputs(c *C) { + { // OK: Inputs is equal to required inputs without regard to ordering + i := config.NewDict(map[string]cty.Value{ + "in0": cty.NilVal, + "in1": cty.NilVal}) + c.Check(checkInputs(i, []string{"in0", "in1"}), IsNil) + c.Check(checkInputs(i, []string{"in1", "in0"}), IsNil) + } + + { // FAIL: inputs are a proper subset of required inputs + i := config.NewDict(map[string]cty.Value{ + "in0": cty.NilVal, + "in1": cty.NilVal}) + err := checkInputs(i, []string{"in0", "in1", "in2"}) + c.Check(err, NotNil) + } + + { // FAIL: inputs intersect with required inputs but are not a proper subset + i := config.NewDict(map[string]cty.Value{ + "in0": cty.NilVal, + "in1": cty.NilVal, + "in3": cty.NilVal}) + err := checkInputs(i, []string{"in0", "in1", "in2"}) + c.Check(err, NotNil) + } + + { // FAIL inputs are a proper superset of required inputs + i := config.NewDict(map[string]cty.Value{ + "in0": cty.NilVal, + "in1": cty.NilVal, + "in2": cty.NilVal, + "in3": cty.NilVal}) + err := checkInputs(i, []string{"in0", "in1", "in2"}) + c.Check(err, ErrorMatches, "only 3 inputs \\[in0 in1 in2\\] should be provided") + } +} + +func (s *MySuite) TestDefaultValidators(c *C) { + unusedMods := config.Validator{Validator: "test_module_not_used"} + unusedVars := config.Validator{Validator: "test_deployment_variable_not_used"} + apisEnabled := config.Validator{Validator: "test_apis_enabled"} + + projectRef := config.GlobalRef("project_id").AsExpression().AsValue() + regionRef := config.GlobalRef("region").AsExpression().AsValue() + zoneRef := config.GlobalRef("zone").AsExpression().AsValue() + + projectExists := config.Validator{ + Validator: testProjectExistsName, + Inputs: config.NewDict(map[string]cty.Value{"project_id": projectRef})} + regionExists := config.Validator{ + Validator: testRegionExistsName, + Inputs: config.NewDict(map[string]cty.Value{ + "project_id": projectRef, + "region": regionRef})} + zoneExists := config.Validator{ + Validator: testZoneExistsName, + Inputs: config.NewDict(map[string]cty.Value{ + "project_id": projectRef, + "zone": zoneRef})} + zoneInRegion := config.Validator{ + Validator: testZoneInRegionName, + Inputs: config.NewDict(map[string]cty.Value{ + "project_id": projectRef, + "region": regionRef, + "zone": zoneRef})} + + { + bp := config.Blueprint{} + c.Check(defaults(bp), DeepEquals, []config.Validator{ + unusedMods, unusedVars, apisEnabled}) + } + + { + bp := config.Blueprint{} + bp.Vars.Set("project_id", cty.StringVal("f00b")) + c.Check(defaults(bp), DeepEquals, []config.Validator{ + unusedMods, unusedVars, projectExists, apisEnabled}) + } + + { + bp := config.Blueprint{} + bp.Vars. + Set("project_id", cty.StringVal("f00b")). + Set("region", cty.StringVal("narnia")) + + c.Check(defaults(bp), DeepEquals, []config.Validator{ + unusedMods, unusedVars, projectExists, apisEnabled, regionExists}) + } + + { + bp := config.Blueprint{} + bp.Vars. + Set("project_id", cty.StringVal("f00b")). + Set("zone", cty.StringVal("danger")) + + c.Check(defaults(bp), DeepEquals, []config.Validator{ + unusedMods, unusedVars, projectExists, apisEnabled, zoneExists}) + } + + { + bp := config.Blueprint{} + bp.Vars. + Set("project_id", cty.StringVal("f00b")). + Set("region", cty.StringVal("narnia")). 
+ Set("zone", cty.StringVal("danger")) + + c.Check(defaults(bp), DeepEquals, []config.Validator{ + unusedMods, unusedVars, projectExists, apisEnabled, regionExists, zoneExists, zoneInRegion}) + } +} diff --git a/tools/cloud-build/Dockerfile b/tools/cloud-build/Dockerfile index cafb8d027e..6104217f3e 100644 --- a/tools/cloud-build/Dockerfile +++ b/tools/cloud-build/Dockerfile @@ -50,7 +50,7 @@ WORKDIR /ghpc-tmp COPY ./ ./ RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.5/scripts/requirements.txt && \ + pip install --no-cache-dir -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.6/scripts/requirements.txt && \ pip install --no-cache-dir -r tools/cloud-build/requirements.txt && \ rm -rf ~/.cache/pip/* diff --git a/tools/cloud-build/babysit_tests.py b/tools/cloud-build/babysit_tests.py index 0623ddee67..38141aacd6 100755 --- a/tools/cloud-build/babysit_tests.py +++ b/tools/cloud-build/babysit_tests.py @@ -22,7 +22,7 @@ DESCRIPTION = """ babysit_tests is a tool to approve & retry CloudBuild tests. -It monitors status of builds referenced by PR commit SHA, +It monitors status of builds referenced by PR commit SHA, it will approve and retry tests accoding to configured concurrency and retry policies. The tool will terminate itself once there is no more actions to take or no reasons to wait for status changes. The subset of tests to monitor can be configured by using test_selectors, e.g. "all", exact_name_of_test. @@ -65,11 +65,16 @@ def selector(build: Build) -> bool: "PR-test-slurm-gcp-v5-startup-scripts", "PR-test-slurm-gcp-v5-ubuntu2004", "PR-test-hpc-enterprise-slurm", + "PR-test-hpc-slurm-chromedesktop", + "PR-test-lustre-slurm", ]), "spack": selector_by_name([ "PR-test-batch-mpi", "PR-test-spack-gromacs", ]), + "vm": selector_by_name([ + "PR-test-lustre-vm", + ]), } diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml index 0c424bb20d..c684defbf0 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml @@ -184,6 +184,13 @@ ## Always cleanup, even on failure always: + - name: Get partition info after test + ansible.builtin.command: sinfo + changed_when: False + register: partition_post_run_output + - name: Print Slurm sinfo output + ansible.builtin.debug: + var: partition_post_run_output.stdout_lines - name: Recover Setup Log become: true changed_when: false diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-crd.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-crd.yml new file mode 100644 index 0000000000..808fd12484 --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-crd.yml @@ -0,0 +1,75 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+---
+- name: Assert variables are defined
+  ansible.builtin.assert:
+    that:
+    - custom_vars.partitions is defined
+
+- name: Get partition info
+  ansible.builtin.command: sinfo --format='%P' --noheader
+  changed_when: False
+  register: partition_output
+  retries: 10
+  delay: 12
+  until: partition_output.rc == 0
+
+- name: Count Slurm nodes
+  ansible.builtin.shell:
+    sinfo -t 'IDLE&POWERED_DOWN' --noheader --format "%n"
+  args:
+    executable: /bin/bash
+  changed_when: False
+  register: initial_node_count
+
+- name: Check partitions exist
+  ansible.builtin.fail:
+    msg: Test Check Partitions failed
+  when: item not in partition_output.stdout
+  loop: "{{ custom_vars.partitions }}"
+
+- name: Test Chrome Remote Desktop is installed
+  register: crd_status
+  failed_when: crd_status.rc != 0
+  ansible.builtin.command: |
+    srun -N 1 -p desktop bash -c '
+    disable_sleep_yml=/usr/local/ghpc/disable-sleep.yml
+    retries=20
+    retry_interval=10
+    attempt=1
+    while [ "$attempt" -le "$retries" ]; do
+      if [ -f "$disable_sleep_yml" ]; then
+        echo "file found"
+        exit 0
+      else
+        echo "file not found, retrying..."
+        sleep $retry_interval
+        attempt=$((attempt+1))
+      fi
+    done
+    echo "file not found, retries exhausted"
+    exit 1'
+
+- name: Print debug output
+  ansible.builtin.debug:
+    var: crd_status.stdout_lines
+
+- name: Ensure all nodes are powered down
+  ansible.builtin.command: sinfo -t 'IDLE&POWERED_DOWN' --noheader --format "%n"
+  register: final_node_count
+  changed_when: False
+  until: final_node_count.stdout_lines | length == initial_node_count.stdout_lines | length
+  retries: 40
+  delay: 15
diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-lustre-slurm.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-lustre-slurm.yml
new file mode 100644
index 0000000000..a5eea0bf0a
--- /dev/null
+++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-lustre-slurm.yml
@@ -0,0 +1,39 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
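+
+# Summary of this playbook (descriptive comments, not additional tasks): it
+# exercises the shared Lustre mount from every Slurm partition by writing a
+# per-partition file via srun and then verifying its contents. Roughly
+# equivalent to running by hand, for each partition (illustration only;
+# <partition> is a placeholder and output_dir is /lustre/test in the test vars):
+#   srun -N 1 -p <partition> bash -c 'echo <partition> > /lustre/test/<partition>.txt'
+#   test "$(cat /lustre/test/<partition>.txt)" == '<partition>'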
+
+---
+
+- name: Create output folder
+  ansible.builtin.shell: |
+    [[ -d {{ custom_vars.output_dir }} ]] || sudo mkdir -m a+w {{ custom_vars.output_dir }}
+  args:
+    executable: /bin/bash
+- name: Write file for Slurm machines
+  with_items: "{{ custom_vars.partitions }}"
+  ansible.builtin.shell: |
+    srun -N 1 -p '{{ item }}' bash -c 'echo {{ item }} > {{ custom_vars.output_dir }}/{{ item }}.txt'
+  args:
+    executable: /bin/bash
+  register: write_file_output
+
+- name: Print console output
+  ansible.builtin.debug:
+    msg: "{{ write_file_output }}"
+
+- name: Ensure a file exists for each partition
+  with_items: "{{ custom_vars.partitions }}"
+  ansible.builtin.shell: |
+    test $(cat {{ custom_vars.output_dir }}/{{ item }}.txt) == '{{ item }}'
+  args:
+    executable: /bin/bash
diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-lustre-vm.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-lustre-vm.yml
new file mode 100644
index 0000000000..2778229966
--- /dev/null
+++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-lustre-vm.yml
@@ -0,0 +1,21 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+- name: Ensure a file exists for each OS type
+  with_items: "{{ custom_vars.vm_os_types }}"
+  ansible.builtin.shell: |
+    test $(cat {{ custom_vars.output_dir }}/{{ item }}.txt) == '{{ item }}'
+  args:
+    executable: /bin/bash
diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-partitions.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-partitions.yml
index afb158c4e7..ef592b6471 100644
--- a/tools/cloud-build/daily-tests/ansible_playbooks/test-partitions.yml
+++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-partitions.yml
@@ -18,7 +18,21 @@
     that:
     - custom_vars.partitions is defined
     - custom_vars.mounts is defined
-
+- name: Ensure all partitions are UP
+  # Find all unavailable partitions
+  # `| cat` is needed to ignore grep's rc==1 when no lines are selected.
+  # TODO: Make this the "ansible-native" way; options:
+  #   * `failed_when: false`
+  #   * `failed_when: partitions_down.rc == 2`
+  #   * `until: partitions_down.rc == 1`
+  ansible.builtin.shell: sinfo --noheader --all --format "%P[%a]" | grep -v "\[up\]" | cat
+  register: partitions_down
+  retries: 10
+  delay: 12
+  until: partitions_down.stdout_lines | length == 0
+- name: Print partitions down
+  ansible.builtin.debug:
+    var: partitions_down.stdout_lines
 - name: Get partition info
   ansible.builtin.command: sinfo --format='%P' --noheader
   changed_when: False
@@ -38,11 +52,10 @@
     msg: Test Check Partitions failed
   when: item not in partition_output.stdout
   loop: "{{ custom_vars.partitions }}"
-
 - name: Test partition mounts, multi-node creation, & placement (when on)
   register: srun_mounts
   changed_when: srun_mounts.rc == 0
-  ansible.builtin.command: "srun -N 2 -p {{ item }} mount"
+  ansible.builtin.command: "srun -N {{ custom_vars.num_slurm_nodes | default(2) }} -p {{ item }} mount"
   loop: "{{ custom_vars.partitions }}"
 
 - name: Fail if partitions unmounted
@@ -54,9 +67,9 @@
     label: "{{ item[1] }}"
 
 - name: Ensure all nodes are powered down
-  ansible.builtin.command: sinfo -t 'IDLE&POWERED_DOWN' --noheader --format "%n"
+  ansible.builtin.command: sinfo -t 'POWERED_DOWN' --noheader --format "%n"
   register: final_node_count
   changed_when: False
   until: final_node_count.stdout_lines | length == initial_node_count.stdout_lines | length
-  retries: 40
+  retries: 60
   delay: 15
diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml
new file mode 100644
index 0000000000..5facc181fe
--- /dev/null
+++ b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml
@@ -0,0 +1,156 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+
+blueprint_name: test-slurm-lustre
+
+vars:
+  project_id: ## Set GCP Project ID Here ##
+  deployment_name: test-slurm-lustre
+  region: us-central1
+  zone: us-central1-a
+  machine_type: c3-standard-8
+  disk_type: pd-ssd
+  # enable_placement: false
+  # on_host_maintenance: MIGRATE
+  num_nodes: 1
+
+deployment_groups:
+- group: primary
+  modules:
+
+  ###########
+  # Network #
+  ###########
+
+  # Source is an embedded resource, denoted by "resources/*" without ./, ../, /
+  # as a prefix. To refer to a local resource, prefix with ./, ../ or /
+  # Example - ./resources/network/vpc
+  - id: network1
+    source: modules/network/pre-existing-vpc
+
+  ###########
+  # Storage #
+  ###########
+
+  # This file system has an associated license cost.
+ # https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud + - id: lustre + source: community/modules/file-system/DDN-EXAScaler + use: [network1] + settings: + local_mount: /lustre + waiter: deploymentmanager + mgs: + nic_type: "GVNIC" + node_type: n2-standard-2 + node_count: 1 + node_cpu: "Intel Cascade Lake" + public_ip: true + mds: + nic_type: "GVNIC" + node_type: n2-standard-2 + node_count: 1 + node_cpu: "Intel Cascade Lake" + public_ip: true + oss: + nic_type: "GVNIC" + node_type: n2-standard-2 + node_count: 3 + node_cpu: "Intel Cascade Lake" + public_ip: true + + ############# + # Slurm VMs # + ############# + + # # Ubuntu 20.04 LTS + # - id: ubuntu_node_group + # source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + # settings: + # node_count_dynamic_max: $(vars.num_nodes) + # instance_image: + # family: slurm-gcp-5-7-ubuntu-2004-lts + # project: schedmd-slurm-public + + # - id: ubuntu_partition + # source: community/modules/compute/schedmd-slurm-gcp-v5-partition + # use: + # - network1 + # - ubuntu_node_group + # - lustre + # settings: + # partition_name: ubuntu + + # Rocky Linux 8 + - id: rocky_node_group + source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + settings: + node_count_dynamic_max: $(vars.num_nodes) + instance_image: + family: slurm-gcp-5-7-hpc-rocky-linux-8 + project: schedmd-slurm-public + + - id: rocky_partition + source: community/modules/compute/schedmd-slurm-gcp-v5-partition + use: + - network1 + - rocky_node_group + - lustre + settings: + partition_name: rocky + + # CentOS 7 + - id: centos_node_group + source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + settings: + node_count_dynamic_max: $(vars.num_nodes) + instance_image: + family: slurm-gcp-5-7-hpc-centos-7 + project: schedmd-slurm-public + + - id: centos_partition + source: community/modules/compute/schedmd-slurm-gcp-v5-partition + use: + - network1 + - centos_node_group + - lustre + settings: + partition_name: centos + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller + use: + - network1 + # - ubuntu_partition + - rocky_partition + - centos_partition + - lustre + settings: + disable_controller_public_ips: false + # cloud_parameters: + # no_comma_params: false + # resume_rate: 0 + # resume_timeout: 1200 + # suspend_rate: 0 + # suspend_timeout: 1200 + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v5-login + use: + - network1 + - slurm_controller + settings: + disable_login_public_ips: false diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml new file mode 100644 index 0000000000..f0de5de8b7 --- /dev/null +++ b/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml @@ -0,0 +1,147 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
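+
+# Summary of this blueprint (descriptive comments, not additional settings):
+# it brings up a DDN EXAScaler (Lustre) file system on a pre-existing VPC and
+# mounts it at /lustre on plain vm-instance workstations (CentOS 7 and Rocky
+# Linux 8; Ubuntu is left commented out). A startup script writes
+# /lustre/test/<os>.txt on each VM, which test-lustre-vm.yml verifies later.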
+ +--- + +blueprint_name: test-workstation-lustre + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: test-workstation-lustre + region: us-central1 + zone: us-central1-a + machine_type: c3-standard-8 + disk_type: pd-ssd + instance_count: 1 + +deployment_groups: +- group: primary + modules: + + ########### + # Network # + ########### + + # Source is an embedded resource, denoted by "resources/*" without ./, ../, / + # as a prefix. To refer to a local resource, prefix with ./, ../ or / + # Example - ./resources/network/vpc + - id: network1 + source: modules/network/pre-existing-vpc + + ########### + # Storage # + ########### + + # This file system has an associated license cost. + # https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud + - id: lustre + source: community/modules/file-system/DDN-EXAScaler + use: [network1] + settings: + local_mount: /lustre + waiter: deploymentmanager + mgs: + nic_type: "GVNIC" + node_type: n2-standard-2 + node_count: 1 + node_cpu: "Intel Cascade Lake" + public_ip: true + mds: + nic_type: "GVNIC" + node_type: n2-standard-2 + node_count: 1 + node_cpu: "Intel Cascade Lake" + public_ip: true + oss: + nic_type: "GVNIC" + node_type: n2-standard-2 + node_count: 3 + node_cpu: "Intel Cascade Lake" + public_ip: true + + ################### + # Startup Scripts # + ################### + + - id: startup-script + source: modules/scripts/startup-script + settings: + install_ansible: true + runners: + - type: shell + destination: startup.sh + content: | + #!/bin/bash + set -ex + + output_dir=/lustre/test + [[ ! -d $output_dir ]] && mkdir -m a+w $output_dir + + os_name=\$(sed -nr 's/^ID="?([^"]+)"?/\1/p' /etc/os-release) + echo $os_name > ${output_dir}/${os_name}.txt + + ############# + # Simple VM # + ############# + + - id: workstation-centos + source: modules/compute/vm-instance + use: + - network1 + - lustre + - startup-script + settings: + name_prefix: centos + instance_image: + family: centos-7 + project: centos-cloud + - id: wait-centos + source: community/modules/scripts/wait-for-startup + settings: + instance_name: ((module.workstation-centos.name[0])) + timeout: 7200 + + - id: workstation-rocky + source: modules/compute/vm-instance + use: + - network1 + - lustre + - startup-script + settings: + name_prefix: rocky + instance_image: + family: rocky-linux-8 + project: rocky-linux-cloud + - id: wait-rocky + source: community/modules/scripts/wait-for-startup + settings: + instance_name: ((module.workstation-rocky.name[0])) + timeout: 7200 + + # - id: workstation-ubuntu + # source: modules/compute/vm-instance + # use: + # - network1 + # - lustre + # - startup-script + # settings: + # name_prefix: ubuntu + # instance_image: + # family: ubuntu-2004-lts + # project: ubuntu-os-cloud + # - id: wait-ubuntu + # source: community/modules/scripts/wait-for-startup + # settings: + # instance_name: ((module.workstation-ubuntu.name[0])) + # timeout: 7200 diff --git a/tools/cloud-build/daily-tests/builds/hpc-slurm-chromedesktop.yaml b/tools/cloud-build/daily-tests/builds/hpc-slurm-chromedesktop.yaml new file mode 100644 index 0000000000..e26b4a4d09 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/hpc-slurm-chromedesktop.yaml @@ -0,0 +1,54 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +timeout: 14400s # 4hr +steps: +## Test simple golang build +- id: build_ghpc + waitFor: ["-"] + name: "golang:bullseye" + entrypoint: /bin/bash + args: + - -c + - | + cd /workspace + make +- id: fetch_builder + waitFor: ["-"] + name: >- + us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + args: + - -c + - echo "done fetching builder" +## Test community/examples/hpc-slurm-chromedesktop.yaml +- id: hpc-slurm-chromedesktop + waitFor: ["fetch_builder", "build_ghpc"] + name: >- + us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/hpc-slurm-chromedesktop.yml" diff --git a/tools/cloud-build/daily-tests/builds/lustre-slurm.yaml b/tools/cloud-build/daily-tests/builds/lustre-slurm.yaml new file mode 100644 index 0000000000..2b8f27f5d8 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/lustre-slurm.yaml @@ -0,0 +1,54 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +timeout: 14400s # 4hr +steps: +## Test simple golang build +- id: build_ghpc + waitFor: ["-"] + name: "golang:bullseye" + entrypoint: /bin/bash + args: + - -c + - | + cd /workspace + make +- id: fetch_builder + waitFor: ["-"] + name: >- + us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + args: + - -c + - echo "done fetching builder" +## Test the blueprint +- id: lustre-slurm + waitFor: ["fetch_builder", "build_ghpc"] + name: >- + us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/lustre-slurm.yml" diff --git a/tools/cloud-build/daily-tests/builds/lustre-vm.yaml b/tools/cloud-build/daily-tests/builds/lustre-vm.yaml new file mode 100644 index 0000000000..f6d4910c71 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/lustre-vm.yaml @@ -0,0 +1,54 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +timeout: 14400s # 4hr +steps: +## Test simple golang build +- id: build_ghpc + waitFor: ["-"] + name: "golang:bullseye" + entrypoint: /bin/bash + args: + - -c + - | + cd /workspace + make +- id: fetch_builder + waitFor: ["-"] + name: >- + us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + args: + - -c + - echo "done fetching builder" + +## Test lustre-vm +- id: lustre-vm + waitFor: ["fetch_builder", "build_ghpc"] + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/lustre-vm.yml" diff --git a/tools/cloud-build/daily-tests/tests/hpc-slurm-chromedesktop.yml b/tools/cloud-build/daily-tests/tests/hpc-slurm-chromedesktop.yml new file mode 100644 index 0000000000..ccb272da45 --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/hpc-slurm-chromedesktop.yml @@ -0,0 +1,40 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+test_name: hpc-slurm-chromedesktop
+deployment_name: "enter-{{ build }}"
+# Manually adding the slurm_cluster_name for use in node names, which filters
+# non-alphanumeric chars and is capped at 10 chars.
+slurm_cluster_name: "enter{{ build[0:5] }}"
+zone: europe-west1-d
+cli_deployment_vars:
+  region: europe-west1
+  zone: "{{ zone }}"
+workspace: /workspace
+blueprint_yaml: "{{ workspace }}/community/examples/hpc-slurm-chromedesktop.yaml"
+network: "{{ deployment_name }}-net"
+max_nodes: 5
+# Note: Pattern matching in gcloud only supports 1 wildcard.
+login_node: "{{ slurm_cluster_name }}-login-*"
+controller_node: "{{ slurm_cluster_name }}-controller"
+post_deploy_tests:
+- test-mounts.yml
+- test-crd.yml
+custom_vars:
+  mounts:
+  - /home
+  partitions:
+  - desktop
+  - compute
diff --git a/tools/cloud-build/daily-tests/tests/lustre-slurm.yml b/tools/cloud-build/daily-tests/tests/lustre-slurm.yml
new file mode 100644
index 0000000000..8237cc8a02
--- /dev/null
+++ b/tools/cloud-build/daily-tests/tests/lustre-slurm.yml
@@ -0,0 +1,42 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+test_name: lustre-slurm
+deployment_name: "lustr-{{ build }}"
+region: us-central1
+zone: us-central1-c
+workspace: /workspace
+blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml"
+network: "default"
+slurm_cluster_name: "lustr{{ build[0:5] }}"
+cli_deployment_vars:
+  region: "{{ region }}"
+  zone: "{{ zone }}"
+# Note: Pattern matching in gcloud only supports 1 wildcard.
+login_node: "{{ slurm_cluster_name }}-login-*"
+controller_node: "{{ slurm_cluster_name }}-controller"
+post_deploy_tests:
+- test-mounts.yml
+- test-partitions.yml
+- test-lustre-slurm.yml
+custom_vars:
+  output_dir: /lustre/test
+  num_slurm_nodes: 1
+  mounts:
+  - /lustre
+  partitions:
+  - centos
+  - rocky
+  # - ubuntu
diff --git a/tools/cloud-build/daily-tests/tests/lustre-vm.yml b/tools/cloud-build/daily-tests/tests/lustre-vm.yml
new file mode 100644
index 0000000000..bcdc857a0d
--- /dev/null
+++ b/tools/cloud-build/daily-tests/tests/lustre-vm.yml
@@ -0,0 +1,35 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+
+test_name: lustre-vm
+deployment_name: "lustre-vm-{{ build }}"
+zone: us-central1-a
+workspace: /workspace
+blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/lustre-vm.yaml"
+network: "default"
+remote_node: "centos-0"
+post_deploy_tests:
+- test-mounts.yml
+- test-lustre-vm.yml
+custom_vars:
+  output_dir: /lustre/test
+  num_slurm_nodes: 1
+  mounts:
+  - /lustre
+  vm_os_types:
+  - centos
+  - rocky
+  # - ubuntu
diff --git a/tools/cloud-workstations/030_configure-hpc-toolkit.sh b/tools/cloud-workstations/030_configure-hpc-toolkit.sh
index 18999c0c2a..46dab9de2f 100755
--- a/tools/cloud-workstations/030_configure-hpc-toolkit.sh
+++ b/tools/cloud-workstations/030_configure-hpc-toolkit.sh
@@ -13,4 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-sudo -i -u user bash /bin/configure-hpc-toolkit.sh
+# The Docker build copies this script into /etc/workstation-startup.d so that
+# the HPC Toolkit configuration runs when the workstation is brought up for
+# the first time.
+sudo -i -b -u user bash /bin/configure-hpc-toolkit.sh
diff --git a/tools/cloud-workstations/Dockerfile b/tools/cloud-workstations/Dockerfile
index 144929223d..01c6f8b529 100644
--- a/tools/cloud-workstations/Dockerfile
+++ b/tools/cloud-workstations/Dockerfile
@@ -12,19 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 # Getting Terraform and Packer
 FROM us-central1-docker.pkg.dev/cloud-workstations-images/predefined/code-oss:latest
 
 ARG TFLINT_VERSION
+# Need to get shellcheck directly as the one in the repo is outdated and causes failures
+ARG SHELLCHECK_VER=v0.9.0
 
 RUN curl -fsSL https://apt.releases.hashicorp.com/gpg | apt-key add - && \
     apt-get -y update && apt-get -y install \
     software-properties-common \
     keychain \
-    dnsutils \
-    shellcheck && \
+    dnsutils && \
     apt-add-repository "deb [arch=$(dpkg --print-architecture)] https://apt.releases.hashicorp.com bullseye main" && \
     apt-get -y update && apt-get install -y unzip python3-pip python3-venv terraform packer jq && \
     echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
@@ -34,9 +33,31 @@ RUN curl -fsSL https://apt.releases.hashicorp.com/gpg | apt-key add - && \
     apt-get -y update && apt-get -y install google-cloud-sdk && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 
+RUN wget https://github.com/koalaman/shellcheck/releases/download/$SHELLCHECK_VER/shellcheck-$SHELLCHECK_VER.linux.x86_64.tar.xz && \
+    tar xvf shellcheck-$SHELLCHECK_VER.linux.x86_64.tar.xz && \
+    mv shellcheck-$SHELLCHECK_VER/shellcheck /bin/shellcheck
+
 RUN curl -s https://raw.githubusercontent.com/terraform-linters/tflint/master/install_linux.sh | bash
 
+COPY tools/cloud-build/requirements.txt cloud_build_requirements.txt
+COPY docs/hybrid-slurm-cluster/requirements.txt slurm_requirements.txt
+
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/master/scripts/requirements.txt && \
-    pip install --no-cache-dir -r https://raw.githubusercontent.com/GoogleCloudPlatform/hpc-toolkit/main/tools/cloud-build/requirements.txt && \
+    pip install --no-cache-dir -r cloud_build_requirements.txt && \
+    pip install --no-cache-dir -r slurm_requirements.txt && \
+    rm -rf ~/.cache/pip/*
+
+# Get the HPC config files and store them in the correct
locations for startup
+ARG CW_DIR=tools/cloud-workstations
+ARG HPC_FILE=configure-hpc-toolkit.sh
+ARG HPC_CONF_FILE=030_configure-hpc-toolkit.sh
+
+COPY $CW_DIR/$HPC_FILE /bin/
+COPY $CW_DIR/$HPC_CONF_FILE /etc/workstation-startup.d/
+
+RUN chmod a+x /bin/$HPC_FILE && \
+    chmod a+x /etc/workstation-startup.d/$HPC_CONF_FILE
+
+RUN mkdir /etc/hpc-toolkit-config
+COPY $CW_DIR/code_oss_requirements.txt /etc/hpc-toolkit-config
diff --git a/tools/cloud-workstations/README.md b/tools/cloud-workstations/README.md
index 151a6075f1..80a4f54940 100644
--- a/tools/cloud-workstations/README.md
+++ b/tools/cloud-workstations/README.md
@@ -5,14 +5,26 @@
 
 ## Create an artifact registry repository
 
-The following will create a repository called `hpc-toolkit-workstation-image` in gcloud's default cloud project.
+Set the variables to be used in the commands below.
+
+> **_NOTE:_** Replace the REGION value with the region where you want to host your workstations. The CUSTOM_IMAGE location won't exist until the image is created, but it can still be set in advance.
 
 ```sh
 PROJECT_ID=$(gcloud config get project)
 LOCATION=us
-REPO=hpc-toolkit-workstation-image
+REGION=
+WORKSTATION_NAME=hpc-toolkit-workstation
+CLUSTER_NAME=$WORKSTATION_NAME-cluster
+CONFIG_NAME=$WORKSTATION_NAME-config
+CUSTOM_IMAGE=us-docker.pkg.dev/${PROJECT_ID}/${WORKSTATION_NAME}/hpc-toolkit-workstation:latest
+MACHINE_TYPE=e2-standard-8
+SERVICE_ACCOUNT=
+```
+
+The following will create a repository called `hpc-toolkit-workstation` in gcloud's default cloud project.
 
-gcloud artifacts repositories create ${REPO} --repository-format=docker --location=${LOCATION} --project=${PROJECT_ID}
+```sh
+gcloud artifacts repositories create ${WORKSTATION_NAME} --repository-format=docker --location=${LOCATION} --project=${PROJECT_ID}
 ```
 
 ## Build a Cloud Workstation container with all developer dependencies for the HPC Toolkit
@@ -20,10 +32,54 @@ gcloud artifacts repositories create ${REPO} --repository-format=docker --locati
 To build the Cloud Workstation container as defined in the [Dockerfile](./Dockerfile), run the following command from the root of the HPC-Toolkit repo:
 
 ```sh
-gcloud builds submit --config=tools/cloud-workstations/workstation-image.yaml --substitutions _LOCATION=${LOCATION},_REPO=${REPO} --project ${PROJECT_ID}
+gcloud builds submit --config=tools/cloud-workstations/workstation-image.yaml --substitutions _LOCATION=${LOCATION},_REPO=${WORKSTATION_NAME} --project ${PROJECT_ID}
 ```
 
-## Create the Cloud Workstations Cluster and configuration
+## Create the Cloud Workstations cluster and configuration
 
 Create a Google Cloud Workstation by following the instructions in https://cloud.google.com/workstations/docs/create-workstation. Make sure that during the "Create Configuration" phase, you click on `Environment Configuration` and choose the custom container image we built above by clicking the `SELECT` button.
+
+The examples below are Cloud Shell (`gcloud`) commands that utilize the environment variables set at the top of this document.
+
+### Example creation of a Cloud Workstation cluster
+
+```sh
+gcloud workstations clusters create ${CLUSTER_NAME} --region=${REGION} --project=${PROJECT_ID}
+```
+
+> **_NOTE:_** If the workstation won't start and gives an error about the cluster being deprecated, you may need to open the Cloud Console and update the service account to the default.
+
+### Example creation of a Cloud Workstation configuration
+
+This uses the latest Docker image from the instructions above.
If a different image is required, please replace the `--container-custom-image` value with the correct image and hash/tag.
+
+> **_NOTE:_** Users should determine the service account to use with the command `gcloud iam service-accounts list`.
+
+```sh
+gcloud workstations configs create ${CONFIG_NAME} --cluster=${CLUSTER_NAME} --region=${REGION} --project=${PROJECT_ID} --machine-type=${MACHINE_TYPE} --container-custom-image=${CUSTOM_IMAGE} --service-account=${SERVICE_ACCOUNT}
+```
+
+## Create the Cloud Workstation
+
+Once the Cloud Workstations cluster and configuration are built, workstations can be created.
+
+### Example creation of a Cloud Workstation
+
+```sh
+gcloud workstations create ${WORKSTATION_NAME} --cluster=${CLUSTER_NAME} --config=${CONFIG_NAME} --region=${REGION}
+```
+
+Once this is complete, the Cloud Console can be used to start and launch the workstation.
+
+## Using the Cloud Workstation
+
+Upon initial launch (assuming no changes were made to the files used to build the workstation image), the workstation should have a clean checkout of the HPC Toolkit main branch in the user's home directory, as well as all of the prerequisites required to build the Toolkit and run pre-commit (see [Development](../../README.md#development)).
+
+The final setup steps are:
+
+* Updating your git settings
+  * User name and email
+  * SSH keys for GitHub
+* Cloning a forked repo
+  * Run `pre-commit install` in each newly cloned repository to make sure that pre-commit runs during each commit
diff --git a/tools/cloud-workstations/code_oss_requirements.txt b/tools/cloud-workstations/code_oss_requirements.txt
new file mode 100644
index 0000000000..b5dc613d02
--- /dev/null
+++ b/tools/cloud-workstations/code_oss_requirements.txt
@@ -0,0 +1,10 @@
+# Relevant extension list for HPC Toolkit development in code-oss on Cloud Workstations
+
+golang.go
+hashicorp.terraform
+ms-python.python
+ms-toolsai.jupyter
+ms-toolsai.jupyter-keymap
+ms-toolsai.jupyter-renderers
+ms-toolsai.vscode-jupyter-cell-tags
+ms-toolsai.vscode-jupyter-slideshow
diff --git a/tools/cloud-workstations/configure-hpc-toolkit.sh b/tools/cloud-workstations/configure-hpc-toolkit.sh
index e3f71b0b90..dec8d3cc6e 100755
--- a/tools/cloud-workstations/configure-hpc-toolkit.sh
+++ b/tools/cloud-workstations/configure-hpc-toolkit.sh
@@ -23,3 +23,18 @@ if [ ! -d "$HOME/hpc-toolkit" ]; then
   make install-dev-deps
   pre-commit install
 fi
+
+# Run only on initial bootup
+FLAG="$HOME/.firstboot"
+if [[ ! -f $FLAG ]]; then
+  # Set path for go binaries
+  echo "export PATH=$PATH:$HOME/go/bin" >>"$HOME"/.bashrc
+
+  # Set up Code OSS for golang
+  grep -v '^#'