From ac3f769a508638a6f5774a28909e6204bb919010 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Fri, 31 May 2024 13:26:10 +0000 Subject: [PATCH 001/118] Update slurm-gcp module to use custom endpoints. Dependent on change to slurm-gcp PR#146 --- .../schedmd-slurm-gcp-v6-controller/README.md | 18 ++++++++++-------- .../slurm_files.tf | 6 +++++- .../variables.tf | 12 ++++++++++++ 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 622eaf5835..bef0372b4a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -195,14 +195,14 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.9 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.9 | -| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.5.9 | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.9 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.9 | -| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.5.9 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.9 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.5.9 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.6 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.6 | +| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 208c1ad | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.6 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.6 | +| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.5.6 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.6 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | 
github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.5.6 | ## Resources @@ -235,6 +235,7 @@ limitations under the License. | [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `"# no-op"` | no | | [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.
NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [create\_bucket](#input\_create\_bucket) | Create GCS bucket instead of using an existing one. | `bool` | `true` | no | +| [custom\_endpoints](#input\_custom\_endpoints) | Alternate set of API endpoints | `map(string)` | `null` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | | [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | DEPRECATED: Use `enable_controller_public_ips` instead. | `bool` | `null` | no | | [disable\_default\_mounts](#input\_disable\_default\_mounts) | DEPRECATED: Use `enable_default_mounts` instead. | `bool` | `null` | no | @@ -289,6 +290,7 @@ limitations under the License. | [static\_ips](#input\_static\_ips) | List of static IPs for VM instances. | `list(string)` | `[]` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | n/a | yes | | [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | +| [universe\_domain](#input\_universe\_domain) | Domain address for alternate API universe | `string` | `null` | no | | [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no | ## Outputs diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index cd0fc9a049..5bec37b990 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -87,7 +87,7 @@ locals { } module "slurm_files" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.5.9" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=208c1ad" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name @@ -130,4 +130,8 @@ module "slurm_files" { nodeset_dyn = [for ns in values(local.nodeset_dyn_map) : { nodeset : ns }] depends_on = [module.bucket] + + # Providers + universe_domain = var.universe_domain + custom_endpoints = var.custom_endpoints } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 1013ec1178..608616a8dc 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -612,3 +612,15 @@ EOD type = any default = false } + +variable "universe_domain" { + description = "Domain address for alternate API universe" + type = string + default = null +} + +variable "custom_endpoints" { + description = "Alternate set of API endpoints" + type = map(string) + default = null +} From 306049707c31b2d952388f0b01e08ef2360a1695 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Thu, 13 Jun 2024 14:25:18 +0000 Subject: [PATCH 002/118] small update to change universe-endpoints --- .../schedmd-slurm-gcp-v6-controller/README.md | 18 +++++++++--------- .../controller.tf | 5 +++-- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../partition.tf | 6 +++--- .../slurm_files.tf | 6 +++--- .../variables.tf | 14 ++++++++++---- 6 files changed, 30 insertions(+), 23 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index bef0372b4a..226fb396b5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -195,14 +195,14 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.6 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.6 | -| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 208c1ad | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.6 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.6 | -| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.5.6 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.6 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.5.6 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | fe3cc39 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | fe3cc39 | +| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | fe3cc39 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | fe3cc39 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | fe3cc39 | +| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | fe3cc39 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | fe3cc39 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | fe3cc39 | ## Resources @@ -290,7 +290,7 @@ limitations under the License. | [static\_ips](#input\_static\_ips) | List of static IPs for VM instances. | `list(string)` | `[]` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | n/a | yes | | [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | -| [universe\_domain](#input\_universe\_domain) | Domain address for alternate API universe | `string` | `null` | no | +| [universe\_information](#input\_universe\_information) | Domain address and credentials for alternate API universe |
object({
domain = string
credentials = string
})
|
{
"credentials": null,
"domain": null
}
| no | | [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no | ## Outputs diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 9a78d02a16..e8847f4751 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -36,7 +36,8 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.9" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=fe3cc39" + count = local.have_template ? 0 : 1 project_id = var.project_id region = var.region @@ -92,7 +93,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.5.9" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=fe3cc39" access_config = var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 89303f6212..90b30ae0ea 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.9" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=fe3cc39" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -57,7 +57,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.5.9" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=fe3cc39" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 32e6f3fa84..ae56b59b8a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local nodeset module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.9" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=fe3cc39" for_each = local.nodeset_map project_id = var.project_id @@ -65,7 +65,7 @@ module "slurm_nodeset_template" { } module "slurm_nodeset" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.5.9" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=fe3cc39" for_each = 
local.nodeset_map instance_template_self_link = module.slurm_nodeset_template[each.key].self_link @@ -85,7 +85,7 @@ module "slurm_nodeset" { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.5.9" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=fe3cc39" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index 5bec37b990..587fdaa303 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -87,7 +87,7 @@ locals { } module "slurm_files" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=208c1ad" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=fe3cc39" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name @@ -132,6 +132,6 @@ module "slurm_files" { depends_on = [module.bucket] # Providers - universe_domain = var.universe_domain - custom_endpoints = var.custom_endpoints + universe_information = var.universe_information + custom_endpoints = var.custom_endpoints } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 608616a8dc..fdfbd0af42 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -613,10 +613,16 @@ EOD default = false } -variable "universe_domain" { - description = "Domain address for alternate API universe" - type = string - default = null +variable "universe_information" { + description = "Domain address and credentials for alternate API universe" + type = object({ + domain = string + credentials = string + }) + default = { + domain = null + credentials = null + } } variable "custom_endpoints" { From 2d2458e91b3a5c254bc15c3ef206e34164ad9823 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 5 Jul 2024 23:24:57 +0000 Subject: [PATCH 003/118] Bump certifi from 2023.07.22 to 2024.7.4 in /community/front-end/ofe Bumps [certifi](https://github.com/certifi/python-certifi) from 2023.07.22 to 2024.7.4. - [Commits](https://github.com/certifi/python-certifi/compare/2023.07.22...2024.07.04) --- updated-dependencies: - dependency-name: certifi dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 45059632b8..6276400c28 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -7,7 +7,7 @@ attrs==23.1.0 # This should be supported by zoneinfo in Python 3.9+ backports.zoneinfo==0.2.1;python_version<"3.9" cachetools==5.3.1 -certifi==2023.07.22 +certifi==2024.7.4 cffi==1.15.1 cfgv==3.3.1 charset-normalizer==3.1.0 From 4ddba66069c8b1c9b722b49205cccf934c80e269 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jul 2024 10:17:22 +0000 Subject: [PATCH 004/118] Bump golang.org/x/sys from 0.21.0 to 0.22.0 Bumps [golang.org/x/sys](https://github.com/golang/sys) from 0.21.0 to 0.22.0. - [Commits](https://github.com/golang/sys/compare/v0.21.0...v0.22.0) --- updated-dependencies: - dependency-name: golang.org/x/sys dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 7a7435fb98..f51ae6669a 100644 --- a/go.mod +++ b/go.mod @@ -98,7 +98,7 @@ require ( golang.org/x/crypto v0.24.0 // indirect golang.org/x/net v0.26.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect - golang.org/x/sys v0.21.0 + golang.org/x/sys v0.22.0 golang.org/x/text v0.16.0 // indirect google.golang.org/grpc v1.64.0 // indirect google.golang.org/protobuf v1.34.2 // indirect diff --git a/go.sum b/go.sum index 0c9ffbf1dc..432cefb4fe 100644 --- a/go.sum +++ b/go.sum @@ -732,8 +732,8 @@ golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= -golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= +golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= From 26f10ad078beb7955b916f0ccf6a370b9b3be85c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jul 2024 10:17:35 +0000 Subject: [PATCH 005/118] Bump google.golang.org/api from 0.186.0 to 0.187.0 Bumps [google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.186.0 to 0.187.0. - [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.186.0...v0.187.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 10 +++++----- go.sum | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/go.mod b/go.mod index 7a7435fb98..8b99f8e518 100644 --- a/go.mod +++ b/go.mod @@ -15,7 +15,7 @@ require ( github.com/spf13/cobra v1.8.1 github.com/zclconf/go-cty v1.14.4 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa - google.golang.org/genproto v0.0.0-20240617180043-68d350f18fd4 // indirect + google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) @@ -27,11 +27,11 @@ require ( github.com/hashicorp/terraform-exec v0.21.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940 - google.golang.org/api v0.186.0 + google.golang.org/api v0.187.0 ) require ( - cloud.google.com/go/auth v0.6.0 // indirect + cloud.google.com/go/auth v0.6.1 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect dario.cat/mergo v1.0.0 // indirect github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect @@ -54,8 +54,8 @@ require ( golang.org/x/sync v0.7.0 // indirect golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20240610135401-a8a62080eff3 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240617180043-68d350f18fd4 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d // indirect ) require ( diff --git a/go.sum b/go.sum index 0c9ffbf1dc..e4707a8db7 100644 --- a/go.sum +++ b/go.sum @@ -46,8 +46,8 @@ cloud.google.com/go/asset v1.8.0/go.mod h1:mUNGKhiqIdbr8X7KNayoYvyc4HbbFO9URsjby cloud.google.com/go/assuredworkloads v1.5.0/go.mod h1:n8HOZ6pff6re5KYfBXcFvSViQjDwxFkAkmUFffJRbbY= cloud.google.com/go/assuredworkloads v1.6.0/go.mod h1:yo2YOk37Yc89Rsd5QMVECvjaMKymF9OP+QXWlKXUkXw= cloud.google.com/go/assuredworkloads v1.7.0/go.mod h1:z/736/oNmtGAyU47reJgGN+KVoYoxeLBoj4XkKYscNI= -cloud.google.com/go/auth v0.6.0 h1:5x+d6b5zdezZ7gmLWD1m/xNjnaQ2YDhmIz/HH3doy1g= -cloud.google.com/go/auth v0.6.0/go.mod h1:b4acV+jLQDyjwm4OXHYjNvRi4jvGBzHWJRtJcy+2P4g= +cloud.google.com/go/auth v0.6.1 h1:T0Zw1XM5c1GlpN2HYr2s+m3vr1p2wy+8VN+Z1FKxW38= +cloud.google.com/go/auth v0.6.1/go.mod h1:eFHG7zDzbXHKmjJddFG/rBlcGp6t25SwRUiEQSlO4x4= cloud.google.com/go/auth/oauth2adapt v0.2.2 h1:+TTV8aXpjeChS9M+aTtN/TjdQnzJvmzKFt//oWu7HX4= cloud.google.com/go/auth/oauth2adapt v0.2.2/go.mod h1:wcYjgpZI9+Yu7LyYBg4pqSiaRkfEK3GQcpb7C/uyF1Q= cloud.google.com/go/automl v1.5.0/go.mod h1:34EjfoFGMZ5sgJ9EoLsRtdPSNZLcfflJR39VbVNS2M0= @@ -868,8 +868,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.186.0 h1:n2OPp+PPXX0Axh4GuSsL5QL8xQCTb2oDwyzPnQvqUug= -google.golang.org/api v0.186.0/go.mod h1:hvRbBmgoje49RV3xqVXrmP6w93n6ehGgIVPYrGtBFFc= +google.golang.org/api v0.187.0 h1:Mxs7VATVC2v7CY+7Xwm4ndkX71hpElcvx0D1Ji/p1eo= +google.golang.org/api v0.187.0/go.mod h1:KIHlTc4x7N7gKKuVsdmfBXN13yEEWXWFURWY6SBp2gk= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= 
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -978,12 +978,12 @@ google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqw google.golang.org/genproto v0.0.0-20221014173430-6e2ab493f96b/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221014213838-99cd37c6964a/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71/go.mod h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s= -google.golang.org/genproto v0.0.0-20240617180043-68d350f18fd4 h1:CUiCqkPw1nNrNQzCCG4WA65m0nAmQiwXHpub3dNyruU= -google.golang.org/genproto v0.0.0-20240617180043-68d350f18fd4/go.mod h1:EvuUDCulqGgV80RvP1BHuom+smhX4qtlhnNatHuroGQ= -google.golang.org/genproto/googleapis/api v0.0.0-20240610135401-a8a62080eff3 h1:QW9+G6Fir4VcRXVH8x3LilNAb6cxBGLa6+GM4hRwexE= -google.golang.org/genproto/googleapis/api v0.0.0-20240610135401-a8a62080eff3/go.mod h1:kdrSS/OiLkPrNUpzD4aHgCq2rVuC/YRxok32HXZ4vRE= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240617180043-68d350f18fd4 h1:Di6ANFilr+S60a4S61ZM00vLdw0IrQOSMS2/6mrnOU0= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240617180043-68d350f18fd4/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= +google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d h1:PksQg4dV6Sem3/HkBX+Ltq8T0ke0PKIRBNBatoDTVls= +google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:s7iA721uChleev562UJO2OYB0PPT9CMFjV+Ce7VJH5M= +google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 h1:MuYw1wJzT+ZkybKfaOXKp5hJiZDn2iHaXRw0mRYdHSc= +google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4/go.mod h1:px9SlOOZBg1wM1zdnr8jEL4CNGUBZ+ZKYtNPApNQc4c= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d h1:k3zyW3BYYR30e8v3x0bTDdE9vpYFjZHK+HcyqkrppWk= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= From b84e9030a492d7a26a5e19ad4e380c31e47f76f4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 9 Jul 2024 18:51:17 +0000 Subject: [PATCH 006/118] Bump zipp from 3.17.0 to 3.19.1 in /community/front-end/ofe Bumps [zipp](https://github.com/jaraco/zipp) from 3.17.0 to 3.19.1. - [Release notes](https://github.com/jaraco/zipp/releases) - [Changelog](https://github.com/jaraco/zipp/blob/main/NEWS.rst) - [Commits](https://github.com/jaraco/zipp/compare/v3.17.0...v3.19.1) --- updated-dependencies: - dependency-name: zipp dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 45059632b8..b5c971a3c6 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -98,4 +98,4 @@ virtualenv==20.23.1 wrapt==1.15.0 xmltodict==0.13.0 yq==3.2.2 -zipp==3.17.0 +zipp==3.19.1 From 2f874e69944364ead0990ee86355d9de347d74ff Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Tue, 25 Jun 2024 19:12:42 +0000 Subject: [PATCH 007/118] Update for testing out changes in slurm-gcp and updates to account for changes in slurm-gcp Also updated cleanup_compute to be compatible with different versions of gcloud Update to use slurm-gcp 6.5.10 --- .../schedmd-slurm-gcp-v6-controller/README.md | 20 ++++++++-------- .../controller.tf | 7 +++--- .../schedmd-slurm-gcp-v6-controller/login.tf | 6 ++--- .../schedmd-slurm-gcp-v6-controller/main.tf | 1 + .../partition.tf | 8 +++---- .../slurm_files.tf | 5 ++-- .../variables.tf | 24 +++++++++---------- 7 files changed, 35 insertions(+), 36 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 226fb396b5..d521253118 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -195,14 +195,14 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | fe3cc39 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | fe3cc39 | -| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | fe3cc39 | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | fe3cc39 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | fe3cc39 | -| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | fe3cc39 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | fe3cc39 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | fe3cc39 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.10 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.10 | +| [slurm\_files](#module\_slurm\_files) | 
github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.5.10 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.10 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.10 | +| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.5.10 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.10 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.5.10 | ## Resources @@ -235,7 +235,6 @@ limitations under the License. | [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `"# no-op"` | no | | [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.
NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [create\_bucket](#input\_create\_bucket) | Create GCS bucket instead of using an existing one. | `bool` | `true` | no | -| [custom\_endpoints](#input\_custom\_endpoints) | Alternate set of API endpoints | `map(string)` | `null` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | | [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | DEPRECATED: Use `enable_controller_public_ips` instead. | `bool` | `null` | no | | [disable\_default\_mounts](#input\_disable\_default\_mounts) | DEPRECATED: Use `enable_default_mounts` instead. | `bool` | `null` | no | @@ -256,6 +255,7 @@ limitations under the License. | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_slurm\_gcp\_plugins](#input\_enable\_slurm\_gcp\_plugins) | Enables calling hooks in scripts/slurm\_gcp\_plugins during cluster resume and suspend. | `any` | `false` | no | | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | +| [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
|
{
"compute": "beta"
}
| no | | [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | | [extra\_logging\_flags](#input\_extra\_logging\_flags) | The list of extra flags for the logging system to use. See the logging\_flags variable in scripts/util.py to get the list of supported log flags. | `map(bool)` | `{}` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | @@ -290,7 +290,7 @@ limitations under the License. | [static\_ips](#input\_static\_ips) | List of static IPs for VM instances. | `list(string)` | `[]` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | n/a | yes | | [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | -| [universe\_information](#input\_universe\_information) | Domain address and credentials for alternate API universe |
object({
domain = string
credentials = string
})
|
{
"credentials": null,
"domain": null
}
| no | +| [universe\_domain](#input\_universe\_domain) | Domain address for alternate API universe | `string` | `"googleapis.com"` | no | | [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no | ## Outputs diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index e8847f4751..2041824ec7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -36,8 +36,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=fe3cc39" - count = local.have_template ? 0 : 1 + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.10" project_id = var.project_id region = var.region @@ -64,7 +63,7 @@ module "slurm_controller_template" { gpu = one(local.guest_accelerator) machine_type = var.machine_type - metadata = var.metadata + metadata = merge(var.metadata, local.universe_domain) min_cpu_platform = var.min_cpu_platform # network_ip = TODO: add support for network_ip @@ -93,7 +92,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=fe3cc39" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.5.10" access_config = var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 90b30ae0ea..19ce7c8c0e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=fe3cc39" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.10" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -39,7 +39,7 @@ module "slurm_login_template" { gpu = each.value.gpu labels = each.value.labels machine_type = each.value.machine_type - metadata = each.value.metadata + metadata = merge(each.value.metadata, local.universe_domain) min_cpu_platform = each.value.min_cpu_platform on_host_maintenance = each.value.on_host_maintenance preemptible = each.value.preemptible @@ -57,7 +57,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=fe3cc39" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.5.10" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf index 5108c295ee..095d4efdbb 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf @@ -26,6 +26,7 @@ locals { slurm_cluster_name = coalesce(var.slurm_cluster_name, local.tmp_cluster_name) files_cs_labels = { 
slurm_files_checksum = module.slurm_files.checksum } + universe_domain = { "universe_domain" = var.universe_domain } } data "google_compute_default_service_account" "default" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index ae56b59b8a..33ffad56d9 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local nodeset module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=fe3cc39" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.10" for_each = local.nodeset_map project_id = var.project_id @@ -48,7 +48,7 @@ module "slurm_nodeset_template" { gpu = each.value.gpu labels = each.value.labels machine_type = each.value.machine_type - metadata = each.value.metadata + metadata = merge(each.value.metadata, local.universe_domain) min_cpu_platform = each.value.min_cpu_platform name_prefix = each.value.nodeset_name on_host_maintenance = each.value.on_host_maintenance @@ -65,7 +65,7 @@ module "slurm_nodeset_template" { } module "slurm_nodeset" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=fe3cc39" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.5.10" for_each = local.nodeset_map instance_template_self_link = module.slurm_nodeset_template[each.key].self_link @@ -85,7 +85,7 @@ module "slurm_nodeset" { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=fe3cc39" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.5.10" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index 587fdaa303..17c30fe727 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -87,7 +87,7 @@ locals { } module "slurm_files" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=fe3cc39" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.5.10" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name @@ -132,6 +132,5 @@ module "slurm_files" { depends_on = [module.bucket] # Providers - universe_information = var.universe_information - custom_endpoints = var.custom_endpoints + endpoint_versions = var.endpoint_versions } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index fdfbd0af42..7d4342dc7e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -613,20 +613,20 @@ EOD default = false } -variable "universe_information" { - 
description = "Domain address and credentials for alternate API universe" +variable "universe_domain" { + description = "Domain address for alternate API universe" + type = string + default = "googleapis.com" + nullable = false +} + +variable "endpoint_versions" { + description = "Version of the API to use (The compute service is the only API currently supported)" type = object({ - domain = string - credentials = string + compute = string }) default = { - domain = null - credentials = null + compute = "beta" } -} - -variable "custom_endpoints" { - description = "Alternate set of API endpoints" - type = map(string) - default = null + nullable = false } From 445ca5cf0572417e216181459b065fb47c770130 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Mon, 8 Jul 2024 19:14:58 +0000 Subject: [PATCH 008/118] Update slurm-gcp modules to 6.5.13 to take advantage of TPU and universe domain fix --- .../schedmd-slurm-gcp-v6-controller/README.md | 16 ++++++++-------- .../controller.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/partition.tf | 6 +++--- .../slurm_files.tf | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index d521253118..98942f9163 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -195,14 +195,14 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.10 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.10 | -| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.5.10 | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.10 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.10 | -| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.5.10 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.10 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.5.10 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.13 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.13 | +| [slurm\_files](#module\_slurm\_files) | 
github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.5.13 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.13 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.13 | +| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.5.13 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.13 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.5.13 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 2041824ec7..e0dae4e02d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -36,7 +36,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.10" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.13" project_id = var.project_id region = var.region @@ -92,7 +92,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.5.10" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.5.13" access_config = var.enable_controller_public_ips ? 
[local.access_config] : [] add_hostname_suffix = false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 19ce7c8c0e..3e1d980c70 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.10" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.13" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -57,7 +57,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.5.10" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.5.13" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 33ffad56d9..11de8a21cd 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local nodeset module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.10" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.13" for_each = local.nodeset_map project_id = var.project_id @@ -65,7 +65,7 @@ module "slurm_nodeset_template" { } module "slurm_nodeset" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.5.10" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.5.13" for_each = local.nodeset_map instance_template_self_link = module.slurm_nodeset_template[each.key].self_link @@ -85,7 +85,7 @@ module "slurm_nodeset" { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.5.10" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.5.13" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index 17c30fe727..c8e7bd0213 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -87,7 +87,7 @@ locals { } module "slurm_files" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.5.10" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.5.13" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name From 
4bc9ee9833c2923a59a635c3eb8abf5b6bdaedb6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Jul 2024 22:10:55 +0000 Subject: [PATCH 009/118] Bump django from 4.2.11 to 4.2.14 in /community/front-end/ofe Bumps [django](https://github.com/django/django) from 4.2.11 to 4.2.14. - [Commits](https://github.com/django/django/compare/4.2.11...4.2.14) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 45059632b8..488f21d9a5 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -19,7 +19,7 @@ dill==0.3.6 distlib==0.3.6 # django-revproxy==0.11.0 released but not yet in pypi git+https://github.com/jazzband/django-revproxy.git@d2234005135dc0771b7c4e0bb0465664ccfa5787 -Django==4.2.11 +Django==4.2.14 django-allauth==0.54.0 django-extensions==3.2.3 djangorestframework==3.14.0 From 710f0b3753d8cab6c89714715b9ebc8bf57ddbf8 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Fri, 12 Jul 2024 19:15:23 +0000 Subject: [PATCH 010/118] move gke modules from community to core --- {community/modules => modules}/compute/gke-job-template/README.md | 0 {community/modules => modules}/compute/gke-job-template/main.tf | 0 .../modules => modules}/compute/gke-job-template/metadata.yaml | 0 .../modules => modules}/compute/gke-job-template/outputs.tf | 0 .../compute/gke-job-template/templates/gke-job-base.yaml.tftpl | 0 .../modules => modules}/compute/gke-job-template/variables.tf | 0 .../modules => modules}/compute/gke-job-template/versions.tf | 0 {community/modules => modules}/compute/gke-node-pool/README.md | 0 {community/modules => modules}/compute/gke-node-pool/main.tf | 0 .../modules => modules}/compute/gke-node-pool/metadata.yaml | 0 {community/modules => modules}/compute/gke-node-pool/outputs.tf | 0 .../compute/gke-node-pool/threads_per_core_calc.tf | 0 {community/modules => modules}/compute/gke-node-pool/variables.tf | 0 {community/modules => modules}/compute/gke-node-pool/versions.tf | 0 {community/modules => modules}/scheduler/gke-cluster/README.md | 0 {community/modules => modules}/scheduler/gke-cluster/main.tf | 0 .../modules => modules}/scheduler/gke-cluster/metadata.yaml | 0 {community/modules => modules}/scheduler/gke-cluster/outputs.tf | 0 {community/modules => modules}/scheduler/gke-cluster/variables.tf | 0 {community/modules => modules}/scheduler/gke-cluster/versions.tf | 0 20 files changed, 0 insertions(+), 0 deletions(-) rename {community/modules => modules}/compute/gke-job-template/README.md (100%) rename {community/modules => modules}/compute/gke-job-template/main.tf (100%) rename {community/modules => modules}/compute/gke-job-template/metadata.yaml (100%) rename {community/modules => modules}/compute/gke-job-template/outputs.tf (100%) rename {community/modules => modules}/compute/gke-job-template/templates/gke-job-base.yaml.tftpl (100%) rename {community/modules => modules}/compute/gke-job-template/variables.tf (100%) rename {community/modules => modules}/compute/gke-job-template/versions.tf (100%) rename {community/modules => modules}/compute/gke-node-pool/README.md (100%) rename {community/modules => modules}/compute/gke-node-pool/main.tf (100%) rename {community/modules => modules}/compute/gke-node-pool/metadata.yaml 
(100%) rename {community/modules => modules}/compute/gke-node-pool/outputs.tf (100%) rename {community/modules => modules}/compute/gke-node-pool/threads_per_core_calc.tf (100%) rename {community/modules => modules}/compute/gke-node-pool/variables.tf (100%) rename {community/modules => modules}/compute/gke-node-pool/versions.tf (100%) rename {community/modules => modules}/scheduler/gke-cluster/README.md (100%) rename {community/modules => modules}/scheduler/gke-cluster/main.tf (100%) rename {community/modules => modules}/scheduler/gke-cluster/metadata.yaml (100%) rename {community/modules => modules}/scheduler/gke-cluster/outputs.tf (100%) rename {community/modules => modules}/scheduler/gke-cluster/variables.tf (100%) rename {community/modules => modules}/scheduler/gke-cluster/versions.tf (100%) diff --git a/community/modules/compute/gke-job-template/README.md b/modules/compute/gke-job-template/README.md similarity index 100% rename from community/modules/compute/gke-job-template/README.md rename to modules/compute/gke-job-template/README.md diff --git a/community/modules/compute/gke-job-template/main.tf b/modules/compute/gke-job-template/main.tf similarity index 100% rename from community/modules/compute/gke-job-template/main.tf rename to modules/compute/gke-job-template/main.tf diff --git a/community/modules/compute/gke-job-template/metadata.yaml b/modules/compute/gke-job-template/metadata.yaml similarity index 100% rename from community/modules/compute/gke-job-template/metadata.yaml rename to modules/compute/gke-job-template/metadata.yaml diff --git a/community/modules/compute/gke-job-template/outputs.tf b/modules/compute/gke-job-template/outputs.tf similarity index 100% rename from community/modules/compute/gke-job-template/outputs.tf rename to modules/compute/gke-job-template/outputs.tf diff --git a/community/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl b/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl similarity index 100% rename from community/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl rename to modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl diff --git a/community/modules/compute/gke-job-template/variables.tf b/modules/compute/gke-job-template/variables.tf similarity index 100% rename from community/modules/compute/gke-job-template/variables.tf rename to modules/compute/gke-job-template/variables.tf diff --git a/community/modules/compute/gke-job-template/versions.tf b/modules/compute/gke-job-template/versions.tf similarity index 100% rename from community/modules/compute/gke-job-template/versions.tf rename to modules/compute/gke-job-template/versions.tf diff --git a/community/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md similarity index 100% rename from community/modules/compute/gke-node-pool/README.md rename to modules/compute/gke-node-pool/README.md diff --git a/community/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf similarity index 100% rename from community/modules/compute/gke-node-pool/main.tf rename to modules/compute/gke-node-pool/main.tf diff --git a/community/modules/compute/gke-node-pool/metadata.yaml b/modules/compute/gke-node-pool/metadata.yaml similarity index 100% rename from community/modules/compute/gke-node-pool/metadata.yaml rename to modules/compute/gke-node-pool/metadata.yaml diff --git a/community/modules/compute/gke-node-pool/outputs.tf b/modules/compute/gke-node-pool/outputs.tf similarity index 100% rename 
from community/modules/compute/gke-node-pool/outputs.tf rename to modules/compute/gke-node-pool/outputs.tf diff --git a/community/modules/compute/gke-node-pool/threads_per_core_calc.tf b/modules/compute/gke-node-pool/threads_per_core_calc.tf similarity index 100% rename from community/modules/compute/gke-node-pool/threads_per_core_calc.tf rename to modules/compute/gke-node-pool/threads_per_core_calc.tf diff --git a/community/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf similarity index 100% rename from community/modules/compute/gke-node-pool/variables.tf rename to modules/compute/gke-node-pool/variables.tf diff --git a/community/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf similarity index 100% rename from community/modules/compute/gke-node-pool/versions.tf rename to modules/compute/gke-node-pool/versions.tf diff --git a/community/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md similarity index 100% rename from community/modules/scheduler/gke-cluster/README.md rename to modules/scheduler/gke-cluster/README.md diff --git a/community/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf similarity index 100% rename from community/modules/scheduler/gke-cluster/main.tf rename to modules/scheduler/gke-cluster/main.tf diff --git a/community/modules/scheduler/gke-cluster/metadata.yaml b/modules/scheduler/gke-cluster/metadata.yaml similarity index 100% rename from community/modules/scheduler/gke-cluster/metadata.yaml rename to modules/scheduler/gke-cluster/metadata.yaml diff --git a/community/modules/scheduler/gke-cluster/outputs.tf b/modules/scheduler/gke-cluster/outputs.tf similarity index 100% rename from community/modules/scheduler/gke-cluster/outputs.tf rename to modules/scheduler/gke-cluster/outputs.tf diff --git a/community/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf similarity index 100% rename from community/modules/scheduler/gke-cluster/variables.tf rename to modules/scheduler/gke-cluster/variables.tf diff --git a/community/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf similarity index 100% rename from community/modules/scheduler/gke-cluster/versions.tf rename to modules/scheduler/gke-cluster/versions.tf From b74b9a887affd7e855e3b9988e8622c35fbb8406 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Fri, 12 Jul 2024 19:30:59 +0000 Subject: [PATCH 011/118] move gke pv from community to core, update examples and docs to reflect the migration --- examples/README.md | 2 +- {community/examples => examples}/hpc-gke.yaml | 6 +++--- {community/examples => examples}/ml-gke.yaml | 6 +++--- {community/examples => examples}/storage-gke.yaml | 14 +++++++------- modules/README.md | 8 ++++---- modules/compute/gke-job-template/README.md | 2 +- modules/compute/gke-node-pool/README.md | 10 +++++----- .../file-system/gke-persistent-volume/README.md | 12 ++++++------ .../file-system/gke-persistent-volume/main.tf | 0 .../gke-persistent-volume/metadata.yaml | 0 .../file-system/gke-persistent-volume/outputs.tf | 0 .../templates/filestore-pv.yaml.tftpl | 0 .../templates/filestore-pvc.yaml.tftpl | 0 .../templates/gcs-pv.yaml.tftpl | 0 .../templates/gcs-pvc.yaml.tftpl | 0 .../file-system/gke-persistent-volume/variables.tf | 0 .../file-system/gke-persistent-volume/versions.tf | 0 modules/scheduler/gke-cluster/README.md | 2 +- pkg/modulereader/metadata_legacy.go | 4 ++-- 
tools/cloud-build/daily-tests/builds/gke.yaml | 4 ++-- tools/duplicate-diff.py | 2 +- 21 files changed, 36 insertions(+), 36 deletions(-) rename {community/examples => examples}/hpc-gke.yaml (89%) rename {community/examples => examples}/ml-gke.yaml (92%) rename {community/examples => examples}/storage-gke.yaml (91%) rename {community/modules => modules}/file-system/gke-persistent-volume/README.md (95%) rename {community/modules => modules}/file-system/gke-persistent-volume/main.tf (100%) rename {community/modules => modules}/file-system/gke-persistent-volume/metadata.yaml (100%) rename {community/modules => modules}/file-system/gke-persistent-volume/outputs.tf (100%) rename {community/modules => modules}/file-system/gke-persistent-volume/templates/filestore-pv.yaml.tftpl (100%) rename {community/modules => modules}/file-system/gke-persistent-volume/templates/filestore-pvc.yaml.tftpl (100%) rename {community/modules => modules}/file-system/gke-persistent-volume/templates/gcs-pv.yaml.tftpl (100%) rename {community/modules => modules}/file-system/gke-persistent-volume/templates/gcs-pvc.yaml.tftpl (100%) rename {community/modules => modules}/file-system/gke-persistent-volume/variables.tf (100%) rename {community/modules => modules}/file-system/gke-persistent-volume/versions.tf (100%) diff --git a/examples/README.md b/examples/README.md index e1aba7e807..3c3c976636 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1390,7 +1390,7 @@ Toolkit. It includes: Example settings for a2 look like: ```yaml - source: community/modules/compute/gke-node-pool + source: modules/compute/gke-node-pool use: [gke_cluster] settings: disk_type: pd-balanced diff --git a/community/examples/hpc-gke.yaml b/examples/hpc-gke.yaml similarity index 89% rename from community/examples/hpc-gke.yaml rename to examples/hpc-gke.yaml index 6fee0931bc..dccdee033b 100644 --- a/community/examples/hpc-gke.yaml +++ b/examples/hpc-gke.yaml @@ -36,18 +36,18 @@ deployment_groups: ip_cidr_range: 10.0.32.0/20 - id: gke_cluster - source: community/modules/scheduler/gke-cluster + source: modules/scheduler/gke-cluster use: [network1] settings: enable_private_endpoint: false # Allows for access from authorized public IPs outputs: [instructions] - id: compute_pool - source: community/modules/compute/gke-node-pool + source: modules/compute/gke-node-pool use: [gke_cluster] - id: job-template - source: community/modules/compute/gke-job-template + source: modules/compute/gke-job-template use: [compute_pool] settings: image: busybox diff --git a/community/examples/ml-gke.yaml b/examples/ml-gke.yaml similarity index 92% rename from community/examples/ml-gke.yaml rename to examples/ml-gke.yaml index d6ae26b173..ad22f41156 100644 --- a/community/examples/ml-gke.yaml +++ b/examples/ml-gke.yaml @@ -41,7 +41,7 @@ deployment_groups: ip_cidr_range: 10.0.32.0/20 - id: gke_cluster - source: community/modules/scheduler/gke-cluster + source: modules/scheduler/gke-cluster use: [network1] settings: enable_private_endpoint: false # Allows for access from authorized public IPs @@ -51,7 +51,7 @@ deployment_groups: outputs: [instructions] - id: g2-pool - source: community/modules/compute/gke-node-pool + source: modules/compute/gke-node-pool use: [gke_cluster] settings: disk_type: pd-balanced @@ -65,7 +65,7 @@ deployment_groups: - gpu_driver_version: "DEFAULT" - id: job-template - source: community/modules/compute/gke-job-template + source: modules/compute/gke-job-template use: [g2-pool] settings: image: nvidia/cuda:11.0.3-runtime-ubuntu20.04 diff --git 
a/community/examples/storage-gke.yaml b/examples/storage-gke.yaml similarity index 91% rename from community/examples/storage-gke.yaml rename to examples/storage-gke.yaml index ac38af4979..cd46c2d9c3 100644 --- a/community/examples/storage-gke.yaml +++ b/examples/storage-gke.yaml @@ -39,7 +39,7 @@ deployment_groups: ip_cidr_range: 10.0.32.0/20 - id: gke_cluster - source: community/modules/scheduler/gke-cluster + source: modules/scheduler/gke-cluster use: [network1] settings: enable_filestore_csi: true @@ -52,7 +52,7 @@ deployment_groups: outputs: [instructions] - id: debug_pool - source: community/modules/compute/gke-node-pool + source: modules/compute/gke-node-pool use: [gke_cluster] settings: name: debug @@ -69,7 +69,7 @@ deployment_groups: force_destroy: true - id: data-bucket-pv - source: community/modules/file-system/gke-persistent-volume + source: modules/file-system/gke-persistent-volume use: [gke_cluster, data-bucket] settings: {capacity_gb: 5000} @@ -81,13 +81,13 @@ deployment_groups: settings: {local_mount: /shared} - id: shared-filestore-pv - source: community/modules/file-system/gke-persistent-volume + source: modules/file-system/gke-persistent-volume use: [gke_cluster, filestore] ### Shared Storage Job ### - id: shared-fs-job - source: community/modules/compute/gke-job-template + source: modules/compute/gke-job-template use: - gke_cluster - debug_pool @@ -117,7 +117,7 @@ deployment_groups: ### Ephemeral Storage ### - id: local-ssd-pool - source: community/modules/compute/gke-node-pool + source: modules/compute/gke-node-pool use: [gke_cluster] settings: name: local-ssd @@ -125,7 +125,7 @@ deployment_groups: local_ssd_count_ephemeral_storage: 1 - id: ephemeral-storage-job - source: community/modules/compute/gke-job-template + source: modules/compute/gke-job-template use: [local-ssd-pool] settings: name: ephemeral-storage-job diff --git a/modules/README.md b/modules/README.md index d369c65c97..ef23479bf4 100644 --- a/modules/README.md +++ b/modules/README.md @@ -58,8 +58,8 @@ Modules that are still in development and less stable are labeled with the Notebook. Primarily used for [FSI - MonteCarlo Tutorial][fsi-montecarlo-on-batch-tutorial]. [vm-instance]: compute/vm-instance/README.md -[gke-node-pool]: ../community/modules/compute/gke-node-pool/README.md -[gke-job-template]: ../community/modules/compute/gke-job-template/README.md +[gke-node-pool]: ../modules/compute/gke-node-pool/README.md +[gke-job-template]: ../modules/compute/gke-job-template/README.md [schedmd-slurm-gcp-v5-partition]: ../community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md [schedmd-slurm-gcp-v5-node-group]: ../community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md [schedmd-slurm-gcp-v6-partition]: ../community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md @@ -108,7 +108,7 @@ Modules that are still in development and less stable are labeled with the [intel-daos]: ../community/modules/file-system/Intel-DAOS/README.md [nfs-server]: ../community/modules/file-system/nfs-server/README.md [cloud-storage-bucket]: ../community/modules/file-system/cloud-storage-bucket/README.md -[gke-persistent-volume]: ../community/modules/file-system/gke-persistent-volume/README.md +[gke-persistent-volume]: ../modules/file-system/gke-persistent-volume/README.md ### Monitoring @@ -209,7 +209,7 @@ Pub/Sub subscription. 
Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [batch-job-template]: ../modules/scheduler/batch-job-template/README.md [batch-login-node]: ../modules/scheduler/batch-login-node/README.md -[gke-cluster]: ../community/modules/scheduler/gke-cluster/README.md +[gke-cluster]: ../modules/scheduler/gke-cluster/README.md [htcondor-setup]: ../community/modules/scheduler/htcondor-setup/README.md [htcondor-pool-secrets]: ../community/modules/scheduler/htcondor-pool-secrets/README.md [htcondor-access-point]: ../community/modules/scheduler/htcondor-access-point/README.md diff --git a/modules/compute/gke-job-template/README.md b/modules/compute/gke-job-template/README.md index 3433af53f6..cf14d8b24d 100644 --- a/modules/compute/gke-job-template/README.md +++ b/modules/compute/gke-job-template/README.md @@ -19,7 +19,7 @@ The following example creates a GKE job template file. ```yaml - id: job-template - source: community/modules/compute/gke-job-template + source: modules/compute/gke-job-template use: [compute_pool] settings: node_count: 3 diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index fa2b31a761..a49823ea41 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -13,7 +13,7 @@ The following example creates a GKE node group. ```yaml - id: compute_pool - source: community/modules/compute/gke-node-pool + source: modules/compute/gke-node-pool use: [gke_cluster] ``` @@ -83,7 +83,7 @@ fixed number of attached GPUs: ```yaml - id: simple-a2-pool - source: community/modules/compute/gke-node-pool + source: modules/compute/gke-node-pool use: [gke_cluster] settings: machine_type: a2-highgpu-1g @@ -105,7 +105,7 @@ an A100 GPU: ```yaml - id: multi-instance-gpu-pool - source: community/modules/compute/gke-node-pool + source: modules/compute/gke-node-pool use: [gke_cluster] settings: machine_type: a2-highgpu-1g @@ -128,7 +128,7 @@ The following is an example of ```yaml - id: time-sharing-gpu-pool - source: community/modules/compute/gke-node-pool + source: modules/compute/gke-node-pool use: [gke_cluster] settings: machine_type: a2-highgpu-1g @@ -146,7 +146,7 @@ Finally, the following is an example of using a GPU attached to an `n1` machine: ```yaml - id: t4-pool - source: community/modules/compute/gke-node-pool + source: modules/compute/gke-node-pool use: [gke_cluster] settings: machine_type: n1-standard-16 diff --git a/community/modules/file-system/gke-persistent-volume/README.md b/modules/file-system/gke-persistent-volume/README.md similarity index 95% rename from community/modules/file-system/gke-persistent-volume/README.md rename to modules/file-system/gke-persistent-volume/README.md index a1af5cfc09..b618a8e278 100644 --- a/community/modules/file-system/gke-persistent-volume/README.md +++ b/modules/file-system/gke-persistent-volume/README.md @@ -20,7 +20,7 @@ The following example creates a Filestore and then uses the ```yaml - id: gke_cluster - source: community/modules/scheduler/gke-cluster + source: modules/scheduler/gke-cluster use: [network1] settings: master_authorized_networks: @@ -34,11 +34,11 @@ The following example creates a Filestore and then uses the local_mount: /data - id: datafs-pv - source: community/modules/file-system/gke-persistent-volume + source: modules/file-system/gke-persistent-volume use: [datafs, gke_cluster] - id: job-template - source: community/modules/compute/gke-job-template + source: modules/compute/gke-job-template use: [datafs-pv, compute_pool] ``` @@ -48,7 +48,7 @@ The 
following example creates a GCS bucket and then uses the ```yaml - id: gke_cluster - source: community/modules/scheduler/gke-cluster + source: modules/scheduler/gke-cluster use: [network1] settings: master_authorized_networks: @@ -61,11 +61,11 @@ The following example creates a GCS bucket and then uses the local_mount: /data - id: datafs-pv - source: community/modules/file-system/gke-persistent-volume + source: modules/file-system/gke-persistent-volume use: [data-bucket, gke_cluster] - id: job-template - source: community/modules/compute/gke-job-template + source: modules/compute/gke-job-template use: [datafs-pv, compute_pool, gke_cluster] ``` diff --git a/community/modules/file-system/gke-persistent-volume/main.tf b/modules/file-system/gke-persistent-volume/main.tf similarity index 100% rename from community/modules/file-system/gke-persistent-volume/main.tf rename to modules/file-system/gke-persistent-volume/main.tf diff --git a/community/modules/file-system/gke-persistent-volume/metadata.yaml b/modules/file-system/gke-persistent-volume/metadata.yaml similarity index 100% rename from community/modules/file-system/gke-persistent-volume/metadata.yaml rename to modules/file-system/gke-persistent-volume/metadata.yaml diff --git a/community/modules/file-system/gke-persistent-volume/outputs.tf b/modules/file-system/gke-persistent-volume/outputs.tf similarity index 100% rename from community/modules/file-system/gke-persistent-volume/outputs.tf rename to modules/file-system/gke-persistent-volume/outputs.tf diff --git a/community/modules/file-system/gke-persistent-volume/templates/filestore-pv.yaml.tftpl b/modules/file-system/gke-persistent-volume/templates/filestore-pv.yaml.tftpl similarity index 100% rename from community/modules/file-system/gke-persistent-volume/templates/filestore-pv.yaml.tftpl rename to modules/file-system/gke-persistent-volume/templates/filestore-pv.yaml.tftpl diff --git a/community/modules/file-system/gke-persistent-volume/templates/filestore-pvc.yaml.tftpl b/modules/file-system/gke-persistent-volume/templates/filestore-pvc.yaml.tftpl similarity index 100% rename from community/modules/file-system/gke-persistent-volume/templates/filestore-pvc.yaml.tftpl rename to modules/file-system/gke-persistent-volume/templates/filestore-pvc.yaml.tftpl diff --git a/community/modules/file-system/gke-persistent-volume/templates/gcs-pv.yaml.tftpl b/modules/file-system/gke-persistent-volume/templates/gcs-pv.yaml.tftpl similarity index 100% rename from community/modules/file-system/gke-persistent-volume/templates/gcs-pv.yaml.tftpl rename to modules/file-system/gke-persistent-volume/templates/gcs-pv.yaml.tftpl diff --git a/community/modules/file-system/gke-persistent-volume/templates/gcs-pvc.yaml.tftpl b/modules/file-system/gke-persistent-volume/templates/gcs-pvc.yaml.tftpl similarity index 100% rename from community/modules/file-system/gke-persistent-volume/templates/gcs-pvc.yaml.tftpl rename to modules/file-system/gke-persistent-volume/templates/gcs-pvc.yaml.tftpl diff --git a/community/modules/file-system/gke-persistent-volume/variables.tf b/modules/file-system/gke-persistent-volume/variables.tf similarity index 100% rename from community/modules/file-system/gke-persistent-volume/variables.tf rename to modules/file-system/gke-persistent-volume/variables.tf diff --git a/community/modules/file-system/gke-persistent-volume/versions.tf b/modules/file-system/gke-persistent-volume/versions.tf similarity index 100% rename from community/modules/file-system/gke-persistent-volume/versions.tf rename 
to modules/file-system/gke-persistent-volume/versions.tf diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index f98ce0fcc5..996bd10b71 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -26,7 +26,7 @@ requirements. ip_cidr_range: 10.0.32.0/20 - id: gke_cluster - source: community/modules/scheduler/gke-cluster + source: modules/scheduler/gke-cluster use: [network1] ``` diff --git a/pkg/modulereader/metadata_legacy.go b/pkg/modulereader/metadata_legacy.go index 2571d262fd..1c55ddf3fc 100644 --- a/pkg/modulereader/metadata_legacy.go +++ b/pkg/modulereader/metadata_legacy.go @@ -87,10 +87,10 @@ func defaultAPIList(source string) []string { "iam.googleapis.com", "storage.googleapis.com", }, - "community/modules/compute/gke-node-pool": { + "modules/compute/gke-node-pool": { "container.googleapis.com", }, - "community/modules/scheduler/gke-cluster": { + "modules/scheduler/gke-cluster": { "container.googleapis.com", }, "modules/scheduler/batch-job-template": { diff --git a/tools/cloud-build/daily-tests/builds/gke.yaml b/tools/cloud-build/daily-tests/builds/gke.yaml index bc1c6c3e9d..6ef10c5859 100644 --- a/tools/cloud-build/daily-tests/builds/gke.yaml +++ b/tools/cloud-build/daily-tests/builds/gke.yaml @@ -36,7 +36,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=community/examples/hpc-gke.yaml + SG_EXAMPLE=examples/hpc-gke.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} @@ -47,7 +47,7 @@ steps: echo ' zone: us-central1-a' >> $${SG_EXAMPLE} echo ' - id: ubuntu_pool' >> $${SG_EXAMPLE} - echo ' source: community/modules/compute/gke-node-pool' >> $${SG_EXAMPLE} + echo ' source: compute/gke-node-pool' >> $${SG_EXAMPLE} echo ' use: [gke_cluster]' >> $${SG_EXAMPLE} echo ' settings: {name: ubuntu, image_type: UBUNTU_CONTAINERD}' >> $${SG_EXAMPLE} diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py index d67fb391b3..e63cbabd40 100644 --- a/tools/duplicate-diff.py +++ b/tools/duplicate-diff.py @@ -46,7 +46,7 @@ "community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf", ], [ - "community/modules/compute/gke-node-pool/threads_per_core_calc.tf", + "modules/compute/gke-node-pool/threads_per_core_calc.tf", "modules/compute/vm-instance/threads_per_core_calc.tf", ], [ # Slurm V5 From 4301f817e991614a629068a170a1b580e184bf2e Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Fri, 12 Jul 2024 20:31:14 +0000 Subject: [PATCH 012/118] update gke tests to reflect migration --- examples/README.md | 4 ++-- tools/cloud-build/daily-tests/builds/gke-storage.yaml | 2 +- tools/cloud-build/daily-tests/tests/gke-storage.yml | 2 +- tools/cloud-build/daily-tests/tests/gke.yml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/README.md b/examples/README.md index 3c3c976636..7c397a7e66 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1371,7 +1371,7 @@ secondary IP ranges defined. The `gke-job-template` module is used to create a job file that can be submitted to the cluster using `kubectl` and will run on the specified node pool. -[hpc-gke.yaml]: ../community/examples/hpc-gke.yaml +[hpc-gke.yaml]: ../examples/hpc-gke.yaml ### [ml-gke.yaml] ![community-badge] ![experimental-badge] @@ -1452,7 +1452,7 @@ cleaned up when the job is deleted. > `--vars authorized_cidr=/32`.** You can use a service like > [whatismyip.com](https://whatismyip.com) to determine your IP address. 
-[storage-gke.yaml]: ../community/examples/storage-gke.yaml +[storage-gke.yaml]: ../examples/storage-gke.yaml ### [htc-htcondor.yaml] ![community-badge] ![experimental-badge] diff --git a/tools/cloud-build/daily-tests/builds/gke-storage.yaml b/tools/cloud-build/daily-tests/builds/gke-storage.yaml index 6c4ad9cf26..16d8b92587 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage.yaml @@ -40,7 +40,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=community/examples/storage-gke.yaml + SG_EXAMPLE=examples/storage-gke.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} diff --git a/tools/cloud-build/daily-tests/tests/gke-storage.yml b/tools/cloud-build/daily-tests/tests/gke-storage.yml index 8deb56d90f..9beb5eba4c 100644 --- a/tools/cloud-build/daily-tests/tests/gke-storage.yml +++ b/tools/cloud-build/daily-tests/tests/gke-storage.yml @@ -16,7 +16,7 @@ test_name: storage-gke deployment_name: gke-storage-{{ build }} zone: us-central1-a # for remote node workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/storage-gke.yaml" +blueprint_yaml: "{{ workspace }}/examples/storage-gke.yaml" network: "{{ test_name }}-net" remote_node: "{{ deployment_name }}-0" post_deploy_tests: [] diff --git a/tools/cloud-build/daily-tests/tests/gke.yml b/tools/cloud-build/daily-tests/tests/gke.yml index d6a0fce885..7f4e97bd52 100644 --- a/tools/cloud-build/daily-tests/tests/gke.yml +++ b/tools/cloud-build/daily-tests/tests/gke.yml @@ -16,7 +16,7 @@ test_name: hpc-gke deployment_name: gke-{{ build }} zone: us-central1-a # for remote node workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/hpc-gke.yaml" +blueprint_yaml: "{{ workspace }}/examples/hpc-gke.yaml" network: "{{ deployment_name }}-net" remote_node: "{{ deployment_name }}-0" post_deploy_tests: [] From 3886325685ae1905ed23889a96d8b0538fc1209e Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Fri, 12 Jul 2024 22:18:40 +0000 Subject: [PATCH 013/118] move pre-existing-gke-cluster to core --- modules/README.md | 12 ++++++------ .../scheduler/pre-existing-gke-cluster/README.md | 2 +- .../scheduler/pre-existing-gke-cluster/main.tf | 0 .../scheduler/pre-existing-gke-cluster/metadata.yaml | 0 .../scheduler/pre-existing-gke-cluster/outputs.tf | 0 .../scheduler/pre-existing-gke-cluster/variables.tf | 0 .../scheduler/pre-existing-gke-cluster/versions.tf | 0 tools/duplicate-diff.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) rename {community/modules => modules}/scheduler/pre-existing-gke-cluster/README.md (98%) rename {community/modules => modules}/scheduler/pre-existing-gke-cluster/main.tf (100%) rename {community/modules => modules}/scheduler/pre-existing-gke-cluster/metadata.yaml (100%) rename {community/modules => modules}/scheduler/pre-existing-gke-cluster/outputs.tf (100%) rename {community/modules => modules}/scheduler/pre-existing-gke-cluster/variables.tf (100%) rename {community/modules => modules}/scheduler/pre-existing-gke-cluster/versions.tf (100%) diff --git a/modules/README.md b/modules/README.md index 77f02242ee..9a039cd61d 100644 --- a/modules/README.md +++ b/modules/README.md @@ -47,9 +47,9 @@ Modules that are still in development and less stable are labeled with the Creates a TPU nodeset to be used by the [schedmd-slurm-gcp-v6-partition] module. 
* **[schedmd-slurm-gcp-v6-nodeset-dynamic]** ![community-badge] ![experimental-badge]: Creates a dynamic nodeset to be used by the [schedmd-slurm-gcp-v6-partition] module and instance template. -* **[gke-node-pool]** ![community-badge] ![experimental-badge] : Creates a +* **[gke-node-pool]** ![core-badge] ![experimental-badge] : Creates a Kubernetes node pool using GKE. -* **[gke-job-template]** ![community-badge] ![experimental-badge] : Creates a +* **[gke-job-template]** ![core-badge] ![experimental-badge] : Creates a Kubernetes job file to be used with a [gke-node-pool]. * **[htcondor-execute-point]** ![community-badge] ![experimental-badge] : Manages a group of execute points for use in an [HTCondor @@ -104,7 +104,7 @@ Modules that are still in development and less stable are labeled with the * **[Intel-DAOS]** ![community-badge] : Creates a [DAOS](https://docs.daos.io/) file system. * **[cloud-storage-bucket]** ![community-badge] ![experimental-badge] : Creates a Google Cloud Storage (GCS) bucket. -* **[gke-persistent-volume]** ![community-badge] ![experimental-badge] : Creates persistent volumes and persistent volume claims for shared storage. +* **[gke-persistent-volume]** ![core-badge] ![experimental-badge] : Creates persistent volumes and persistent volume claims for shared storage. * **[nfs-server]** ![community-badge] ![experimental-badge] : Creates a VM and configures an NFS server that can be mounted by other VM. @@ -189,9 +189,9 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca template that works with other Toolkit modules. * **[batch-login-node]** ![core-badge] : Creates a VM that can be used for submission of Google Cloud Batch jobs. -* **[gke-cluster]** ![community-badge] ![experimental-badge] : Creates a +* **[gke-cluster]** ![core-badge] ![experimental-badge] : Creates a Kubernetes cluster using GKE. -* **[pre-existing-gke-cluster]** ![community-badge] ![experimental-badge] : Retrieves an existing GKE cluster. Substitute for ([gke-cluster]) module. +* **[pre-existing-gke-cluster]** ![core-badge] ![experimental-badge] : Retrieves an existing GKE cluster. Substitute for ([gke-cluster]) module. * **[schedmd-slurm-gcp-v5-controller]** ![community-badge] : Creates a Slurm controller node using [slurm-gcp-version-5]. * **[schedmd-slurm-gcp-v5-login]** ![community-badge] : @@ -218,7 +218,7 @@ Pub/Sub subscription. 
Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [batch-job-template]: ../modules/scheduler/batch-job-template/README.md [batch-login-node]: ../modules/scheduler/batch-login-node/README.md [gke-cluster]: ../modules/scheduler/gke-cluster/README.md -[pre-existing-gke-cluster]: ../community/modules/scheduler/pre-existing-gke-cluster/README.md +[pre-existing-gke-cluster]: ../modules/scheduler/pre-existing-gke-cluster/README.md [htcondor-setup]: ../community/modules/scheduler/htcondor-setup/README.md [htcondor-pool-secrets]: ../community/modules/scheduler/htcondor-pool-secrets/README.md [htcondor-access-point]: ../community/modules/scheduler/htcondor-access-point/README.md diff --git a/community/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md similarity index 98% rename from community/modules/scheduler/pre-existing-gke-cluster/README.md rename to modules/scheduler/pre-existing-gke-cluster/README.md index ebd4950e1b..73a75b1117 100644 --- a/community/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/modules/scheduler/pre-existing-gke-cluster/README.md @@ -17,7 +17,7 @@ GKE node pool will be created. ```yaml - id: existing-gke-cluster - source: community/modules/scheduler/pre-existing-gke-cluster + source: modules/scheduler/pre-existing-gke-cluster settings: project_id: $(vars.project_id) cluster_name: my-gke-cluster diff --git a/community/modules/scheduler/pre-existing-gke-cluster/main.tf b/modules/scheduler/pre-existing-gke-cluster/main.tf similarity index 100% rename from community/modules/scheduler/pre-existing-gke-cluster/main.tf rename to modules/scheduler/pre-existing-gke-cluster/main.tf diff --git a/community/modules/scheduler/pre-existing-gke-cluster/metadata.yaml b/modules/scheduler/pre-existing-gke-cluster/metadata.yaml similarity index 100% rename from community/modules/scheduler/pre-existing-gke-cluster/metadata.yaml rename to modules/scheduler/pre-existing-gke-cluster/metadata.yaml diff --git a/community/modules/scheduler/pre-existing-gke-cluster/outputs.tf b/modules/scheduler/pre-existing-gke-cluster/outputs.tf similarity index 100% rename from community/modules/scheduler/pre-existing-gke-cluster/outputs.tf rename to modules/scheduler/pre-existing-gke-cluster/outputs.tf diff --git a/community/modules/scheduler/pre-existing-gke-cluster/variables.tf b/modules/scheduler/pre-existing-gke-cluster/variables.tf similarity index 100% rename from community/modules/scheduler/pre-existing-gke-cluster/variables.tf rename to modules/scheduler/pre-existing-gke-cluster/variables.tf diff --git a/community/modules/scheduler/pre-existing-gke-cluster/versions.tf b/modules/scheduler/pre-existing-gke-cluster/versions.tf similarity index 100% rename from community/modules/scheduler/pre-existing-gke-cluster/versions.tf rename to modules/scheduler/pre-existing-gke-cluster/versions.tf diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py index 00bfe2eeb6..567cdda571 100644 --- a/tools/duplicate-diff.py +++ b/tools/duplicate-diff.py @@ -45,7 +45,7 @@ "community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf", "community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf", "community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf", - "community/modules/compute/gke-node-pool/gpu_definition.tf", + "modules/compute/gke-node-pool/gpu_definition.tf", ], [ "modules/compute/gke-node-pool/threads_per_core_calc.tf", From 4f270d6c76e53471b1e5d99a8650b6981ab11094 Mon Sep 17 
00:00:00 2001 From: chengcongdu Date: Fri, 12 Jul 2024 22:45:52 +0000 Subject: [PATCH 014/118] fix pre-commit --- examples/README.md | 2 +- tools/cloud-build/daily-tests/builds/ml-gke.yaml | 2 +- tools/cloud-build/daily-tests/tests/ml-gke.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/README.md b/examples/README.md index 226d6d66d5..77a6c5b06a 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1445,7 +1445,7 @@ guest_accelerator: Once you have deployed the blueprint, follow output instructions to _fetch credentials for the created cluster_ and _submit a job calling `nvidia_smi`_. -[ml-gke.yaml]: ../community/examples/ml-gke.yaml +[ml-gke.yaml]: ../examples/ml-gke.yaml [`kubernetes-operations`]: ../community/modules/scripts/kubernetes-operations/README.md ### [storage-gke.yaml] ![community-badge] ![experimental-badge] diff --git a/tools/cloud-build/daily-tests/builds/ml-gke.yaml b/tools/cloud-build/daily-tests/builds/ml-gke.yaml index 80033c5f9f..83da847769 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-gke.yaml @@ -36,7 +36,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=community/examples/ml-gke.yaml + SG_EXAMPLE=examples/ml-gke.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} diff --git a/tools/cloud-build/daily-tests/tests/ml-gke.yml b/tools/cloud-build/daily-tests/tests/ml-gke.yml index 7c475a2fae..d26cab3869 100644 --- a/tools/cloud-build/daily-tests/tests/ml-gke.yml +++ b/tools/cloud-build/daily-tests/tests/ml-gke.yml @@ -17,7 +17,7 @@ deployment_name: ml-gke-{{ build }} region: asia-southeast1 zone: asia-southeast1-b # for remote node workspace: /workspace -blueprint_yaml: "{{ workspace }}/community/examples/ml-gke.yaml" +blueprint_yaml: "{{ workspace }}/examples/ml-gke.yaml" network: "{{ deployment_name }}-net" remote_node: "{{ deployment_name }}-0" cli_deployment_vars: From 4225bbdfa1445815190866cf47321fce79f7bfad Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 12 Jul 2024 23:31:01 +0000 Subject: [PATCH 015/118] Add terraform version for `ghpc --version` --- cmd/root.go | 13 +++++++++++-- pkg/shell/terraform.go | 38 +++++++++++++++++++++++++++++++++----- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index b58fee9041..ba643aa4e7 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -22,6 +22,7 @@ import ( "fmt" "hpc-toolkit/pkg/config" "hpc-toolkit/pkg/logging" + "hpc-toolkit/pkg/shell" "os" "os/exec" "path/filepath" @@ -79,13 +80,21 @@ func Execute() error { if len(GitBranch) == 0 { GitBranch = "detached HEAD" } + annotation["version"] = GitTagVersion annotation["branch"] = GitBranch annotation["commitInfo"] = GitCommitInfo - rootCmd.SetVersionTemplate(`ghpc version {{index .Annotations "version"}} + tmpl := `ghpc version {{index .Annotations "version"}} Built from '{{index .Annotations "branch"}}' branch. 
Commit info: {{index .Annotations "commitInfo"}} -`) +` + tfVersion, _ := shell.TfVersion() + if tfVersion != "" { + annotation["tfVersion"] = tfVersion + tmpl += `Terraform version: {{index .Annotations "tfVersion"}} +` + } + rootCmd.SetVersionTemplate(tmpl) } return rootCmd.Execute() diff --git a/pkg/shell/terraform.go b/pkg/shell/terraform.go index da25f073fe..99b1008482 100644 --- a/pkg/shell/terraform.go +++ b/pkg/shell/terraform.go @@ -54,14 +54,21 @@ type outputValue struct { Value cty.Value } -// ConfigureTerraform returns a Terraform object used to execute commands -func ConfigureTerraform(workingDir string) (*tfexec.Terraform, error) { +func tfExecPath() (string, error) { path, err := exec.LookPath("terraform") if err != nil { - return nil, config.HintError{ + return "", config.HintError{ Hint: "must have a copy of terraform installed in PATH (obtain at https://terraform.io)", - Err: err, - } + Err: err} + } + return path, nil +} + +// ConfigureTerraform returns a Terraform object used to execute commands +func ConfigureTerraform(workingDir string) (*tfexec.Terraform, error) { + path, err := tfExecPath() + if err != nil { + return nil, err } return tfexec.NewTerraform(workingDir, path) } @@ -424,3 +431,24 @@ func ImportInputs(groupDir string, artifactsDir string, bp config.Blueprint) err func Destroy(tf *tfexec.Terraform, b ApplyBehavior) error { return applyOrDestroy(tf, b, true) } + +func TfVersion() (string, error) { + path, err := tfExecPath() + if err != nil { + return "", err + } + + out, err := exec.Command(path, "version", "--json").Output() + if err != nil { + return "", err + } + + var version struct { + TerraformVersion string `json:"terraform_version"` + } + if err := json.Unmarshal(out, &version); err != nil { + return "", err + } + + return version.TerraformVersion, nil +} From 86b76b548c34d43a3d5bd310af84c3c5d64a82ae Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 2 Jul 2024 22:47:42 +0000 Subject: [PATCH 016/118] Move slurm_files from slurm-gcp repo here --- .../modules/slurm_files/README.md | 200 ++ .../modules/slurm_files/README_TF.md | 119 + .../slurm_files/files/external_epilog.sh | 18 + .../slurm_files/files/external_prolog.sh | 18 + .../slurm_files/files/setup_external.sh | 113 + .../modules/slurm_files/main.tf | 297 +++ .../modules/slurm_files/outputs.tf | 60 + .../modules/slurm_files/scripts/conf.py | 499 ++++ .../modules/slurm_files/scripts/load_bq.py | 329 +++ .../modules/slurm_files/scripts/resume.py | 709 ++++++ .../modules/slurm_files/scripts/setup.py | 545 +++++ .../scripts/setup_network_storage.py | 307 +++ .../scripts/slurm_gcp_plugins/README.md | 107 + .../scripts/slurm_gcp_plugins/__init__.py | 135 ++ .../slurm_gcp_plugins/max_hops/README.md | 38 + .../slurm_gcp_plugins/max_hops/__init__.py | 58 + .../slurm_gcp_plugins/test_plugin/README.md | 16 + .../slurm_gcp_plugins/test_plugin/__init__.py | 27 + .../slurm_gcp_plugins/utils/__init__.py | 56 + .../modules/slurm_files/scripts/slurmsync.py | 575 +++++ .../modules/slurm_files/scripts/startup.sh | 197 ++ .../modules/slurm_files/scripts/suspend.py | 184 ++ .../slurm_files/scripts/tests/README.md | 6 + .../scripts/tests/test_topology.py | 120 + .../slurm_files/scripts/tests/test_util.py | 148 ++ .../modules/slurm_files/scripts/util.py | 2083 +++++++++++++++++ .../modules/slurm_files/variables.tf | 467 ++++ .../modules/slurm_files/versions.tf | 37 + .../slurm_files.tf | 2 +- 29 files changed, 7469 insertions(+), 1 deletion(-) create mode 100644 
community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README_TF.md create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/external_epilog.sh create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/external_prolog.sh create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/setup_external.sh create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/outputs.tf create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/README.md create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/__init__.py create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/README.md create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/__init__.py create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/test_plugin/README.md create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/test_plugin/__init__.py create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/utils/__init__.py create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/startup.sh create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/README.md create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py create mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/versions.tf diff --git 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md new file mode 100644 index 0000000000..12a8861937 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -0,0 +1,200 @@ +# Module: Slurm Cluster + +[FAQ](../../docs/faq.md) | [Troubleshooting](../../docs/troubleshooting.md) | +[Glossary](../../docs/glossary.md) + + + +- [Module: Slurm Cluster](#module-slurm-cluster) + - [Overview](#overview) + - [Usage](#usage) + - [Dependencies](#dependencies) + - [Software](#software) + - [Required](#required) + - [Optional](#optional) + - [TerraformUser](#terraformuser) + - [Required](#required-1) + - [Optional](#optional-1) + - [Controller SA](#controller-sa) + - [Required](#required-2) + - [Optional](#optional-2) + - [Compute SA](#compute-sa) + - [Optional](#optional-3) + - [Login SA](#login-sa) + - [Optional](#optional-4) + - [Module API](#module-api) + + + +## Overview + +This module creates a [Slurm](../../docs/glossary.md#slurm) cluster on +[GCP](../../docs/glossary.md#gcp). There are two modes of operation: cloud; and +hybrid. Cloud mode will create a VM controller. Hybrid mode will generate +`cloud.conf` and `cloud_gres.conf` files to be included in the on-prem +configuration files, while managing a `config.yaml` file for internal module +use. + +Partitions define what compute resources are available to the controller so it +may allocate jobs. Slurm will resume/create compute instances as needed to run +allocated jobs and will suspend/terminate the instances after they are no longer +needed (e.g. IDLE for SuspendTimeout duration). Static nodes are persistent; +they are exempt from being suspended/terminated under normal conditions. Dynamic +nodes are burstable; they will scale up and down with workload. + +> **WARNING:** Destroying the controller before it has suspended/terminated all +> static and dynamic node instances and supporting resources (e.g. placement +> groups, subscription) will leave those resources orphaned unless cleanup +> options are enabled (.e.g `enable_cleanup_compute`, +> `enable_cleanup_subscriptions`). + +## Usage + +See [examples](./examples/slurm_cluster/) directory for sample usages. + +See below for a simple inclusion within your own terraform project. + +```hcl +module "slurm_cluster" { + source = "git@github.com:SchedMD/slurm-gcp.git//terraform/slurm_cluster?ref=v5.0.0" + + project_id = "" + + slurm_cluster_name = "" + + # ... omitted ... +} +``` + +> **NOTE:** Because this module is not hosted on +> [Terraform Registry](../../docs/glossary.md#terraform-registry), the version +> must be strictly controlled via +> [revision](https://www.terraform.io/language/modules/sources#selecting-a-revision) +> syntax on the source line. + +## Dependencies + +### Software + +Certain software must be installed on the local machine or APIs enabled in +[GCP](../../docs/glossary.md#gcp) for +[TerraformUser](../../docs/glossary.md#terraformuser) to be able to use this +module. + +#### Required + +- [Terraform](https://www.terraform.io/downloads.html) is installed. +- [GCP Cloud SDK](https://cloud.google.com/sdk/downloads) is installed. +- [Compute Engine API](../../docs/glossary.md#compute-engine) is enabled. + +#### Optional + +- [Python](../../docs/glossary.md#python) is installed. 
+ - Required Version: `>= 3.6.0, < 4.0.0` + - Required when any of: + - `enable_hybrid=true` + - `enable_cleanup_compute=true` + - `enable_cleanup_subscriptions=true` + - `enable_reconfigure=true` +- [Pip](../../../docs/glossary.md#pip) packages are installed. + - Required when any of: + - `enable_hybrid=true` + - `enable_cleanup_compute=true` + - `enable_cleanup_subscriptions=true` + - `enable_reconfigure=true` + - `pip3 install -r ../../scripts/requirements.txt --user` +- [Private Google Access](../../docs/glossary.md#private-google-access) is + enabled. + - Required when any instances only have internal IPs. +- [Secret Manager API](../../docs/glossary.md#secret-manager) is enabled. + - Required when `cloudsql != null`. +- [Pub/Sub API](../../docs/glossary.md#pubsub) is enabled. + - Required when any of: + - `enable_cleanup_subscriptions=true` + - `enable_reconfigure=true` +- [Bigquery API](../../docs/glossary.md#bigquery) is enabled. + - Required when `enable_bigquery_load=true`. + +### TerraformUser + +[TerraformUser](../../docs/glossary.md#terraformuser) authenticates with +credentials to [Google Cloud](../../docs/glossary.md#gcp). It is recommended to +create a principal [IAM](../../docs/glossary.md#iam) for this user and associate +[roles](../../docs/glossary.md#iam-roles) to them. Optionally, the TerraformUser +can operate through a [service account](../../docs/glossary.md#service-account). + +#### Required + +- Compute Instance Admin (v1) (`roles/compute.instanceAdmin.v1`) + +#### Optional + +- Pub/Sub Admin (`roles/pubsub.admin`) + - Required when `enable_reconfigure=true`. +- Secret Manager Admin (`roles/secretmanager.admin`) + - Required when `cloudsql != null`. +- Service Account User (`roles/iam.serviceAccountUser`) + - Required when [TerraformUser](../../docs/glossary.md#terraformuser) is using + an [service account](../../docs/glossary.md#service-account) to + authenticate. + +### Controller SA + +[Service account](../../docs/glossary.md#service-account) intended to be +associated with the controller +[instance template](../../docs/glossary.md#instance-template) for +[slurm_controller_instance](../slurm_controller_instance/). + +#### Required + +- Compute Instance Admin (v1) (`roles/compute.instanceAdmin.v1`) +- Compute Instance Admin (beta) (`roles/compute.instanceAdmin`) +- Service Account User (`roles/iam.serviceAccountUser`) + +#### Optional + +- BigQuery Data Editor (`roles/bigquery.dataEditor`) + - Required when `enable_bigquery_load=true`. +- Cloud SQL Editor (`roles/cloudsql.editor`) + - Required when all of: + - `cloudsql != null` + - Communicating to CloudSQL instance +- Logs Writer (`roles/logging.logWriter`) + - Recommended. +- Monitoring Metric Writer (`roles/monitoring.metricWriter`) + - Recommended. +- Pub/Sub Admin (`roles/pubsub.admin`) + - Required when `enable_reconfigure=true`. + +### Compute SA + +[Service account](../../docs/glossary.md#service-account) intended to be +associated with the compute +[instance templates](../../docs/glossary.md#instance-template) created by +[slurm_partition](../slurm_partition/). + +#### Optional + +- Logs Writer (`roles/logging.logWriter`) + - Recommended. +- Monitoring Metric Writer (`roles/monitoring.metricWriter`) + - Recommended. + +### Login SA + +[Service account](../../docs/glossary.md#service-account) intended to be +associated with the login +[instance templates](../../docs/glossary.md#instance-template) created by +[slurm_partition](../slurm_partition/). 
+ +#### Optional + +- Logs Writer (`roles/logging.logWriter`) + - Recommended. +- Monitoring Metric Writer (`roles/monitoring.metricWriter`) + - Recommended. + +## Module API + +For the terraform module API reference, please see +[README_TF.md](./README_TF.md). diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README_TF.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README_TF.md new file mode 100644 index 0000000000..f30666aec0 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README_TF.md @@ -0,0 +1,119 @@ +# bucket_files + + +Copyright (C) SchedMD LLC. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.3 | +| [archive](#requirement\_archive) | ~> 2.0 | +| [google](#requirement\_google) | >= 3.53 | +| [local](#requirement\_local) | ~> 2.0 | +| [random](#requirement\_random) | ~> 3.0 | + +## Providers + +| Name | Version | +|------|---------| +| [archive](#provider\_archive) | ~> 2.0 | +| [google](#provider\_google) | >= 3.53 | +| [local](#provider\_local) | ~> 2.0 | +| [random](#provider\_random) | ~> 3.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [google_storage_bucket_object.compute_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.controller_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.devel](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.epilog_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.login_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.nodeset_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.prolog_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [random_uuid.cluster_id](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/uuid) | resource | +| [archive_file.slurm_gcp_devel_zip](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | +| 
[google_storage_bucket.this](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/storage_bucket) | data source | +| [local_file.external_epilog](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | +| [local_file.external_prolog](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | +| [local_file.setup_external](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. | `string` | `null` | no | +| [bucket\_name](#input\_bucket\_name) | Name of GCS bucket to use. | `string` | n/a | yes | +| [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
tree_width = optional(number)
})
| `{}` | no | +| [cloudsql\_secret](#input\_cloudsql\_secret) | Secret URI to cloudsql secret. | `string` | `null` | no | +| [compute\_startup\_scripts](#input\_compute\_startup\_scripts) | List of scripts to be run on compute VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | +| [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | +| [controller\_startup\_scripts](#input\_controller\_startup\_scripts) | List of scripts to be run on controller VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | +| [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | +| [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
* /usr/local/etc/slurm
* /etc/munge
* /home
* /apps
If these are disabled, the slurm etc and munge dirs must be added manually,
or some other mechanism must be used to synchronize the slurm conf files
and the munge key across the cluster. | `bool` | `false` | no | +| [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into big query.

NOTE: Requires Google Bigquery API. | `bool` | `false` | no | +| [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. Not for production use. | `bool` | `false` | no | +| [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no | +| [enable\_external\_prolog\_epilog](#input\_enable\_external\_prolog\_epilog) | Automatically enable a script that will execute prolog and epilog scripts
shared by NFS from the controller to compute nodes. Find more details at:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/v5/tools/prologs-epilogs/README.md | `bool` | `false` | no | +| [enable\_hybrid](#input\_enable\_hybrid) | Enables use of hybrid controller mode. When true, controller\_hybrid\_config will
be used instead of controller\_instance\_config and will disable login instances. | `bool` | `false` | no | +| [enable\_slurm\_gcp\_plugins](#input\_enable\_slurm\_gcp\_plugins) | Enables calling hooks in scripts/slurm\_gcp\_plugins during cluster resume and suspend. | `any` | `false` | no | +| [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
|
{
"compute": null
}
| no | +| [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | +| [extra\_logging\_flags](#input\_extra\_logging\_flags) | The list of extra flags for the logging system to use. See the logging\_flags variable in scripts/util.py to get the list of supported log flags. | `map(bool)` | `{}` | no | +| [google\_app\_cred\_path](#input\_google\_app\_cred\_path) | Path to Google Application Credentials. | `string` | `null` | no | +| [install\_dir](#input\_install\_dir) | Directory where the hybrid configuration directory will be installed on the
on-premise controller (e.g. /etc/slurm/hybrid). This updates the prefix path
for the resume and suspend scripts in the generated `cloud.conf` file.

This variable should be used when the TerraformHost and the SlurmctldHost
are different.

This will default to var.output\_dir if null. | `string` | `null` | no | +| [job\_submit\_lua\_tpl](#input\_job\_submit\_lua\_tpl) | Slurm job\_submit.lua template file path. | `string` | `null` | no | +| [login\_network\_storage](#input\_login\_network\_storage) | Storage to be mounted on login and controller instances
* server\_ip : Address of the storage server.
* remote\_mount : The location in the remote instance filesystem to mount from.
* local\_mount : The location on the instance filesystem to mount to.
* fs\_type : Filesystem type (e.g. "nfs").
* mount\_options : Options to mount with. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | +| [login\_startup\_scripts](#input\_login\_startup\_scripts) | List of scripts to be run on login VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | +| [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | +| [munge\_mount](#input\_munge\_mount) | Remote munge mount for compute and login nodes to acquire the munge.key.

By default, the munge mount server will be assumed to be the
`var.slurm_control_host` (or `var.slurm_control_addr` if non-null) when
`server_ip=null`. |
object({
server_ip = string
remote_mount = string
fs_type = string
mount_options = string
})
|
{
"fs_type": "nfs",
"mount_options": "",
"remote_mount": "/etc/munge/",
"server_ip": null
}
| no | +| [network\_storage](#input\_network\_storage) | Storage to be mounted on all instances.
* server\_ip : Address of the storage server.
* remote\_mount : The location in the remote instance filesystem to mount from.
* local\_mount : The location on the instance filesystem to mount to.
* fs\_type : Filesystem type (e.g. "nfs").
* mount\_options : Options to mount with. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Cluster nodesets, as a list. | `list(any)` | `[]` | no | +| [nodeset\_dyn](#input\_nodeset\_dyn) | Cluster nodesets (dynamic), as a list. | `list(any)` | `[]` | no | +| [nodeset\_startup\_scripts](#input\_nodeset\_startup\_scripts) | List of scripts to be run on compute VM startup in the specific nodeset. |
map(list(object({
filename = string
content = string
})))
| `{}` | no | +| [nodeset\_tpu](#input\_nodeset\_tpu) | Cluster nodesets (TPU), as a list. | `list(any)` | `[]` | no | +| [output\_dir](#input\_output\_dir) | Directory where this module will write its files. These files include:
cloud.conf; cloud\_gres.conf; config.yaml; resume.py; suspend.py; and util.py. | `string` | `null` | no | +| [partitions](#input\_partitions) | Cluster partitions as a list. | `list(any)` | `[]` | no | +| [project\_id](#input\_project\_id) | The GCP project ID. | `string` | n/a | yes | +| [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | +| [slurm\_bin\_dir](#input\_slurm\_bin\_dir) | Path to directory of Slurm binary commands (e.g. scontrol, sinfo). If 'null',
then it will be assumed that binaries are in $PATH. | `string` | `null` | no | +| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | The cluster name, used for resource naming and slurm accounting. | `string` | n/a | yes | +| [slurm\_conf\_tpl](#input\_slurm\_conf\_tpl) | Slurm slurm.conf template file path. | `string` | `null` | no | +| [slurm\_control\_addr](#input\_slurm\_control\_addr) | The IP address or a name by which the address can be identified.

This value is passed to slurm.conf such that:
SlurmctldHost={var.slurm\_control\_host}\({var.slurm\_control\_addr}\)

See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldHost | `string` | `null` | no | +| [slurm\_control\_host](#input\_slurm\_control\_host) | The short, or long, hostname of the machine where the Slurm control daemon is
executed (i.e. the name returned by the command "hostname -s").

This value is passed to slurm.conf such that:
SlurmctldHost={var.slurm\_control\_host}\({var.slurm\_control\_addr}\)

See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldHost | `string` | `null` | no | +| [slurm\_control\_host\_port](#input\_slurm\_control\_host\_port) | The port number that the Slurm controller, slurmctld, listens to for work.

See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldPort | `string` | `"6818"` | no | +| [slurm\_log\_dir](#input\_slurm\_log\_dir) | Directory where Slurm logs to. | `string` | `"/var/log/slurm"` | no | +| [slurmdbd\_conf\_tpl](#input\_slurmdbd\_conf\_tpl) | Slurm slurmdbd.conf template file path. | `string` | `null` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [checksum](#output\_checksum) | Checksum of all files written to the bucket. | +| [config](#output\_config) | Cluster configuration. | +| [nodeset](#output\_nodeset) | Cluster nodesets. | +| [nodeset\_dyn](#output\_nodeset\_dyn) | Cluster nodesets (dynamic). | +| [nodeset\_tpu](#output\_nodeset\_tpu) | Cluster nodesets (TPU). | +| [partitions](#output\_partitions) | Cluster partitions. | +| [slurm\_bucket\_path](#output\_slurm\_bucket\_path) | GCS Bucket URI of Slurm cluster file storage. | + diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/external_epilog.sh b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/external_epilog.sh new file mode 100755 index 0000000000..db514fc9e5 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/external_epilog.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if [[ -x /opt/apps/adm/slurm/slurm_epilog ]]; then + exec /opt/apps/adm/slurm/slurm_epilog +fi diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/external_prolog.sh b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/external_prolog.sh new file mode 100755 index 0000000000..37a91bb1ea --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/external_prolog.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
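+# This wrapper hands control to the site-managed prolog entry point when it
+# exists. That entry point is a symlink to the slurm_mux dispatcher created by
+# setup_external.sh and is shared from the controller to compute nodes over
+# NFS; on nodes without the share, the wrapper exits quietly and the job runs
+# without an external prolog.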
+ +if [[ -x /opt/apps/adm/slurm/slurm_prolog ]]; then + exec /opt/apps/adm/slurm/slurm_prolog +fi diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/setup_external.sh b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/setup_external.sh new file mode 100755 index 0000000000..c21f7cbdbd --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/setup_external.sh @@ -0,0 +1,113 @@ +#!/bin/bash +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e -o pipefail + +SLURM_EXTERNAL_ROOT="/opt/apps/adm/slurm" +SLURM_MUX_FILE="slurm_mux" + +mkdir -p "${SLURM_EXTERNAL_ROOT}" +mkdir -p "${SLURM_EXTERNAL_ROOT}/logs" +mkdir -p "${SLURM_EXTERNAL_ROOT}/etc" + +# create common prolog / epilog "multiplex" script +if [ ! -f "${SLURM_EXTERNAL_ROOT}/${SLURM_MUX_FILE}" ]; then + # indentation matters in EOT below; do not blindly edit! + cat <<'EOT' >"${SLURM_EXTERNAL_ROOT}/${SLURM_MUX_FILE}" +#!/bin/bash +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +CMD="${0##*/}" +# Locate script +BASE=$(readlink -f $0) +BASE=${BASE%/*} + +export CLUSTER_ADM_BASE=${BASE} + +# Source config file if it exists for extra DEBUG settings +# used below +SLURM_MUX_CONF=${CLUSTER_ADM_BASE}/etc/slurm_mux.conf +if [[ -r ${SLURM_MUX_CONF} ]]; then + source ${SLURM_MUX_CONF} +fi + +# Setup logging if configured and directory exists +LOGFILE="/dev/null" +if [[ -d ${DEBUG_SLURM_MUX_LOG_DIR} && ${DEBUG_SLURM_MUX_ENABLE_LOG} == "yes" ]]; then + LOGFILE="${DEBUG_SLURM_MUX_LOG_DIR}/${CMD}-${SLURM_SCRIPT_CONTEXT}-job-${SLURMD_NODENAME}.log" + exec >>${LOGFILE} 2>&1 +fi + +# Global scriptlets +for SCRIPTLET in ${BASE}/${SLURM_SCRIPT_CONTEXT}.d/*.${SLURM_SCRIPT_CONTEXT}; do + if [[ -x ${SCRIPTLET} ]]; then + echo "Running ${SCRIPTLET}" + ${SCRIPTLET} $@ >>${LOGFILE} 2>&1 + fi +done + +# Per partition scriptlets +for SCRIPTLET in ${BASE}/partition-${SLURM_JOB_PARTITION}-${SLURM_SCRIPT_CONTEXT}.d/*.${SLURM_SCRIPT_CONTEXT}; do + if [[ -x ${SCRIPTLET} ]]; then + echo "Running ${SCRIPTLET}" + ${SCRIPTLET} $@ >>${LOGFILE} 2>&1 + fi +done +EOT +fi + +# ensure proper permissions on slurm_mux script +chmod 0755 "${SLURM_EXTERNAL_ROOT}/${SLURM_MUX_FILE}" + +# create default slurm_mux configuration file +if [ ! 
-f "${SLURM_EXTERNAL_ROOT}/etc/slurm_mux.conf" ]; then + cat <<'EOT' >"${SLURM_EXTERNAL_ROOT}/etc/slurm_mux.conf" +# these settings are intended for temporary debugging purposes only; leaving +# them enabled will write files for each job to a shared NFS directory without +# any automated cleanup +DEBUG_SLURM_MUX_LOG_DIR=/opt/apps/adm/slurm/logs +DEBUG_SLURM_MUX_ENABLE_LOG=no +EOT +fi + +# create epilog symbolic link +if [ ! -L "${SLURM_EXTERNAL_ROOT}/slurm_epilog" ]; then + cd ${SLURM_EXTERNAL_ROOT} + # delete existing file if necessary + rm -f slurm_epilog + ln -s ${SLURM_MUX_FILE} slurm_epilog + cd - >/dev/null +fi + +# create prolog symbolic link +if [ ! -L "${SLURM_EXTERNAL_ROOT}/slurm_prolog" ]; then + cd ${SLURM_EXTERNAL_ROOT} + # delete existing file if necessary + rm -f slurm_prolog + ln -s ${SLURM_MUX_FILE} slurm_prolog + cd - >/dev/null +fi diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf new file mode 100644 index 0000000000..896c17dc36 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -0,0 +1,297 @@ +/** + * Copyright (C) SchedMD LLC. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + scripts_dir = abspath("${path.module}/../../../../scripts") + + bucket_dir = coalesce(var.bucket_dir, format("%s-files", var.slurm_cluster_name)) +} + +######## +# DATA # +######## + +data "google_storage_bucket" "this" { + name = var.bucket_name +} + +########## +# RANDOM # +########## + +resource "random_uuid" "cluster_id" { +} + +################## +# CLUSTER CONFIG # +################## + +locals { + config = { + enable_slurm_gcp_plugins = var.enable_slurm_gcp_plugins + enable_bigquery_load = var.enable_bigquery_load + cloudsql_secret = var.cloudsql_secret + cluster_id = random_uuid.cluster_id.result + project = var.project_id + slurm_cluster_name = var.slurm_cluster_name + bucket_path = local.bucket_path + enable_debug_logging = var.enable_debug_logging + extra_logging_flags = var.extra_logging_flags + + # storage + disable_default_mounts = var.disable_default_mounts + network_storage = var.network_storage + login_network_storage = var.enable_hybrid ? null : var.login_network_storage + + # timeouts + controller_startup_scripts_timeout = var.enable_hybrid ? null : var.controller_startup_scripts_timeout + compute_startup_scripts_timeout = var.compute_startup_scripts_timeout + login_startup_scripts_timeout = var.enable_hybrid ? 
null : var.login_startup_scripts_timeout + munge_mount = local.munge_mount + + # slurm conf + prolog_scripts = [for k, v in google_storage_bucket_object.prolog_scripts : k] + epilog_scripts = [for k, v in google_storage_bucket_object.epilog_scripts : k] + cloud_parameters = var.cloud_parameters + partitions = local.partitions + nodeset = local.nodeset + nodeset_dyn = local.nodeset_dyn + nodeset_tpu = local.nodeset_tpu + + # hybrid + hybrid = var.enable_hybrid + google_app_cred_path = var.enable_hybrid ? local.google_app_cred_path : null + output_dir = var.enable_hybrid ? local.output_dir : null + install_dir = var.enable_hybrid ? local.install_dir : null + slurm_control_host = var.enable_hybrid ? var.slurm_control_host : null + slurm_control_host_port = var.enable_hybrid ? local.slurm_control_host_port : null + slurm_control_addr = var.enable_hybrid ? var.slurm_control_addr : null + slurm_bin_dir = var.enable_hybrid ? local.slurm_bin_dir : null + slurm_log_dir = var.enable_hybrid ? local.slurm_log_dir : null + + # config files templates + slurmdbd_conf_tpl = file(coalesce(var.slurmdbd_conf_tpl, "${local.etc_dir}/slurmdbd.conf.tpl")) + slurm_conf_tpl = file(coalesce(var.slurm_conf_tpl, "${local.etc_dir}/slurm.conf.tpl")) + cgroup_conf_tpl = file(coalesce(var.cgroup_conf_tpl, "${local.etc_dir}/cgroup.conf.tpl")) + jobsubmit_lua_tpl = file(coalesce(var.job_submit_lua_tpl, "${local.etc_dir}/job_submit.lua.tpl")) + + # Providers + endpoint_versions = var.endpoint_versions + } + + config_yaml = "config.yaml" + config_yaml_bucket = format("%s/%s", local.bucket_dir, local.config_yaml) + + partitions = { for p in var.partitions[*].partition : p.partition_name => p } + + nodeset = { for n in var.nodeset[*].nodeset : n.nodeset_name => n } + nodeset_dyn = { for n in var.nodeset_dyn[*].nodeset : n.nodeset_name => n } + nodeset_tpu = { for n in var.nodeset_tpu[*].nodeset : n.nodeset_name => n } + + x_nodeset = toset([for k, v in local.nodeset : v.nodeset_name]) + x_nodeset_dyn = toset([for k, v in local.nodeset_dyn : v.nodeset_name]) + x_nodeset_tpu = toset([for k, v in local.nodeset_tpu : v.nodeset_name]) + x_nodeset_overlap = setintersection([], local.x_nodeset, local.x_nodeset_dyn, local.x_nodeset_tpu) + + etc_dir = abspath("${path.module}/../../../../etc") + + bucket_path = format("%s/%s", data.google_storage_bucket.this.url, local.bucket_dir) + + slurm_control_host_port = coalesce(var.slurm_control_host_port, "6818") + + google_app_cred_path = var.google_app_cred_path != null ? abspath(var.google_app_cred_path) : null + slurm_bin_dir = var.slurm_bin_dir != null ? abspath(var.slurm_bin_dir) : null + slurm_log_dir = var.slurm_log_dir != null ? abspath(var.slurm_log_dir) : null + + munge_mount = var.enable_hybrid ? { + server_ip = lookup(var.munge_mount, "server_ip", coalesce(var.slurm_control_addr, var.slurm_control_host)) + remote_mount = lookup(var.munge_mount, "remote_mount", "/etc/munge/") + fs_type = lookup(var.munge_mount, "fs_type", "nfs") + mount_options = lookup(var.munge_mount, "mount_options", "") + } : null + + output_dir = can(coalesce(var.output_dir)) ? abspath(var.output_dir) : abspath(".") + install_dir = can(coalesce(var.install_dir)) ? 
abspath(var.install_dir) : local.output_dir +} + +resource "google_storage_bucket_object" "config" { + bucket = data.google_storage_bucket.this.name + name = local.config_yaml_bucket + content = yamlencode(local.config) +} + +######### +# DEVEL # +######### + +locals { + build_dir = abspath("${path.module}/../../../../build") + + slurm_gcp_devel_zip = "slurm-gcp-devel.zip" + slurm_gcp_devel_zip_bucket = format("%s/%s", local.bucket_dir, local.slurm_gcp_devel_zip) +} + +data "archive_file" "slurm_gcp_devel_zip" { + count = var.enable_devel ? 1 : 0 + + output_path = "${local.build_dir}/${local.slurm_gcp_devel_zip}" + type = "zip" + source_dir = local.scripts_dir + + excludes = flatten([ + "config.yaml", + "Pipfile", + fileset(local.scripts_dir, "__pycache__/*"), + fileset(local.scripts_dir, "*.log"), + fileset(local.scripts_dir, "*.cache"), + fileset(local.scripts_dir, "*.lock"), + ]) + +} + +resource "google_storage_bucket_object" "devel" { + count = var.enable_devel ? 1 : 0 + + bucket = var.bucket_name + name = local.slurm_gcp_devel_zip_bucket + source = data.archive_file.slurm_gcp_devel_zip[0].output_path +} + + +########### +# SCRIPTS # +########### + +resource "google_storage_bucket_object" "controller_startup_scripts" { + for_each = { + for x in local.controller_startup_scripts + : replace(basename(x.filename), "/[^a-zA-Z0-9-_]/", "_") => x + } + + bucket = var.bucket_name + name = format("%s/slurm-controller-script-%s", local.bucket_dir, each.key) + content = each.value.content +} + +resource "google_storage_bucket_object" "compute_startup_scripts" { + for_each = { + for x in var.compute_startup_scripts + : replace(basename(x.filename), "/[^a-zA-Z0-9-_]/", "_") => x + } + + bucket = var.bucket_name + name = format("%s/slurm-compute-script-%s", local.bucket_dir, each.key) + content = each.value.content +} + +resource "google_storage_bucket_object" "nodeset_startup_scripts" { + for_each = { for x in flatten([ + for nodeset, scripts in var.nodeset_startup_scripts + : [for s in scripts + : { + content = s.content, + name = format("slurm-nodeset-%s-script-%s", nodeset, replace(basename(s.filename), "/[^a-zA-Z0-9-_]/", "_")) } + ]]) : x.name => x.content } + + bucket = var.bucket_name + name = format("%s/%s", local.bucket_dir, each.key) + content = each.value +} + +resource "google_storage_bucket_object" "login_startup_scripts" { + for_each = { + for x in var.login_startup_scripts + : replace(basename(x.filename), "/[^a-zA-Z0-9-_]/", "_") => x + } + + bucket = var.bucket_name + name = format("%s/slurm-login-script-%s", local.bucket_dir, each.key) + content = each.value.content +} + +resource "google_storage_bucket_object" "prolog_scripts" { + for_each = { + for x in local.prolog_scripts + : replace(basename(x.filename), "/[^a-zA-Z0-9-_]/", "_") => x + } + + bucket = var.bucket_name + name = format("%s/slurm-prolog-script-%s", local.bucket_dir, each.key) + content = each.value.content + source = each.value.source +} + +resource "google_storage_bucket_object" "epilog_scripts" { + for_each = { + for x in local.epilog_scripts + : replace(basename(x.filename), "/[^a-zA-Z0-9-_]/", "_") => x + } + + bucket = var.bucket_name + name = format("%s/slurm-epilog-script-%s", local.bucket_dir, each.key) + content = each.value.content + source = each.value.source +} + +################################ +# DATA: EXTERNAL PROLOG/EPILOG # +################################ + +data "local_file" "external_epilog" { + filename = "${path.module}/files/external_epilog.sh" +} + +data "local_file" "external_prolog" { 
+ filename = "${path.module}/files/external_prolog.sh" +} + +data "local_file" "setup_external" { + filename = "${path.module}/files/setup_external.sh" +} + +locals { + checksum = md5(join("", flatten([ + google_storage_bucket_object.config.md5hash, + [for f in google_storage_bucket_object.devel : f.md5hash], + [for k, f in google_storage_bucket_object.controller_startup_scripts : f.md5hash], + [for k, f in google_storage_bucket_object.compute_startup_scripts : f.md5hash], + [for k, f in google_storage_bucket_object.nodeset_startup_scripts : f.md5hash], + [for k, f in google_storage_bucket_object.login_startup_scripts : f.md5hash], + [for k, f in google_storage_bucket_object.prolog_scripts : f.md5hash], + [for k, f in google_storage_bucket_object.epilog_scripts : f.md5hash] + ]))) + + external_epilog = [{ + filename = "z_external_epilog.sh" + content = data.local_file.external_epilog.content + source = null + }] + external_prolog = [{ + filename = "z_external_prolog.sh" + content = data.local_file.external_prolog.content + source = null + }] + setup_external = [{ + filename = "z_setup_external.sh" + content = data.local_file.setup_external.content + }] + + prolog_scripts = var.enable_external_prolog_epilog ? concat(local.external_prolog, var.prolog_scripts) : var.prolog_scripts + epilog_scripts = var.enable_external_prolog_epilog ? concat(local.external_epilog, var.epilog_scripts) : var.epilog_scripts + controller_startup_scripts = var.enable_external_prolog_epilog ? concat(local.setup_external, var.controller_startup_scripts) : var.controller_startup_scripts + + +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/outputs.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/outputs.tf new file mode 100644 index 0000000000..3b680b50a7 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/outputs.tf @@ -0,0 +1,60 @@ +/** + * Copyright (C) SchedMD LLC. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +output "slurm_bucket_path" { + description = "GCS Bucket URI of Slurm cluster file storage." + value = local.bucket_path +} + +output "config" { + description = "Cluster configuration." + value = local.config + + precondition { + condition = var.enable_hybrid ? can(coalesce(var.slurm_control_host)) : true + error_message = "Input slurm_control_host is required." + } + + precondition { + condition = length(local.x_nodeset_overlap) == 0 + error_message = "All nodeset names must be unique among all nodeset types." + } +} + +output "partitions" { + description = "Cluster partitions." + value = lookup(local.config, "partitions", null) +} + +output "nodeset" { + description = "Cluster nodesets." + value = lookup(local.config, "nodeset", null) +} + +output "nodeset_dyn" { + description = "Cluster nodesets (dynamic)." + value = lookup(local.config, "nodeset_dyn", null) +} + +output "nodeset_tpu" { + description = "Cluster nodesets (TPU)." 
+ value = lookup(local.config, "nodeset_tpu", null) +} + +output "checksum" { + description = "Checksum of all files written to the bucket." + value = local.checksum +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py new file mode 100755 index 0000000000..0dd81d2923 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -0,0 +1,499 @@ +#!/usr/bin/env python3 + +# Copyright (C) SchedMD LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Iterable, Dict +from itertools import chain +from collections import defaultdict +import json +from pathlib import Path +import util +from util import dirs, slurmdirs + +FILE_PREAMBLE = """ +# Warning: +# This file is managed by a script. Manual modifications will be overwritten. +""" + +login_nodeset = "x-login" + + +def dict_to_conf(conf, delim=" ") -> str: + """convert dict to delimited slurm-style key-value pairs""" + + def filter_conf(pair): + k, v = pair + if isinstance(v, list): + v = ",".join(el for el in v if el is not None) + return k, (v if bool(v) or v == 0 else None) + + return delim.join( + f"{k}={v}" for k, v in map(filter_conf, conf.items()) if v is not None + ) + + +def conflines(cloud_parameters, lkp: util.Lookup) -> str: + scripts_dir = lkp.cfg.install_dir or dirs.scripts + no_comma_params = cloud_parameters.no_comma_params or False + + any_gpus = any( + lkp.template_info(nodeset.instance_template).gpu_count > 0 + for nodeset in lkp.cfg.nodeset.values() + ) + + any_tpu = any( + tpu_nodeset is not None + for part in lkp.cfg.partitions.values() + for tpu_nodeset in part.partition_nodeset_tpu + ) + + any_dynamic = any(bool(p.partition_feature) for p in lkp.cfg.partitions.values()) + comma_params = { + "PrivateData": [ + "cloud", + ], + "LaunchParameters": [ + "enable_nss_slurm", + "use_interactive_step", + ], + "SlurmctldParameters": [ + "cloud_reg_addrs" if any_dynamic or any_tpu else "cloud_dns", + "enable_configless", + "idle_on_node_suspend", + ], + "SchedulerParameters": [ + "bf_continue", + "salloc_wait_nodes", + "ignore_prefer_validation", + ], + "GresTypes": [ + "gpu" if any_gpus else None, + ], + } + prolog_path = Path(dirs.custom_scripts / "prolog.d") + epilog_path = Path(dirs.custom_scripts / "epilog.d") + default_tree_width = 65533 if any_dynamic else None + conf_options = { + **(comma_params if not no_comma_params else {}), + "Prolog": f"{prolog_path}/*" if lkp.cfg.prolog_scripts else None, + "Epilog": f"{epilog_path}/*" if lkp.cfg.epilog_scripts else None, + "SuspendProgram": f"{scripts_dir}/suspend.py", + "ResumeProgram": f"{scripts_dir}/resume.py", + "ResumeFailProgram": f"{scripts_dir}/suspend.py", + "ResumeRate": cloud_parameters.get("resume_rate", 0), + "ResumeTimeout": cloud_parameters.get("resume_timeout", 300), + "SuspendRate": cloud_parameters.get("suspend_rate", 0), 
+ "SuspendTimeout": cloud_parameters.get("suspend_timeout", 300), + "TreeWidth": cloud_parameters.get("tree_width", default_tree_width), + "JobSubmitPlugins": "lua" if any_tpu else None, + "TopologyPlugin": cloud_parameters.get("topology_plugin", "topology/tree"), + } + return dict_to_conf(conf_options, delim="\n") + + +def loginlines() -> str: + nodeset = { + "NodeSet": login_nodeset, + "Feature": login_nodeset, + } + partition = { + "PartitionName": login_nodeset, + "Nodes": login_nodeset, + "State": "UP", + "DefMemPerCPU": 1, + "Hidden": "YES", + "RootOnly": "YES", + } + lines = [ + dict_to_conf(nodeset), + dict_to_conf(partition), + ] + return "\n".join(lines) + + +def nodeset_lines(nodeset, lkp: util.Lookup) -> str: + template_info = lkp.template_info(nodeset.instance_template) + machine_conf = lkp.template_machine_conf(nodeset.instance_template) + + # follow https://slurm.schedmd.com/slurm.conf.html#OPT_Boards + # by setting Boards, SocketsPerBoard, CoresPerSocket, and ThreadsPerCore + node_def = { + "NodeName": "DEFAULT", + "State": "UNKNOWN", + "RealMemory": machine_conf.memory, + "Boards": machine_conf.boards, + "SocketsPerBoard": machine_conf.sockets_per_board, + "CoresPerSocket": machine_conf.cores_per_socket, + "ThreadsPerCore": machine_conf.threads_per_core, + "CPUs": machine_conf.cpus, + **nodeset.node_conf, + } + + gres = f"gpu:{template_info.gpu_count}" if template_info.gpu_count else None + nodelist = lkp.nodelist(nodeset) + + return "\n".join( + map( + dict_to_conf, + [ + node_def, + {"NodeName": nodelist, "State": "CLOUD", "Gres": gres}, + {"NodeSet": nodeset.nodeset_name, "Nodes": nodelist}, + ], + ) + ) + + +def nodeset_tpu_lines(nodeset, lkp: util.Lookup) -> str: + node_def = { + "NodeName": "DEFAULT", + "State": "UNKNOWN", + **nodeset.node_conf, + } + nodelist = lkp.nodelist(nodeset) + + return "\n".join( + map( + dict_to_conf, + [ + node_def, + {"NodeName": nodelist, "State": "CLOUD"}, + {"NodeSet": nodeset.nodeset_name, "Nodes": nodelist}, + ], + ) + ) + + +def nodeset_dyn_lines(nodeset): + """generate slurm NodeSet definition for dynamic nodeset""" + return dict_to_conf( + {"NodeSet": nodeset.nodeset_name, "Feature": nodeset.nodeset_feature} + ) + + +def partitionlines(partition, lkp: util.Lookup) -> str: + """Make a partition line for the slurm.conf""" + MIN_MEM_PER_CPU = 100 + + def defmempercpu(nodeset: str) -> int: + template = lkp.cfg.nodeset.get(nodeset).instance_template + machine = lkp.template_machine_conf(template) + return max(MIN_MEM_PER_CPU, machine.memory // machine.cpus) + + defmem = min( + map(defmempercpu, partition.partition_nodeset), default=MIN_MEM_PER_CPU + ) + + nodesets = list( + chain( + partition.partition_nodeset, + partition.partition_nodeset_dyn, + partition.partition_nodeset_tpu, + ) + ) + + is_tpu = len(partition.partition_nodeset_tpu) > 0 + is_dyn = len(partition.partition_nodeset_dyn) > 0 + + oversub_exlusive = partition.enable_job_exclusive or is_tpu + power_down_on_idle = partition.enable_job_exclusive and not is_dyn + + line_elements = { + "PartitionName": partition.partition_name, + "Nodes": ",".join(nodesets), + "State": "UP", + "DefMemPerCPU": defmem, + "SuspendTime": 300, + "Oversubscribe": "Exclusive" if oversub_exlusive else None, + "PowerDownOnIdle": "YES" if power_down_on_idle else None, + **partition.partition_conf, + } + + return dict_to_conf(line_elements) + + +def suspend_exc_lines(lkp: util.Lookup) -> Iterable[str]: + static_nodelists = [] + for ns in lkp.power_managed_nodesets(): + if ns.node_count_static: + 
nodelist = lkp.nodelist_range(ns.nodeset_name, 0, ns.node_count_static) + static_nodelists.append(nodelist) + suspend_exc_nodes = {"SuspendExcNodes": static_nodelists} + + dyn_parts = [ + p.partition_name + for p in lkp.cfg.partitions.values() + if len(p.partition_nodeset_dyn) > 0 + ] + suspend_exc_parts = {"SuspendExcParts": [login_nodeset, *dyn_parts]} + + return filter( + None, + [ + dict_to_conf(suspend_exc_nodes) if static_nodelists else None, + dict_to_conf(suspend_exc_parts), + ], + ) + + +def make_cloud_conf(lkp: util.Lookup) -> str: + """generate cloud.conf snippet""" + lines = [ + FILE_PREAMBLE, + conflines(lkp.cfg.cloud_parameters, lkp), + loginlines(), + *(nodeset_lines(n, lkp) for n in lkp.cfg.nodeset.values()), + *(nodeset_dyn_lines(n) for n in lkp.cfg.nodeset_dyn.values()), + *(nodeset_tpu_lines(n, lkp) for n in lkp.cfg.nodeset_tpu.values()), + *(partitionlines(p, lkp) for p in lkp.cfg.partitions.values()), + *(suspend_exc_lines(lkp)), + ] + return "\n\n".join(filter(None, lines)) + + +def gen_cloud_conf(lkp: util.Lookup) -> None: + content = make_cloud_conf(lkp) + + conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud.conf" + conf_file.write_text(content) + util.chown_slurm(conf_file, mode=0o644) + + +def install_slurm_conf(lkp: util.Lookup) -> None: + """install slurm.conf""" + if lkp.cfg.ompi_version: + mpi_default = "pmi2" + else: + mpi_default = "none" + + conf_options = { + "name": lkp.cfg.slurm_cluster_name, + "control_addr": lkp.control_addr if lkp.control_addr else lkp.hostname_fqdn, + "control_host": lkp.control_host, + "control_host_port": lkp.control_host_port, + "scripts": dirs.scripts, + "slurmlog": dirs.log, + "state_save": slurmdirs.state, + "mpi_default": mpi_default, + } + + conf = lkp.cfg.slurm_conf_tpl.format(**conf_options) + + conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "slurm.conf" + conf_file.write_text(conf) + util.chown_slurm(conf_file, mode=0o644) + + +def install_slurmdbd_conf(lkp: util.Lookup) -> None: + """install slurmdbd.conf""" + conf_options = { + "control_host": lkp.control_host, + "slurmlog": dirs.log, + "state_save": slurmdirs.state, + "db_name": "slurm_acct_db", + "db_user": "slurm", + "db_pass": '""', + "db_host": "localhost", + "db_port": "3306", + } + + if lkp.cfg.cloudsql_secret: + secret_name = f"{lkp.cfg.slurm_cluster_name}-slurm-secret-cloudsql" + payload = json.loads(util.access_secret_version(lkp.project, secret_name)) + + if payload["db_name"] and payload["db_name"] != "": + conf_options["db_name"] = payload["db_name"] + if payload["user"] and payload["user"] != "": + conf_options["db_user"] = payload["user"] + if payload["password"] and payload["password"] != "": + conf_options["db_pass"] = payload["password"] + + db_host_str = payload["server_ip"].split(":") + if db_host_str[0]: + conf_options["db_host"] = db_host_str[0] + conf_options["db_port"] = ( + db_host_str[1] if len(db_host_str) >= 2 else "3306" + ) + + conf = lkp.cfg.slurmdbd_conf_tpl.format(**conf_options) + + conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "slurmdbd.conf" + conf_file.write_text(conf) + util.chown_slurm(conf_file, 0o600) + + +def install_cgroup_conf(lkp: util.Lookup) -> None: + """install cgroup.conf""" + conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cgroup.conf" + conf_file.write_text(lkp.cfg.cgroup_conf_tpl) + util.chown_slurm(conf_file, mode=0o600) + + +def install_jobsubmit_lua(lkp: util.Lookup) -> None: + """install job_submit.lua if there are tpu nodes in the cluster""" + if any( + tpu_nodeset is not 
None + for part in lkp.cfg.partitions.values() + for tpu_nodeset in part.partition_nodeset_tpu + ): + conf_options = { + "scripts_dir": lkp.cfg.slurm_scripts_dir or dirs.scripts, + } + conf = lkp.cfg.jobsubmit_lua_tpl.format(**conf_options) + + conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "job_submit.lua" + conf_file.write_text(conf) + util.chown_slurm(conf_file, 0o600) + + +def gen_cloud_gres_conf(lkp: util.Lookup) -> None: + """generate cloud_gres.conf""" + + gpu_nodes = defaultdict(list) + for nodeset in lkp.cfg.nodeset.values(): + template_info = lkp.template_info(nodeset.instance_template) + gpu_count = template_info.gpu_count + if gpu_count == 0: + continue + gpu_nodes[gpu_count].append(lkp.nodelist(nodeset)) + + lines = [ + dict_to_conf( + { + "NodeName": names, + "Name": "gpu", + "File": "/dev/nvidia{}".format(f"[0-{i-1}]" if i > 1 else "0"), + } + ) + for i, names in gpu_nodes.items() + ] + lines.append("\n") + content = FILE_PREAMBLE + "\n".join(lines) + + conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud_gres.conf" + conf_file.write_text(content) + util.chown_slurm(conf_file, mode=0o600) + + +def install_gres_conf(lkp: util.Lookup) -> None: + conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud_gres.conf" + gres_conf = Path(lkp.cfg.output_dir or slurmdirs.etc) / "gres.conf" + if not gres_conf.exists(): + gres_conf.symlink_to(conf_file) + util.chown_slurm(gres_conf, mode=0o600) + + +class Switch: + """ + Represents a switch in the topology.conf file. + NOTE: It's class user job to make sure that there is no leaf-less Switches in the tree + """ + + def __init__( + self, + name: str, + nodes: Optional[Iterable[str]] = None, + switches: Optional[Dict[str, "Switch"]] = None, + ): + self.name = name + self.nodes = nodes or [] + self.switches = switches or {} + + def conf_line(self) -> str: + d = {"SwitchName": self.name} + if self.nodes: + d["Nodes"] = util.to_hostlist_fast(self.nodes) + if self.switches: + d["Switches"] = util.to_hostlist_fast(self.switches.keys()) + return dict_to_conf(d) + + def render_conf_lines(self) -> Iterable[str]: + yield self.conf_line() + for s in sorted(self.switches.values(), key=lambda s: s.name): + yield from s.render_conf_lines() + + +class TopologyBuilder: + def __init__(self) -> None: + self._r = Switch("root") + + def add(self, path: List[str], nodes: Iterable[str]) -> None: + n = self._r + assert path + for p in path: + n = n.switches.setdefault(p, Switch(p)) + n.nodes = chain(n.nodes, nodes) + + def render_conf_lines(self) -> Iterable[str]: + if not self._r.switches: + return [] + for s in sorted(self._r.switches.values(), key=lambda s: s.name): + yield from s.render_conf_lines() + + +def add_tpu_nodeset_topology(nodeset: object, bldr: TopologyBuilder, lkp: util.Lookup): + tpuobj = util.TPU(nodeset) + static, dynamic = lkp.nodenames(nodeset) + + pref = ["nodeset_tpu-root", nodeset.nodeset_name] + if tpuobj.vmcount == 1: # Put all nodes in one switch + bldr.add(pref, list(chain(static, dynamic))) + return + + # Chunk nodes into sub-switches of size `vmcount` + chunk_num = 0 + for nodenames in (static, dynamic): + for nodeschunk in util.chunked(nodenames, n=tpuobj.vmcount): + chunk_name = f"{nodeset.nodeset_name}-{chunk_num}" + chunk_num += 1 + bldr.add([*pref, chunk_name], list(nodeschunk)) + + +def add_nodeset_topology( + nodeset: object, bldr: TopologyBuilder, lkp: util.Lookup +) -> None: + path = ["nodeset-root", nodeset.nodeset_name] + nodes = list(chain(*lkp.nodenames(nodeset))) + bldr.add(path, nodes) + + 
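+# Illustrative sketch only (not executed): the Switch and TopologyBuilder
+# helpers above produce a two-level topology tree. For a cluster with regular
+# nodesets "n0" and "n1", gen_topology() below renders cloud_topology.conf
+# lines roughly like:
+#
+#   SwitchName=nodeset-root Switches=n[0-1]
+#   SwitchName=n0 Nodes=<hostlist of n0 nodes>
+#   SwitchName=n1 Nodes=<hostlist of n1 nodes>
+#
+# TPU nodesets are grouped under "nodeset_tpu-root" instead, with one
+# sub-switch per chunk of `vmcount` VMs.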
+def gen_topology(lkp: util.Lookup) -> TopologyBuilder: + bldr = TopologyBuilder() + for ns in lkp.cfg.nodeset_tpu.values(): + add_tpu_nodeset_topology(ns, bldr, lkp) + for ns in lkp.cfg.nodeset.values(): + add_nodeset_topology(ns, bldr, lkp) + return bldr + + +def gen_topology_conf(lkp: util.Lookup) -> None: + """generate slurm topology.conf from config.yaml""" + bldr = gen_topology(lkp) + conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud_topology.conf" + with open(conf_file, "w") as f: + f.writelines(FILE_PREAMBLE + "\n") + for line in bldr.render_conf_lines(): + f.write(line) + f.write("\n") + f.write("\n") + util.chown_slurm(conf_file, mode=0o600) + + +def install_topology_conf(lkp: util.Lookup) -> None: + conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud_topology.conf" + topo_conf = Path(lkp.cfg.output_dir or slurmdirs.etc) / "topology.conf" + if not topo_conf.exists(): + topo_conf.symlink_to(conf_file) + util.chown_slurm(conf_file, mode=0o600) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py new file mode 100755 index 0000000000..70dfa04d81 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -0,0 +1,329 @@ +#!/usr/bin/env python3 + +import argparse +import os +import shelve +import uuid +from collections import namedtuple +from datetime import datetime, timezone, timedelta +from pathlib import Path +from pprint import pprint + +from google.cloud.bigquery import SchemaField +from google.cloud import bigquery as bq +from google.api_core import retry, exceptions + +import util +from util import run +from util import cfg + + +SACCT = "sacct" +script = Path(__file__).resolve() + +DEFAULT_TIMESTAMP_FILE = script.parent / "bq_timestamp" +timestamp_file = Path(os.environ.get("TIMESTAMP_FILE", DEFAULT_TIMESTAMP_FILE)) + +# cluster_id_file = script.parent / 'cluster_uuid' +# try: +# cluster_id = cluster_id_file.read_text().rstrip() +# except FileNotFoundError: +# cluster_id = uuid.uuid4().hex +# cluster_id_file.write_text(cluster_id) + +job_idx_cache_path = script.parent / "bq_job_idx_cache" + +SLURM_TIME_FORMAT = r"%Y-%m-%dT%H:%M:%S" + + +def make_datetime(time_string): + return datetime.strptime(time_string, SLURM_TIME_FORMAT).replace( + tzinfo=timezone.utc + ) + + +def make_time_interval(seconds): + sign = 1 + if seconds < 0: + sign = -1 + seconds = abs(seconds) + d, r = divmod(seconds, 60 * 60 * 24) + h, r = divmod(r, 60 * 60) + m, s = divmod(r, 60) + d *= sign + h *= sign + return f"{d}D {h:02}:{m:02}:{s}" + + +converters = { + "DATETIME": make_datetime, + "INTERVAL": make_time_interval, + "STRING": str, + "INT64": lambda n: int(n or 0), +} + + +def schema_field(field_name, data_type, description, required=False): + return SchemaField( + field_name, + data_type, + description=description, + mode="REQUIRED" if required else "NULLABLE", + ) + + +schema_fields = [ + schema_field("cluster_name", "STRING", "cluster name", required=True), + schema_field("cluster_id", "STRING", "UUID for the cluster", required=True), + schema_field("entry_uuid", "STRING", "entry UUID for the job row", required=True), + schema_field( + "job_db_uuid", "INT64", "job db index from the slurm database", required=True + ), + schema_field("job_id_raw", "INT64", "raw job id", required=True), + schema_field("job_id", "STRING", "job id", required=True), + 
schema_field("state", "STRING", "final job state", required=True), + schema_field("job_name", "STRING", "job name"), + schema_field("partition", "STRING", "job partition"), + schema_field("submit_time", "DATETIME", "job submit time"), + schema_field("start_time", "DATETIME", "job start time"), + schema_field("end_time", "DATETIME", "job end time"), + schema_field("elapsed_raw", "INT64", "STRING", "job run time in seconds"), + # schema_field("elapsed_time", "INTERVAL", "STRING", "job run time interval"), + schema_field("timelimit_raw", "STRING", "job timelimit in minutes"), + schema_field("timelimit", "STRING", "job timelimit"), + # schema_field("num_tasks", "INT64", "number of allocated tasks in job"), + schema_field("nodelist", "STRING", "names of nodes allocated to job"), + schema_field("user", "STRING", "user responsible for job"), + schema_field("uid", "INT64", "uid of job user"), + schema_field("group", "STRING", "group of job user"), + schema_field("gid", "INT64", "gid of job user"), + schema_field("wckey", "STRING", "job wckey"), + schema_field("qos", "STRING", "job qos"), + schema_field("comment", "STRING", "job comment"), + schema_field("admin_comment", "STRING", "job admin comment"), + # extra will be added in 23.02 + # schema_field("extra", "STRING", "job extra field"), + schema_field("exitcode", "STRING", "job exit code"), + schema_field("alloc_cpus", "INT64", "count of allocated CPUs"), + schema_field("alloc_nodes", "INT64", "number of nodes allocated to job"), + schema_field("alloc_tres", "STRING", "allocated trackable resources (TRES)"), + # schema_field("system_cpu", "INTERVAL", "cpu time used by parent processes"), + # schema_field("cpu_time", "INTERVAL", "CPU time used (elapsed * cpu count)"), + schema_field("cpu_time_raw", "INT64", "CPU time used (elapsed * cpu count)"), + # schema_field("ave_cpu", "INT64", "Average CPU time of all tasks in job"), + # schema_field( + # "tres_usage_tot", + # "STRING", + # "Tres total usage by all tasks in job", + # ), +] + + +slurm_field_map = { + "job_db_uuid": "DBIndex", + "job_id_raw": "JobIDRaw", + "job_id": "JobID", + "state": "State", + "job_name": "JobName", + "partition": "Partition", + "submit_time": "Submit", + "start_time": "Start", + "end_time": "End", + "elapsed_raw": "ElapsedRaw", + "elapsed_time": "Elapsed", + "timelimit_raw": "TimelimitRaw", + "timelimit": "Timelimit", + "num_tasks": "NTasks", + "nodelist": "Nodelist", + "user": "User", + "uid": "Uid", + "group": "Group", + "gid": "Gid", + "wckey": "Wckey", + "qos": "Qos", + "comment": "Comment", + "admin_comment": "AdminComment", + # "extra": "Extra", + "exit_code": "ExitCode", + "alloc_cpus": "AllocCPUs", + "alloc_nodes": "AllocNodes", + "alloc_tres": "AllocTres", + "system_cpu": "SystemCPU", + "cpu_time": "CPUTime", + "cpu_time_raw": "CPUTimeRaw", + "ave_cpu": "AveCPU", + "tres_usage_tot": "TresUsageInTot", +} + +# new field name is the key for job_schema. 
Used to lookup the datatype when +# creating the job rows +job_schema = {field.name: field for field in schema_fields} +# Order is important here, as that is how they are parsed from sacct output +Job = namedtuple("Job", job_schema.keys()) + +client = bq.Client( + project=cfg.project, + credentials=util.default_credentials(), + client_options=util.create_client_options(util.ApiEndpoint.BQ), +) +dataset_id = f"{cfg.slurm_cluster_name}_job_data" +dataset = bq.DatasetReference(project=cfg.project, dataset_id=dataset_id) +table = bq.Table( + bq.TableReference(dataset, f"jobs_{cfg.slurm_cluster_name}"), schema_fields +) + + +class JobInsertionFailed(Exception): + pass + + +def make_job_row(job): + job_row = { + field_name: dict.get(converters, field.field_type)(job[field_name]) + for field_name, field in job_schema.items() + if field_name in job + } + job_row["entry_uuid"] = uuid.uuid4().hex + job_row["cluster_id"] = cfg.cluster_id + job_row["cluster_name"] = cfg.slurm_cluster_name + return job_row + + +def load_slurm_jobs(start, end): + states = ",".join( + ( + "BOOT_FAIL", + "CANCELLED", + "COMPLETED", + "DEADLINE", + "FAILED", + "NODE_FAIL", + "OUT_OF_MEMORY", + "PREEMPTED", + "REQUEUED", + "REVOKED", + "TIMEOUT", + ) + ) + start_iso = start.isoformat(timespec="seconds") + end_iso = end.isoformat(timespec="seconds") + # slurm_fields and bq_fields will be in matching order + slurm_fields = ",".join(slurm_field_map.values()) + bq_fields = slurm_field_map.keys() + cmd = ( + f"{SACCT} --start {start_iso} --end {end_iso} -X -D --format={slurm_fields} " + f"--state={states} --parsable2 --noheader --allusers --duplicates" + ) + text = run(cmd).stdout.splitlines() + # zip pairs bq_fields with the value from sacct + jobs = [dict(zip(bq_fields, line.split("|"))) for line in text] + + # The job index cache allows us to avoid sending duplicate jobs. This avoids a race condition with updating the database. + with shelve.open(str(job_idx_cache_path), flag="r") as job_idx_cache: + job_rows = [ + make_job_row(job) + for job in jobs + if str(job["job_db_uuid"]) not in job_idx_cache + ] + return job_rows + + +def init_table(): + global dataset + global table + dataset = client.create_dataset(dataset, exists_ok=True) + table = client.create_table(table, exists_ok=True) + until_found = retry.Retry(predicate=retry.if_exception_type(exceptions.NotFound)) + table = client.get_table(table, retry=until_found) + # cannot add required fields to an existing schema + table.schema = schema_fields + table = client.update_table(table, ["schema"]) + + +def purge_job_idx_cache(): + purge_time = datetime.now() - timedelta(minutes=30) + with shelve.open(str(job_idx_cache_path), writeback=True) as cache: + to_delete = [] + for idx, stamp in cache.items(): + if stamp < purge_time: + to_delete.append(idx) + for idx in to_delete: + del cache[idx] + + +def bq_submit(jobs): + try: + result = client.insert_rows(table, jobs) + except exceptions.NotFound as e: + print(f"failed to upload job data, table not yet found: {e}") + raise e + except Exception as e: + print(f"failed to upload job data: {e}") + raise e + if result: + pprint(jobs) + pprint(result) + raise JobInsertionFailed("failed to upload job data to big query") + print(f"successfully loaded {len(jobs)} jobs") + + +def get_time_window(): + if not timestamp_file.is_file(): + timestamp_file.touch() + try: + timestamp = datetime.strptime( + timestamp_file.read_text().rstrip(), SLURM_TIME_FORMAT + ) + # time window will overlap the previous by 10 minutes. 
Duplicates will be filtered out by the job_idx_cache + start = timestamp - timedelta(minutes=10) + except ValueError: + # timestamp 1 is 1 second after the epoch; timestamp 0 is special for sacct + start = datetime.fromtimestamp(1) + # end is now() truncated to the last second + end = datetime.now().replace(microsecond=0) + return start, end + + +def write_timestamp(time): + timestamp_file.write_text(time.isoformat(timespec="seconds")) + + +def update_job_idx_cache(jobs, timestamp): + with shelve.open(str(job_idx_cache_path), writeback=True) as job_idx_cache: + for job in jobs: + job_idx = str(job["job_db_uuid"]) + job_idx_cache[job_idx] = timestamp + + +def main(): + if not cfg.enable_bigquery_load: + print("bigquery load is not currently enabled") + exit(0) + init_table() + + start, end = get_time_window() + jobs = load_slurm_jobs(start, end) + # on failure, an exception will cause the timestamp not to be rewritten. So + # it will try again next time. If some writes succeed, we don't currently + # have a way to not submit duplicates next time. + if jobs: + bq_submit(jobs) + write_timestamp(end) + update_job_idx_cache(jobs, end) + + +parser = argparse.ArgumentParser(description="submit slurm job data to big query") +parser.add_argument( + "timestamp_file", + nargs="?", + action="store", + type=Path, + help="specify timestamp file for reading and writing the time window start. Precedence over TIMESTAMP_FILE env var.", +) + +purge_job_idx_cache() +if __name__ == "__main__": + args = parser.parse_args() + if args.timestamp_file: + timestamp_file = args.timestamp_file.resolve() + main() diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py new file mode 100755 index 0000000000..f515d52c8a --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -0,0 +1,709 @@ +#!/usr/bin/env python3 + +# Copyright (C) SchedMD LLC. +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List +import argparse +import collections +import json +import logging +import os +import sys +import yaml +from itertools import chain +from pathlib import Path + +import util +from util import ( + chunked, + dirs, + ensure_execute, + execute_with_futures, + get_insert_operations, + log_api_request, + map_with_futures, + run, + separate, + to_hostlist, + to_hostlist_fast, + trim_self_link, + wait_for_operation, +) +from util import cfg, lkp, NSDict, TPU + +# from util import cfg, lkp, NSDict +import slurm_gcp_plugins + + +filename = Path(__file__).name +LOGFILE = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log") + +log = logging.getLogger(filename) + + +global_resume_data = None + +PLACEMENT_MAX_CNT = 150 +# Placement group needs to be the same for an entire bulk_insert hence +# if placement is used the actual BULK_INSERT_LIMIT will be +# max([1000, PLACEMENT_MAX_CNT]) +BULK_INSERT_LIMIT = 5000 + + +def instance_properties(nodeset, model, placement_group, labels=None): + template = lkp.node_template(model) + template_info = lkp.template_info(template) + + props = NSDict() + + slurm_metadata = { + "slurm_cluster_name": cfg.slurm_cluster_name, + "slurm_instance_role": "compute", + "startup-script": ( + Path(cfg.slurm_scripts_dir or util.dirs.scripts) / "startup.sh" + ).read_text(), + "VmDnsSetting": "GlobalOnly", + } + info_metadata = { + item.get("key"): item.get("value") for item in template_info.metadata["items"] + } + + props_metadata = {**info_metadata, **slurm_metadata} + props.metadata = { + "items": [NSDict({"key": k, "value": v}) for k, v in props_metadata.items()] + } + + labels = { + "slurm_cluster_name": cfg.slurm_cluster_name, + "slurm_instance_role": "compute", + **(labels or {}), + } + props.labels = {**template_info.labels, **labels} + + for disk in template_info.disks: + # do not label local ssd + if ( + "diskType" not in disk.initializeParams + or disk.initializeParams.diskType == "local-ssd" + ): + continue + disk.initializeParams.labels.update(labels) + props.disks = template_info.disks + + if placement_group: + props.scheduling = { + "onHostMaintenance": "TERMINATE", + "automaticRestart": False, + } + props.resourcePolicies = [ + placement_group, + ] + + if nodeset.reservation_name: + reservation_name = nodeset.reservation_name + + zones = list(nodeset.zone_policy_allow or []) + assert len(zones) == 1, "Only single zone is supported if using a reservation" + + reservation = lkp.reservation(reservation_name, zones[0]) + + props.reservationAffinity = { + "consumeReservationType": "SPECIFIC_RESERVATION", + "key": f"compute.{util.universe_domain()}/reservation-name", + "values": [reservation_name], + } + + policies = util.reservation_resource_policies(reservation) + if policies: + props.scheduling = { + "onHostMaintenance": "TERMINATE", + "automaticRestart": False, + } + props.resourcePolicies = policies + log.info( + f"reservation {reservation_name} is being used with policies {props.resourcePolicies}" + ) + else: + props.resourcePolicies = [] + log.info( + f"reservation {reservation_name} is being used without any policies" + ) + + if nodeset.maintenance_interval: + props.scheduling = props.scheduling or {} + props.scheduling["maintenanceInterval"] = nodeset.maintenance_interval + + return props + + +def per_instance_properties(node): + props = NSDict() + # No properties beyond name are supported yet. 
+ + return props + + +def create_instances_request(nodes, partition_name, placement_group, job_id=None): + """Call regionInstances.bulkInsert to create instances""" + assert len(nodes) > 0 + if placement_group: + assert len(nodes) <= min(PLACEMENT_MAX_CNT, BULK_INSERT_LIMIT) + else: + assert len(nodes) <= BULK_INSERT_LIMIT + + # model here indicates any node that can be used to describe the rest + model = next(iter(nodes)) + nodeset = lkp.node_nodeset(model) + template = lkp.node_template(model) + region = lkp.node_region(model) + partition = cfg.partitions[partition_name] + log.debug(f"create_instances_request: {model} placement: {placement_group}") + + body = NSDict() + body.count = len(nodes) + body.minCount = 1 + + # source of instance properties + body.sourceInstanceTemplate = template + + labels = ( + dict(slurm_job_id=job_id) + if job_id is not None and partition.enable_job_exclusive + else None + ) + # overwrites properties across all instances + body.instanceProperties = instance_properties( + nodeset, model, placement_group, labels + ) + + # key is instance name, value overwrites properties + body.perInstanceProperties = {k: per_instance_properties(k) for k in nodes} + + zones = { + **{ + f"zones/{zone}": {"preference": "ALLOW"} + for zone in nodeset.zone_policy_allow or [] + }, + **{ + f"zones/{zone}": {"preference": "DENY"} + for zone in nodeset.zone_policy_deny or [] + }, + } + body.locationPolicy.targetShape = cfg.zone_target_shape or "ANY_SINGLE_ZONE" + if zones: + body.locationPolicy.locations = zones + + if lkp.cfg.enable_slurm_gcp_plugins: + slurm_gcp_plugins.pre_instance_bulk_insert( + lkp=lkp, + nodes=nodes, + placement_group=placement_group, + request_body=body, + ) + + request = util.compute.regionInstances().bulkInsert( + project=cfg.project, region=region, body=body.to_dict() + ) + + if log.isEnabledFor(logging.DEBUG): + log.debug( + f"new request: endpoint={request.methodId} nodes={to_hostlist_fast(nodes)}" + ) + log_api_request(request) + return request + + +def group_nodes_bulk(nodes, resume_data=None): + """group nodes by job_id, placement_group, node_group, and max bulkInsert size""" + if resume_data is None: + # all nodes will be considered jobless + jobs = {} + else: + jobs = {job.job_id: job for job in resume_data.jobs} + + # expand all job nodelists + for job in jobs.values(): + job.nodelist_alloc = job.nodes_alloc + job.nodes_alloc = util.to_hostnames(job.nodelist_alloc) + job.nodelist_resume = job.nodes_resume + job.nodes_resume = util.to_hostnames(job.nodelist_resume) + job.tpu = util.part_is_tpu(job.partition) + if not job.tpu: + # create placement groups if nodes for job need it + job.placement_groups = create_placement_groups( + node_list=job.nodes_alloc, + job_id=job.job_id, + ) + # placement group assignment is based on all allocated nodes, but we only want to + # handle nodes in nodes_resume in this run. 
+ for pg, pg_nodes in job.placement_groups.items(): + job.placement_groups[pg] = list( + set(pg_nodes).intersection(job.nodes_resume) + ) + # a bit of a hack, but nodes resumed using scontrol instead of through job scheduling do not have a job + jobless_nodes = list( + set(nodes).difference( + chain.from_iterable(job.nodes_resume for job in jobs.values()) + ) + ) + jobless_nodes_tpu = [] + for jobless_node in jobless_nodes[:]: + if lkp.node_is_tpu(jobless_node): + jobless_nodes.remove(jobless_node) + jobless_nodes_tpu.append(jobless_node) + + jobs["Normal_None"] = NSDict( + job_id=None, + nodes_resume=jobless_nodes, + nodes_alloc=jobless_nodes, + placement_groups=create_placement_groups(node_list=jobless_nodes), + partition=None, + tpu=False, + ) + jobs["TPU_None"] = NSDict( + job_id=None, + nodes_resume=jobless_nodes_tpu, + nodes_alloc=jobless_nodes_tpu, + partition=None, + tpu=True, + ) + + BulkChunk = collections.namedtuple( + "BulkChunk", + ["prefix", "job_id", "partition_name", "placement_group", "nodes", "i"], + ) + BulkChunkTPU = collections.namedtuple( + "BulkChunkTPU", + ["prefix", "job_id", "partition_name", "nodes", "i"], + ) + grouped_nodes = [ + BulkChunk( + prefix, + job_id if job_id != "Normal_None" else None, + jobs[job_id].partition, + placement_group, + chunk_nodes, + i, + ) + for job_id, job in jobs.items() + if not job.tpu + for placement_group, pg_nodes in job.placement_groups.items() + for prefix, nodes in util.groupby_unsorted(pg_nodes, lkp.node_prefix) + for i, chunk_nodes in enumerate(chunked(nodes, n=BULK_INSERT_LIMIT)) + ] + grouped_nodes_tpu = [ + BulkChunkTPU( + prefix, + job_id if job_id != "TPU_None" else None, + jobs[job_id].partition, + chunk_nodes, + i, + ) + for job_id, job in jobs.items() + if job.tpu + for prefix, nodes in util.groupby_unsorted(job.nodes_resume, lkp.node_prefix) + for i, chunk_nodes in enumerate(lkp.chunk_tpu_nodes(list(nodes))) + ] + + def group_name(chunk: BulkChunk): + if chunk.placement_group is not None: + return f"{chunk.prefix}:job{chunk.job_id}:{chunk.placement_group}:{chunk.i}" + if chunk.job_id is not None: + return f"{chunk.prefix}:job{chunk.job_id}:{chunk.i}" + return f"{chunk.prefix}:{chunk.i}" + + def group_name_tpu(chunk: BulkChunkTPU): + if chunk.job_id is not None: + return f"{chunk.prefix}:job{chunk.job_id}:{chunk.i}" + return f"{chunk.prefix}:{chunk.i}" + + grouped_nodes = {group_name(chunk): chunk for chunk in grouped_nodes} + grouped_nodes_tpu = {group_name_tpu(chunk): chunk for chunk in grouped_nodes_tpu} + return grouped_nodes, grouped_nodes_tpu + + +def start_tpu(data): + tpu = data["tpu"] + node = data["node"] + if len(node) == 1: + node = node[0] + log.debug( + f"Will create a TPU of type {tpu.node_type} tf_version {tpu.tf_version} in zone {tpu.zone} with name {node}" + ) + tpunode = tpu.get_node(node) + if tpunode is None: + if not tpu.create_node(nodename=node): + log.error("Error creating tpu node {node}") + else: + if tpu.preserve_tpu: + if not tpu.start_node(nodename=node): + log.error("Error starting tpu node {node}") + else: + log.info( + f"Tpu node {node} is already created, but will not start it because nodeset does not have preserve_tpu option active." 
+ ) + else: + log.debug( + f"Will create a multi-vm TPU of type {tpu.node_type} tf_version {tpu.tf_version} in zone {tpu.zone} with name {node[0]}" + ) + if not tpu.create_node(nodename=node): + log.error("Error creating tpu node {node}") + + +def resume_nodes(nodes: List[str], resume_data=None): + """resume nodes in nodelist""" + if not nodes: + log.info("No nodes to resume") + return + + if resume_data is None and global_resume_data is not None: + resume_data = global_resume_data.deepcopy() + + nodes = sorted(nodes, key=lkp.node_prefix) + grouped_nodes, grouped_tpu_nodes = group_nodes_bulk(nodes, resume_data) + + if log.isEnabledFor(logging.DEBUG): + # grouped_nodelists is used in later debug logs too + grouped_nodelists = { + group: to_hostlist(chunk.nodes) for group, chunk in grouped_nodes.items() + } + grouped_tpu_nodelists = { + group: to_hostlist(chunk.nodes) + for group, chunk in grouped_tpu_nodes.items() + } + log.debug( + "node bulk groups: \n{}".format(yaml.safe_dump(grouped_nodelists).rstrip()) + ) + log.debug( + "TPU node bulk groups: \n{}".format( + yaml.safe_dump(grouped_tpu_nodelists).rstrip() + ) + ) + tpu_start_data = [] + tpu_objs = {} + for group, chunk in grouped_tpu_nodes.items(): + # do not create multiple tpu_objs if nodes with the same prefix are used + if chunk.prefix not in tpu_objs.keys(): + model = chunk.nodes[0] + tpu_objs[chunk.prefix] = TPU(lkp.node_nodeset(model)) + + tpu_start_data.append({"tpu": tpu_objs[chunk.prefix], "node": chunk.nodes}) + + # make all bulkInsert requests and execute with batch + inserts = { + group: create_instances_request( + chunk.nodes, chunk.partition_name, chunk.placement_group, chunk.job_id + ) + for group, chunk in grouped_nodes.items() + } + + bulk_ops = dict( + zip(inserts.keys(), map_with_futures(ensure_execute, inserts.values())) + ) + log.debug(f"bulk_ops={yaml.safe_dump(bulk_ops)}") + started = { + group: op for group, op in bulk_ops.items() if not isinstance(op, Exception) + } + failed = { + group: err for group, err in bulk_ops.items() if isinstance(err, Exception) + } + if failed: + failed_reqs = [str(e) for e in failed.items()] + log.error("bulkInsert API failures: {}".format("; ".join(failed_reqs))) + for ident, exc in failed.items(): + down_nodes(grouped_nodes[ident].nodes, f"GCP Error: {exc._get_reason()}") + + if log.isEnabledFor(logging.DEBUG): + for group, op in started.items(): + group_nodes = grouped_nodelists[group] + name = op["name"] + gid = op["operationGroupId"] + log.debug( + f"new bulkInsert operation started: group={group} nodes={group_nodes} name={name} operationGroupId={gid}" + ) + # wait for all bulkInserts to complete and log any errors + bulk_operations = {group: wait_for_operation(op) for group, op in started.items()} + + # Start TPU after regular nodes so that regular nodes are not affected by the slower TPU nodes + log.debug(f"tpu_start_data={yaml.safe_dump(tpu_start_data)}") + execute_with_futures(start_tpu, tpu_start_data) + + all_successful_inserts = [] + + for group, bulk_op in bulk_operations.items(): + group_id = bulk_op["operationGroupId"] + bulk_op_name = bulk_op["name"] + if "error" in bulk_op: + error = bulk_op["error"]["errors"][0] + group_nodes = to_hostlist_fast(grouped_nodes[group].nodes) + log.warning( + f"bulkInsert operation errors: {error['code']} name={bulk_op_name} operationGroupId={group_id} nodes={group_nodes}" + ) + successful_inserts, failed_inserts = separate( + lambda op: "error" in op, get_insert_operations(group_id) + ) + # Apparently multiple errors are possible... 
so join with +. + by_error_inserts = util.groupby_unsorted( + failed_inserts, + lambda op: "+".join(err["code"] for err in op["error"]["errors"]), + ) + for code, failed_ops in by_error_inserts: + failed_nodes = {trim_self_link(op["targetLink"]): op for op in failed_ops} + hostlist = util.to_hostlist(failed_nodes) + count = len(failed_nodes) + log.error( + f"{count} instances failed to start: {code} ({hostlist}) operationGroupId={group_id}" + ) + failed_node, failed_op = next(iter(failed_nodes.items())) + msg = "; ".join( + f"{err['code']}: {err['message'] if 'message' in err else 'no message'}" + for err in failed_op["error"]["errors"] + ) + if code != "RESOURCE_ALREADY_EXISTS": + down_nodes(hostlist, f"GCP Error: {msg}") + log.error( + f"errors from insert for node '{failed_node}' ({failed_op['name']}): {msg}" + ) + + ready_nodes = {trim_self_link(op["targetLink"]) for op in successful_inserts} + if len(ready_nodes) > 0: + ready_nodelist = to_hostlist_fast(ready_nodes) + log.info(f"created {len(ready_nodes)} instances: nodes={ready_nodelist}") + all_successful_inserts.extend(successful_inserts) + + +def update_job_comment(nodelist: str, comment: str): + if global_resume_data is None: + log.warning( + "Cannot update and notify jobs with API failures as no valid resume file is present." + ) + return + + nodes = util.to_hostnames(nodelist) + job_list = ( + job + for job in global_resume_data.jobs + if any(map(lambda node: node in nodes, util.to_hostnames(job.nodelist_resume))) + ) + for job in job_list: + run(f"{lkp.scontrol} update jobid={job.job_id} admincomment='{comment}'") + run(f"{lkp.scontrol} notify {job.job_id} '{comment}'") + + +def down_nodes(nodelist, reason): + """set nodes down with reason""" + if isinstance(nodelist, list): + nodelist = util.to_hostlist(nodelist) + update_job_comment(nodelist, reason) + run(f"{lkp.scontrol} update nodename={nodelist} state=down reason='{reason}'") + + +def hold_job(job_id, reason): + """hold job, set comment to reason""" + run(f"{lkp.scontrol} hold jobid={job_id}") + run(f"{lkp.scontrol} update jobid={job_id} comment='{reason}'") + + +def create_placement_request(pg_name, region): + config = { + "name": pg_name, + "region": region, + "groupPlacementPolicy": { + "collocation": "COLLOCATED", + }, + } + if lkp.cfg.enable_slurm_gcp_plugins: + slurm_gcp_plugins.pre_placement_group_insert( + lkp=lkp, pg_name=pg_name, region=region, request_body=config + ) + request = util.compute.resourcePolicies().insert( + project=cfg.project, region=region, body=config + ) + log_api_request(request) + return request + + +def create_placement_groups(node_list: list, job_id=0): + pgs = {} + node_map = lkp.nodeset_map(node_list) + for _, nodes in node_map.items(): + pgs.update(create_nodeset_placement_groups(nodes, job_id=job_id)) + return pgs + + +def create_nodeset_placement_groups(node_list: list, job_id=0): + model = next(iter(node_list)) + nodeset = lkp.node_nodeset(model) + if not nodeset.enable_placement: + return {None: node_list} + if not valid_placement_nodes(node_list): + return {None: node_list} + region = lkp.node_region(model) + + groups = { + f"{cfg.slurm_cluster_name}-{nodeset.nodeset_name}-{job_id}-{i}": nodes + for i, nodes in enumerate(chunked(node_list, n=PLACEMENT_MAX_CNT)) + } + + if log.isEnabledFor(logging.DEBUG): + debug_groups = { + group: to_hostlist_fast(nodes) for group, nodes in groups.items() + } + log.debug( + f"creating {len(groups)} placement groups: \n{yaml.safe_dump(debug_groups).rstrip()}" + ) + requests = { + group: 
create_placement_request(group, region) + for group, incl_nodes in groups.items() + } + ops = dict( + zip(requests.keys(), map_with_futures(ensure_execute, requests.values())) + ) + + def classify_result(item): + op = item[1] + if not isinstance(op, Exception): + return "submitted" + if all(e.get("reason") == "alreadyExists" for e in op.error_details): + return "redundant" + return "failed" + + grouped_ops = dict(util.groupby_unsorted(list(ops.items()), classify_result)) + submitted, redundant, failed = ( + dict(grouped_ops.get(key, {})) for key in ("submitted", "redundant", "failed") + ) + if redundant: + log.warning( + "placement policies already exist: {}".format(",".join(redundant.keys())) + ) + if failed: + reqs = [f"{e}" for _, e in failed.values()] + log.fatal("failed to create placement policies: {}".format("; ".join(reqs))) + operations = {group: wait_for_operation(op) for group, op in submitted.items()} + for group, op in operations.items(): + if "error" in op: + msg = "; ".join( + f"{err['code']}: {err['message'] if 'message' in err else 'no message'}" + for err in op["error"]["errors"] + ) + log.error( + f"placement group failed to create: '{group}' ({op['name']}): {msg}" + ) + + log.info( + f"created {len(operations)} placement groups ({to_hostlist_fast(operations.keys())})" + ) + return groups + + +def valid_placement_nodes(nodelist): + invalid_types = frozenset(["e2", "t2d", "n1", "t2a", "m1", "m2", "m3"]) + for node in nodelist: + mt = lkp.node_template_info(node).machineType + if mt.split("-")[0] in invalid_types: + log.warn(f"Unsupported machine type for placement policy: {mt}.") + log.warn( + f"Please do not use any the following machine types with placement policy: ({','.join(invalid_types)})" + ) + return False + return True + + +def get_resume_file_data(): + SLURM_RESUME_FILE = os.getenv("SLURM_RESUME_FILE") + if SLURM_RESUME_FILE is None: + log.warning( + "SLURM_RESUME_FILE was not in environment. Cannot get detailed job, node, partition allocation data." + ) + return None + resume_file = Path(SLURM_RESUME_FILE) + resume_json = resume_file.read_text() + if args.loglevel == logging.DEBUG: + (dirs.scripts / "resume_data.json").write_text(resume_json) + return NSDict(json.loads(resume_json)) + + +def main(nodelist, force=False): + """main called when run as script""" + log.debug(f"ResumeProgram {nodelist}") + # Filter out nodes not in config.yaml + other_nodes, pm_nodes = separate( + lkp.is_power_managed_node, util.to_hostnames(nodelist) + ) + if other_nodes: + log.debug( + f"Ignoring non-power-managed nodes '{to_hostlist_fast(other_nodes)}' from '{nodelist}'" + ) + + pm_nodelist = util.to_hostlist_fast(pm_nodes) + if pm_nodes: + log.debug(f"Resuming nodes '{pm_nodelist}' from '{nodelist}'") + else: + log.debug("No nodes to resume") + return + + log.info(f"resume {pm_nodelist}") + resume_nodes(pm_nodes, global_resume_data) + # TODO only run below if resume_nodes succeeds but + # resume_nodes does not currently return any status. 
+ if lkp.cfg.enable_slurm_gcp_plugins: + slurm_gcp_plugins.post_main_resume_nodes( + nodelist=nodelist, global_resume_data=global_resume_data + ) + + +parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter +) +parser.add_argument("nodelist", help="list of nodes to resume") +parser.add_argument( + "--force", + "-f", + "--static", + action="store_true", + help="Force attempted creation of the nodelist, whether nodes are exclusive or not.", +) +parser.add_argument( + "--debug", + "-d", + dest="loglevel", + action="store_const", + const=logging.DEBUG, + default=logging.INFO, + help="Enable debugging output", +) +parser.add_argument( + "--trace-api", + "-t", + action="store_true", + help="Enable detailed api request output", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + if cfg.enable_debug_logging: + args.loglevel = logging.DEBUG + if args.trace_api: + cfg.extra_logging_flags = list(cfg.extra_logging_flags) + cfg.extra_logging_flags.append("trace_api") + util.chown_slurm(LOGFILE, mode=0o600) + util.config_root_logger(filename, level=args.loglevel, logfile=LOGFILE) + sys.excepthook = util.handle_exception + + global_resume_data = get_resume_file_data() + main(args.nodelist, args.force) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py new file mode 100755 index 0000000000..92d14bc002 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -0,0 +1,545 @@ +#!/usr/bin/env python3 + +# Copyright (C) SchedMD LLC. +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import logging +import os +import shutil +import subprocess +import sys +import stat +import time +from pathlib import Path + +import util +from util import ( + lkp, + cfg, + dirs, + slurmdirs, + run, + install_custom_scripts, +) + +from conf import ( + install_slurm_conf, + install_slurmdbd_conf, + gen_cloud_conf, + gen_cloud_gres_conf, + gen_topology_conf, + install_gres_conf, + install_cgroup_conf, + install_topology_conf, + install_jobsubmit_lua, + login_nodeset, +) +from slurmsync import sync_slurm + +from setup_network_storage import ( + setup_network_storage, + setup_nfs_exports, +) + +SETUP_SCRIPT = Path(__file__) +filename = SETUP_SCRIPT.name +LOGFILE = ((cfg.slurm_log_dir if cfg else ".") / SETUP_SCRIPT).with_suffix(".log") +log = logging.getLogger(filename) + + +MOTD_HEADER = """ + SSSSSSS + SSSSSSSSS + SSSSSSSSS + SSSSSSSSS + SSSS SSSSSSS SSSS + SSSSSS SSSSSS + SSSSSS SSSSSSS SSSSSS + SSSS SSSSSSSSS SSSS + SSS SSSSSSSSS SSS + SSSSS SSSS SSSSSSSSS SSSS SSSSS + SSS SSSSSS SSSSSSSSS SSSSSS SSS + SSSSSS SSSSSSS SSSSSS + SSS SSSSSS SSSSSS SSS + SSSSS SSSS SSSSSSS SSSS SSSSS + S SSS SSSSSSSSS SSS S + SSS SSSS SSSSSSSSS SSSS SSS + S SSS SSSSSS SSSSSSSSS SSSSSS SSS S + SSSSS SSSSSS SSSSSSSSS SSSSSS SSSSS + S SSSSS SSSS SSSSSSS SSSS SSSSS S + S SSS SSS SSS SSS S + S S S S + SSS + SSS + SSS + SSS + SSSSSSSSSSSS SSS SSSS SSSS SSSSSSSSS SSSSSSSSSSSSSSSSSSSS +SSSSSSSSSSSSS SSS SSSS SSSS SSSSSSSSSS SSSSSSSSSSSSSSSSSSSSSS +SSSS SSS SSSS SSSS SSSS SSSS SSSS SSSS +SSSS SSS SSSS SSSS SSSS SSSS SSSS SSSS +SSSSSSSSSSSS SSS SSSS SSSS SSSS SSSS SSSS SSSS + SSSSSSSSSSSS SSS SSSS SSSS SSSS SSSS SSSS SSSS + SSSS SSS SSSS SSSS SSSS SSSS SSSS SSSS + SSSS SSS SSSS SSSS SSSS SSSS SSSS SSSS +SSSSSSSSSSSSS SSS SSSSSSSSSSSSSSS SSSS SSSS SSSS SSSS +SSSSSSSSSSSS SSS SSSSSSSSSSSSS SSSS SSSS SSSS SSSS + +""" + + +def start_motd(): + """advise in motd that slurm is currently configuring""" + wall_msg = "*** Slurm is currently being configured in the background. ***" + motd_msg = MOTD_HEADER + wall_msg + "\n\n" + Path("/etc/motd").write_text(motd_msg) + util.run(f"wall -n '{wall_msg}'", timeout=30) + + +def end_motd(broadcast=True): + """modify motd to signal that setup is complete""" + Path("/etc/motd").write_text(MOTD_HEADER) + + if not broadcast: + return + + run( + "wall -n '*** Slurm {} setup complete ***'".format(lkp.instance_role), + timeout=30, + ) + if lkp.instance_role != "controller": + run( + """wall -n ' +/home on the controller was mounted over the existing /home. +Log back in to ensure your home directory is correct. +'""", + timeout=30, + ) + + +def failed_motd(): + """modify motd to signal that setup is failed""" + wall_msg = f"*** Slurm setup failed! 
Please view log: {LOGFILE} ***" + motd_msg = MOTD_HEADER + wall_msg + "\n\n" + Path("/etc/motd").write_text(motd_msg) + util.run(f"wall -n '{wall_msg}'", timeout=30) + + +def run_custom_scripts(): + """run custom scripts based on instance_role""" + custom_dir = dirs.custom_scripts + if lkp.instance_role == "controller": + # controller has all scripts, but only runs controller.d + custom_dirs = [custom_dir / "controller.d"] + elif lkp.instance_role == "compute": + # compute setup with compute.d and nodeset.d + custom_dirs = [custom_dir / "compute.d", custom_dir / "nodeset.d"] + elif lkp.instance_role == "login": + # login setup with only login.d + custom_dirs = [custom_dir / "login.d"] + else: + # Unknown role: run nothing + custom_dirs = [] + custom_scripts = [ + p + for d in custom_dirs + for p in d.rglob("*") + if p.is_file() and not p.name.endswith(".disabled") + ] + print_scripts = ",".join(str(s.relative_to(custom_dir)) for s in custom_scripts) + log.debug(f"custom scripts to run: {custom_dir}/({print_scripts})") + + try: + for script in custom_scripts: + if "/controller.d/" in str(script): + timeout = lkp.cfg.get("controller_startup_scripts_timeout", 300) + elif "/compute.d/" in str(script) or "/nodeset.d/" in str(script): + timeout = lkp.cfg.get("compute_startup_scripts_timeout", 300) + elif "/login.d/" in str(script): + timeout = lkp.cfg.get("login_startup_scripts_timeout", 300) + else: + timeout = 300 + timeout = None if not timeout or timeout < 0 else timeout + log.info(f"running script {script.name} with timeout={timeout}") + result = run(str(script), timeout=timeout, check=False, shell=True) + runlog = ( + f"{script.name} returncode={result.returncode}\n" + f"stdout={result.stdout}stderr={result.stderr}" + ) + log.info(runlog) + result.check_returncode() + except OSError as e: + log.error(f"script {script} is not executable") + raise e + except subprocess.TimeoutExpired as e: + log.error(f"script {script} did not complete within timeout={timeout}") + raise e + except Exception as e: + log.error(f"script {script} encountered an exception") + log.exception(e) + raise e + + +def setup_secondary_disks(): + """Format and mount secondary disk""" + run( + "sudo mkfs.ext4 -m 0 -F -E lazy_itable_init=0,lazy_journal_init=0,discard /dev/sdb" + ) + with open("/etc/fstab", "a") as f: + f.write( + "\n/dev/sdb {0} ext4 discard,defaults,nofail 0 2".format( + dirs.secdisk + ) + ) + + +def setup_jwt_key(): + jwt_key = Path(slurmdirs.state / "jwt_hs256.key") + + if jwt_key.exists(): + log.info("JWT key already exists. Skipping key generation.") + else: + run("dd if=/dev/urandom bs=32 count=1 > " + str(jwt_key), shell=True) + + util.chown_slurm(jwt_key, mode=0o400) + + +def setup_munge_key(): + munge_key = Path(dirs.munge / "munge.key") + + if munge_key.exists(): + log.info("Munge key already exists. 
Skipping key generation.") + else: + run("create-munge-key -f", timeout=30) + + shutil.chown(munge_key, user="munge", group="munge") + os.chmod(munge_key, stat.S_IRUSR) + run("systemctl restart munge", timeout=30) + + +def setup_nss_slurm(): + """install and configure nss_slurm""" + # setup nss_slurm + util.mkdirp(Path("/var/spool/slurmd")) + run( + "ln -s {}/lib/libnss_slurm.so.2 /usr/lib64/libnss_slurm.so.2".format( + slurmdirs.prefix + ), + check=False, + ) + run(r"sed -i 's/\(^\(passwd\|group\):\s\+\)/\1slurm /g' /etc/nsswitch.conf") + + +def setup_sudoers(): + content = """ +# Allow SlurmUser to manage the slurm daemons +slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmd.service +slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmctld.service +""" + sudoers_file = Path("/etc/sudoers.d/slurm") + sudoers_file.write_text(content) + sudoers_file.chmod(0o0440) + + +def update_system_config(file, content): + """Add system defaults options for service files""" + sysconfig = Path("/etc/sysconfig") + default = Path("/etc/default") + + if sysconfig.exists(): + conf_dir = sysconfig + elif default.exists(): + conf_dir = default + else: + raise Exception("Cannot determine system configuration directory.") + + slurmd_file = Path(conf_dir, file) + slurmd_file.write_text(content) + + +def configure_mysql(): + cnfdir = Path("/etc/my.cnf.d") + if not cnfdir.exists(): + cnfdir = Path("/etc/mysql/conf.d") + if not (cnfdir / "mysql_slurm.cnf").exists(): + (cnfdir / "mysql_slurm.cnf").write_text( + """ +[mysqld] +bind-address=127.0.0.1 +innodb_buffer_pool_size=1024M +innodb_log_file_size=64M +innodb_lock_wait_timeout=900 +""" + ) + run("systemctl enable mariadb", timeout=30) + run("systemctl restart mariadb", timeout=30) + + mysql = "mysql -u root -e" + run(f"""{mysql} "drop user 'slurm'@'localhost'";""", timeout=30, check=False) + run(f"""{mysql} "create user 'slurm'@'localhost'";""", timeout=30) + run( + f"""{mysql} "grant all on slurm_acct_db.* TO 'slurm'@'localhost'";""", + timeout=30, + ) + run( + f"""{mysql} "drop user 'slurm'@'{lkp.control_host}'";""", + timeout=30, + check=False, + ) + run(f"""{mysql} "create user 'slurm'@'{lkp.control_host}'";""", timeout=30) + run( + f"""{mysql} "grant all on slurm_acct_db.* TO 'slurm'@'{lkp.control_host}'";""", + timeout=30, + ) + + +def configure_dirs(): + for p in dirs.values(): + util.mkdirp(p) + util.chown_slurm(dirs.slurm) + util.chown_slurm(dirs.scripts) + + for p in slurmdirs.values(): + util.mkdirp(p) + util.chown_slurm(p) + + etc_slurm = Path("/etc/slurm") + if etc_slurm.exists() and etc_slurm.is_symlink(): + etc_slurm.unlink() + etc_slurm.symlink_to(slurmdirs.etc) + + scripts_etc = dirs.scripts / "etc" + if scripts_etc.exists() and scripts_etc.is_symlink(): + scripts_etc.unlink() + scripts_etc.symlink_to(slurmdirs.etc) + + scripts_log = dirs.scripts / "log" + if scripts_log.exists() and scripts_log.is_symlink(): + scripts_log.unlink() + scripts_log.symlink_to(dirs.log) + + +def setup_controller(args): + """Run controller setup""" + log.info("Setting up controller") + util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600) + install_custom_scripts() + + install_slurm_conf(lkp) + install_slurmdbd_conf(lkp) + + gen_cloud_conf(lkp) + gen_cloud_gres_conf(lkp) + gen_topology_conf(lkp) + install_gres_conf(lkp) + install_cgroup_conf(lkp) + install_topology_conf(lkp) + install_jobsubmit_lua(lkp) + + setup_jwt_key() + setup_munge_key() + setup_sudoers() + + if cfg.controller_secondary_disk: + setup_secondary_disks() + setup_network_storage(log) + + 
run_custom_scripts() + + if not cfg.cloudsql_secret: + configure_mysql() + + run("systemctl enable slurmdbd", timeout=30) + run("systemctl restart slurmdbd", timeout=30) + + # Wait for slurmdbd to come up + time.sleep(5) + + sacctmgr = f"{slurmdirs.prefix}/bin/sacctmgr -i" + result = run( + f"{sacctmgr} add cluster {cfg.slurm_cluster_name}", timeout=30, check=False + ) + if "already exists" in result.stdout: + log.info(result.stdout) + elif result.returncode > 1: + result.check_returncode() # will raise error + + run("systemctl enable slurmctld", timeout=30) + run("systemctl restart slurmctld", timeout=30) + + run("systemctl enable slurmrestd", timeout=30) + run("systemctl restart slurmrestd", timeout=30) + + # Export at the end to signal that everything is up + run("systemctl enable nfs-server", timeout=30) + run("systemctl start nfs-server", timeout=30) + + setup_nfs_exports() + run("systemctl enable --now slurmcmd.timer", timeout=30) + + log.info("Check status of cluster services") + run("systemctl status munge", timeout=30) + run("systemctl status slurmdbd", timeout=30) + run("systemctl status slurmctld", timeout=30) + run("systemctl status slurmrestd", timeout=30) + + sync_slurm() + run("systemctl enable slurm_load_bq.timer", timeout=30) + run("systemctl start slurm_load_bq.timer", timeout=30) + run("systemctl status slurm_load_bq.timer", timeout=30) + + log.info("Done setting up controller") + pass + + +def setup_login(args): + """run login node setup""" + log.info("Setting up login") + slurmctld_host = f"{lkp.control_host}" + if lkp.control_addr: + slurmctld_host = f"{lkp.control_host}({lkp.control_addr})" + slurmd_options = [ + f'--conf-server="{slurmctld_host}:{lkp.control_host_port}"', + f'--conf="Feature={login_nodeset}"', + "-Z", + ] + sysconf = f"""SLURMD_OPTIONS='{" ".join(slurmd_options)}'""" + update_system_config("slurmd", sysconf) + install_custom_scripts() + + setup_network_storage(log) + setup_sudoers() + run("systemctl restart munge") + run("systemctl enable slurmd", timeout=30) + run("systemctl restart slurmd", timeout=30) + run("systemctl enable --now slurmcmd.timer", timeout=30) + + run_custom_scripts() + + log.info("Check status of cluster services") + run("systemctl status munge", timeout=30) + run("systemctl status slurmd", timeout=30) + + log.info("Done setting up login") + + +def setup_compute(args): + """run compute node setup""" + log.info("Setting up compute") + util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600) + slurmctld_host = f"{lkp.control_host}" + if lkp.control_addr: + slurmctld_host = f"{lkp.control_host}({lkp.control_addr})" + slurmd_options = [ + f'--conf-server="{slurmctld_host}:{lkp.control_host_port}"', + ] + if args.slurmd_feature is not None: + slurmd_options.append(f'--conf="Feature={args.slurmd_feature}"') + slurmd_options.append("-Z") + sysconf = f"""SLURMD_OPTIONS='{" ".join(slurmd_options)}'""" + update_system_config("slurmd", sysconf) + install_custom_scripts() + + setup_nss_slurm() + setup_network_storage(log) + + has_gpu = run("lspci | grep --ignore-case 'NVIDIA' | wc -l", shell=True).returncode + if has_gpu: + run("nvidia-smi") + + run_custom_scripts() + + setup_sudoers() + run("systemctl restart munge", timeout=30) + run("systemctl enable slurmd", timeout=30) + run("systemctl restart slurmd", timeout=30) + run("systemctl enable --now slurmcmd.timer", timeout=30) + + log.info("Check status of cluster services") + run("systemctl status munge", timeout=30) + run("systemctl status slurmd", timeout=30) + + log.info("Done setting 
up compute") + + +def main(args): + start_motd() + configure_dirs() + + # call the setup function for the instance type + setup = dict.get( + { + "controller": setup_controller, + "compute": setup_compute, + "login": setup_login, + }, + lkp.instance_role, + lambda: log.fatal(f"Unknown node role: {lkp.instance_role}"), + ) + setup(args) + + end_motd() + + +if __name__ == "__main__": + util.chown_slurm(LOGFILE, mode=0o600) + + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "--slurmd-feature", + dest="slurmd_feature", + help="Feature for slurmd to register with. Controller ignores this option.", + ) + args = parser.parse_args() + + util.config_root_logger(filename, logfile=LOGFILE) + sys.excepthook = util.handle_exception + + lkp = util.Lookup(cfg) # noqa F811 + + try: + main(args) + except subprocess.TimeoutExpired as e: + log.error( + f"""TimeoutExpired: + command={e.cmd} + timeout={e.timeout} + stdout: +{e.stdout.strip()} + stderr: +{e.stderr.strip()} +""" + ) + log.error("Aborting setup...") + failed_motd() + except subprocess.CalledProcessError as e: + log.error( + f"""CalledProcessError: + command={e.cmd} + returncode={e.returncode} + stdout: +{e.stdout.strip()} + stderr: +{e.stderr.strip()} +""" + ) + log.error("Aborting setup...") + failed_motd() + except Exception as e: + log.exception(e) + log.error("Aborting setup...") + failed_motd() diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py new file mode 100755 index 0000000000..b3283dd341 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py @@ -0,0 +1,307 @@ +#!/usr/bin/env python3 + +# Copyright (C) SchedMD LLC. +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import stat +import time + +import shutil +from pathlib import Path +from concurrent.futures import as_completed +from addict import Dict as NSDict + +import util +from util import lkp, run, cfg, dirs, separate + + +def mounts_by_local(mounts): + """convert list of mounts to dict of mounts, local_mount as key""" + return {str(Path(m.local_mount).resolve()): m for m in mounts} + + +def resolve_network_storage(nodeset=None): + """Combine appropriate network_storage fields to a single list""" + + if lkp.instance_role == "compute": + try: + nodeset = lkp.node_nodeset() + except Exception: + # External nodename, skip lookup + nodeset = None + + # seed mounts with the default controller mounts + if cfg.disable_default_mounts: + default_mounts = [] + else: + default_mounts = [ + NSDict( + { + "server_ip": lkp.control_addr or lkp.control_host, + "remote_mount": str(path), + "local_mount": str(path), + "fs_type": "nfs", + "mount_options": "defaults,hard,intr", + } + ) + for path in ( + dirs.home, + dirs.apps, + ) + ] + + # create dict of mounts, local_mount: mount_info + mounts = mounts_by_local(default_mounts) + + # On non-controller instances, entries in network_storage could overwrite + # default exports from the controller. Be careful, of course + mounts.update(mounts_by_local(cfg.network_storage)) + if lkp.instance_role in ("login", "controller"): + mounts.update(mounts_by_local(cfg.login_network_storage)) + + if nodeset is not None: + mounts.update(mounts_by_local(nodeset.network_storage)) + return list(mounts.values()) + + +def separate_external_internal_mounts(mounts): + """separate into cluster-external and internal mounts""" + + def internal_mount(mount): + # NOTE: Valid Lustre server_ip can take the form of '@tcp' + server_ip = mount.server_ip.split("@")[0] + mount_addr = util.host_lookup(server_ip) + return mount_addr == lkp.control_host_addr + + return separate(internal_mount, mounts) + + +def setup_network_storage(log): + """prepare network fs mounts and add them to fstab""" + log.info("Set up network storage") + # filter mounts into two dicts, cluster-internal and external mounts + + all_mounts = resolve_network_storage() + ext_mounts, int_mounts = separate_external_internal_mounts(all_mounts) + + if lkp.instance_role == "controller": + mounts = ext_mounts + else: + mounts = ext_mounts + int_mounts + + # Determine fstab entries and write them out + fstab_entries = [] + for mount in mounts: + local_mount = Path(mount.local_mount) + remote_mount = mount.remote_mount + fs_type = mount.fs_type + server_ip = mount.server_ip or "" + util.mkdirp(local_mount) + + log.info( + "Setting up mount ({}) {}{} to {}".format( + fs_type, + server_ip + ":" if fs_type != "gcsfuse" else "", + remote_mount, + local_mount, + ) + ) + + mount_options = mount.mount_options.split(",") if mount.mount_options else [] + if not mount_options or "_netdev" not in mount_options: + mount_options += ["_netdev"] + + if fs_type == "gcsfuse": + fstab_entries.append( + "{0} {1} {2} {3} 0 0".format( + remote_mount, local_mount, fs_type, ",".join(mount_options) + ) + ) + else: + fstab_entries.append( + "{0}:{1} {2} {3} {4} 0 0".format( + server_ip, + remote_mount, + local_mount, + fs_type, + ",".join(mount_options), + ) + ) + + fstab = Path("/etc/fstab") + if not Path(fstab.with_suffix(".bak")).is_file(): + shutil.copy2(fstab, fstab.with_suffix(".bak")) + shutil.copy2(fstab.with_suffix(".bak"), fstab) + with open(fstab, "a") as f: + f.write("\n") + for entry in fstab_entries: + f.write(entry) + 
f.write("\n") + + mount_fstab(mounts_by_local(mounts), log) + munge_mount_handler(log) + + +def mount_fstab(mounts, log): + """Wait on each mount, then make sure all fstab is mounted""" + from more_executors import Executors, ExceptionRetryPolicy + + def mount_path(path): + log.info(f"Waiting for '{path}' to be mounted...") + try: + run(f"mount {path}", timeout=120) + except Exception as e: + exc_type, _, _ = sys.exc_info() + log.error(f"mount of path '{path}' failed: {exc_type}: {e}") + raise e + log.info(f"Mount point '{path}' was mounted.") + + MAX_MOUNT_TIMEOUT = 60 * 5 + future_list = [] + retry_policy = ExceptionRetryPolicy( + max_attempts=40, exponent=1.6, sleep=1.0, max_sleep=16.0 + ) + with Executors.thread_pool().with_timeout(MAX_MOUNT_TIMEOUT).with_retry( + retry_policy=retry_policy + ) as exe: + for path in mounts: + future = exe.submit(mount_path, path) + future_list.append(future) + + # Iterate over futures, checking for exceptions + for future in as_completed(future_list): + try: + future.result() + except Exception as e: + raise e + + +def munge_mount_handler(log): + if not cfg.munge_mount: + log.error("Missing munge_mount in cfg") + elif lkp.instance_role == "controller": + return + + mount = cfg.munge_mount + server_ip = ( + mount.server_ip + if mount.server_ip + else (cfg.slurm_control_addr or cfg.slurm_control_host) + ) + remote_mount = mount.remote_mount + local_mount = Path("/mnt/munge") + fs_type = mount.fs_type if mount.fs_type is not None else "nfs" + mount_options = ( + mount.mount_options + if mount.mount_options is not None + else "defaults,hard,intr,_netdev" + ) + + munge_key = Path(dirs.munge / "munge.key") + + log.info(f"Mounting munge share to: {local_mount}") + local_mount.mkdir() + if fs_type.lower() == "gcsfuse".lower(): + if remote_mount is None: + remote_mount = "" + cmd = [ + "gcsfuse", + f"--only-dir={remote_mount}" if remote_mount != "" else None, + server_ip, + str(local_mount), + ] + else: + if remote_mount is None: + remote_mount = Path("/etc/munge") + cmd = [ + "mount", + f"--types={fs_type}", + f"--options={mount_options}" if mount_options != "" else None, + f"{server_ip}:{remote_mount}", + str(local_mount), + ] + # wait max 120s for munge mount + timeout = 120 + for retry, wait in enumerate(util.backoff_delay(0.5, timeout), 1): + try: + run(cmd, timeout=timeout) + break + except Exception as e: + log.error( + f"munge mount failed: '{cmd}' {e}, try {retry}, waiting {wait:0.2f}s" + ) + time.sleep(wait) + err = e + continue + else: + raise err + + log.info(f"Copy munge.key from: {local_mount}") + shutil.copy2(Path(local_mount / "munge.key"), munge_key) + + log.info("Restrict permissions of munge.key") + shutil.chown(munge_key, user="munge", group="munge") + os.chmod(munge_key, stat.S_IRUSR) + + log.info(f"Unmount {local_mount}") + if fs_type.lower() == "gcsfuse".lower(): + run(f"fusermount -u {local_mount}", timeout=120) + else: + run(f"umount {local_mount}", timeout=120) + shutil.rmtree(local_mount) + + +def setup_nfs_exports(): + """nfs export all needed directories""" + # The controller only needs to set up exports for cluster-internal mounts + # switch the key to remote mount path since that is what needs exporting + mounts = resolve_network_storage() + # manually add munge_mount + mounts.append( + NSDict( + { + "server_ip": cfg.munge_mount.server_ip, + "remote_mount": cfg.munge_mount.remote_mount, + "local_mount": Path(f"{dirs.munge}_tmp"), + "fs_type": cfg.munge_mount.fs_type, + "mount_options": cfg.munge_mount.mount_options, + } + ) + ) + # 
controller mounts
+    _, con_mounts = separate_external_internal_mounts(mounts)
+    con_mounts = {m.remote_mount: m for m in con_mounts}
+    for nodeset in cfg.nodeset.values():
+        # get internal mounts for each nodeset by calling
+        # resolve_network_storage as from a node in each nodeset
+        ns_mounts = resolve_network_storage(nodeset=nodeset)
+        _, int_mounts = separate_external_internal_mounts(ns_mounts)
+        con_mounts.update({m.remote_mount: m for m in int_mounts})
+
+    # export path if corresponding selector boolean is True
+    exports = []
+    for path in con_mounts:
+        util.mkdirp(Path(path))
+        run(rf"sed -i '\#{path}#d' /etc/exports", timeout=30)
+        exports.append(f"{path} *(rw,no_subtree_check,no_root_squash)")
+
+    exportsd = Path("/etc/exports.d")
+    util.mkdirp(exportsd)
+    with (exportsd / "slurm.exports").open("w") as f:
+        f.write("\n")
+        f.write("\n".join(exports))
+    run("exportfs -a", timeout=30)
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/README.md
new file mode 100644
index 0000000000..7c73936327
--- /dev/null
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/README.md
@@ -0,0 +1,107 @@
+# Plugin mechanism for slurm-gcp
+
+## Introduction
+
+Slurm provides many hooks for customizing its various functions. In fact,
+slurm-gcp already uses one of these customization points, PrologSlurmctld,
+to perform tasks related to VM instance creation in response to job node
+allocation.
+
+The plugin mechanism in this directory similarly allows deployment-specific
+customizations of slurm-gcp by dropping Python modules into
+`/scripts/slurm_gcp_plugins` and enabling them by setting the
+configuration directive `enable_slurm_gcp_plugins = true` in
+`/scripts/config.yaml`.
+
+A very basic `test_plugin` is provided as an example.
+
+## Plugins
+
+Callbacks to registered plugins can be made from various places in resume.py and
+suspend.py. The following callbacks are currently made:
+
+### Callback function signature
+
+Callback functions in the plugins are recommended to be declared as follows:
+
+```python
+def post_main_resume_nodes(*pos_args, **keyword_args):
+...
+```
+
+and to extract their arguments from `keyword_args`. Check the callback sites to
+understand which values are available.
+
+### Current callback sites:
+
+Callbacks are currently performed from the following places:
+
+#### scripts/resume.py:main_resume_nodes
+
+At the end of main, the following callback is called:
+
+```python
+def post_main_resume_nodes(*pos_args, **keyword_args):
+```
+
+The primary intention is to allow a plugin to record details about the instances
+and/or set up or change properties for which the VMs need to be up and running.
+
+Currently the call is made regardless of whether the resume node operation
+succeeded or not.
+
+#### scripts/resume.py:create_instances_request
+
+In create_instances_request, just before the bulk instance insert is called, the
+following callback is invoked:
+
+```python
+def pre_instance_bulk_insert(*pos_args, **keyword_args):
+```
+
+The primary intention is to allow a plugin to modify the instance creation request.
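+
+For illustration, a minimal sketch of a hypothetical `add_label` plugin module
+(the plugin name and label values below are only examples, not part of
+slurm-gcp) could use this callback to tag every bulk-created instance with an
+extra label; `request_body` is the bulkInsert request body assembled in
+resume.py:
+
+```python
+import logging
+
+
+def pre_instance_bulk_insert(*pos_args, **keyword_args):
+    # request_body is the bulkInsert body built by resume.py;
+    # instanceProperties.labels already carries the slurm-gcp labels.
+    body = keyword_args["request_body"]
+    body.instanceProperties.labels["created-by-plugin"] = "add-label"
+    logging.info("add_label plugin tagged the bulkInsert request")
+
+
+__all__ = ["pre_instance_bulk_insert"]
+```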
+ +#### scripts/resume.py:create_placement_request + +In create_instances_request just before the resource policy creation, the +following callback is called + +```python +def pre_placement_group_insert(*pos_args, **keyword_args): +``` + +The primary intention is allow a plugin to modify the resource policy creation +request. + +#### scripts/suspend.py:main_suspend_nodes + +In main just before the VMs are deleted but while they still (should) exist, the +following callback is called + +```python +def pre_main_suspend_nodes(*pos_args, **keyword_args): +``` + +The primary intention is allow a plugin to cleanup or record details while the +node still exists. + +#### scripts/util.py:instances + +Just before the per-instance information is requested the following callback is +called: + +```python +def register_instance_information_fields(*pos_args, **keyword_args): +``` + +The primary intention is allow a plugin to add information to the per instance +lookup. + +### Logging and error handling + +Plugin functions are recommended to use `logging` to communicate information, +warnings and errors. The `slurm_gcp_plugins` registry tries to isolate the +caller of the callbacks (i.e. resume.py and suspend.py) from effects of errors +with a general try-catch wrapper for each plugin callback. However - as the +callback happens in the same process there are notable limits on how much +isolation that can be achieved. diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/__init__.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/__init__.py new file mode 100644 index 0000000000..a4f11079b1 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/__init__.py @@ -0,0 +1,135 @@ +import importlib +import pkgutil +import logging +import inspect + +# Only perform discovery at init +discovered_plugins = { + name.lstrip("."): importlib.import_module(name=name, package="slurm_gcp_plugins") + for finder, name, ispkg in pkgutil.iter_modules(path=__path__, prefix=".") + if name.lstrip(".") != "utils" +} + +logging.info( + ( + "slurm_gcp_plugins found:" + + ", ".join( + [ + "slurm_gcp_plugins" + plugin + for plugin in sorted(discovered_plugins.keys()) + ] + ) + ) +) + + +def get_plugins(): + return discovered_plugins + + +def get_plugins_function(function_name): + plugins = get_plugins() + + return { + plugin: function + for plugin in sorted(plugins.keys()) + for name, function in inspect.getmembers(plugins[plugin], inspect.isfunction) + if name == function_name + } + + +def run_plugins_for_function(plugin_function_name, pos_args, keyword_args): + if "lkp" not in keyword_args: + logging.error( + ( + f"Plugin callback {plugin_function_name} called" + + 'without a "lkp" argument need to get obtain deployment' + + "information" + ) + ) + return + + if not keyword_args["lkp"].cfg: + logging.error( + ( + f"Plugin callback {plugin_function_name} called" + + 'with "lkp.cfg" unpopulated. 
lkp.cfg is needed' + + "to argument need to get obtain deployment" + + "information" + ) + ) + return + + cfg = keyword_args["lkp"].cfg + if cfg.enable_slurm_gcp_plugins: + for plugin, function in get_plugins_function(plugin_function_name).items(): + if plugin in cfg.enable_slurm_gcp_plugins: + logging.debug(f"Running {function} from plugin {plugin}") + try: + function(*pos_args, **keyword_args) + except BaseException as e: + logging.error( + f"Plugin callback {plugin}:{function} caused an exception: {e}" + ) + else: + logging.debug( + f"Not running {function} from non-enabled plugin {plugin}" + ) + + +# Implement this function to add fields to the cached VM instance lookup +def register_instance_information_fields(*pos_args, **keyword_args): + run_plugins_for_function( + plugin_function_name="register_instance_information_fields", + pos_args=pos_args, + keyword_args=keyword_args, + ) + + +# Called just after VM instances have been created and are up +def post_main_resume_nodes(*pos_args, **keyword_args): + run_plugins_for_function( + plugin_function_name="post_main_resume_nodes", + pos_args=pos_args, + keyword_args=keyword_args, + ) + + +# Called just before VM instances are deleted should be still up +# (NOTE: if a node has failed it might not be up or unresponsive) +def pre_main_suspend_nodes(*pos_args, **keyword_args): + run_plugins_for_function( + plugin_function_name="pre_main_suspend_nodes", + pos_args=pos_args, + keyword_args=keyword_args, + ) + + +# Called just before VM instances are created are created with +# bulkInsert- this function can be implemented to inspect and/or +# modify the insertion request. +def pre_instance_bulk_insert(*pos_args, **keyword_args): + run_plugins_for_function( + plugin_function_name="pre_instance_bulk_insert", + pos_args=pos_args, + keyword_args=keyword_args, + ) + + +# Called just before placement groups are created - this function can +# be implemented to inspect and/or modify the insertion request. +def pre_placement_group_insert(*pos_args, **keyword_args): + run_plugins_for_function( + plugin_function_name="pre_placement_group_insert", + pos_args=pos_args, + keyword_args=keyword_args, + ) + + +__all__ = [ + "post_main_resume_nodes", + "pre_main_suspend_nodes", + "register_instance_information_fields", + "pre_instance_bulk_insert", + "pre_placement_group_insert", +] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/README.md new file mode 100644 index 0000000000..9e8ad4afeb --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/README.md @@ -0,0 +1,38 @@ +# max_hops slurm_gcp_plugin plugin + +## Overview + +This plugin allows placement parameters to be set controlling the max number of +network hops between nodes in dynamic jobs. + +## Usage + +### Configuration + +This plugin can be enabled by adding the following to the slurm-gcp config: + +```yaml +enable_slurm_gcp_plugins: + #possibly other plugins + max_hops: + max_hops: 1 +``` + +to set the default max_hops to, in this example, 1 for _all_ jobs. + +### Per job setting + +The max hops setting can be changed on a per job basis using the --prefer +argument e.g. as follows: + +salloc --prefer=max_hops.max_hops=1 + +to allow at most one network hop. 
For this to work, the
+`ignore_prefer_validation` flag needs to be added to the slurm `SchedulerParameters`
+configuration item.
+
+## Callbacks used
+
+### pre_placement_group_insert
+
+Used to change the placement group creation request.
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/__init__.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/__init__.py
new file mode 100644
index 0000000000..6505f8f47e
--- /dev/null
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/__init__.py
@@ -0,0 +1,58 @@
+import logging
+import sys
+import slurm_gcp_plugins.utils as sgp_utils
+
+# Allows setting a specific max_hops value for jobs
+#
+# To enable:
+# * add this directory to the slurm-gcp plugin path (usually /slurm/scripts/slurm-gcp-plugins)
+# * add the following to the slurm-gcp config (usually /slurm/scripts/config.yaml):
+#
+# enable_slurm_gcp_plugins:
+#   # possibly other plugins
+#   max_hops:
+#     max_hops: N
+#
+#
+# Where N can be 1, 2 or 3 (in increasing order of distance)
+# If no max_hops is provided but the plugin is still enabled, the default level is 3
+
+
+def pre_placement_group_insert(*pos_args, **keyword_args):
+    logging.info("Trying to enable max hop")
+    # Avoid circular import (util imports the plugins)
+    if "util" in sys.modules:
+        logging.info("Setting compute service version to beta")
+        sys.modules["util"].compute = sys.modules["util"].compute_service(
+            version="beta"
+        )
+        max_distance = sgp_utils.get_plugin_setting(
+            plugin="max_hops",
+            setting="max_hops",
+            job=get_job_from_placement_group_name(keyword_args["pg_name"]),
+            lkp=keyword_args["lkp"],
+            default=3,
+        )
+        logging.debug(f"Setting max hop for placement policy to {max_distance}")
+        keyword_args["request_body"]["groupPlacementPolicy"][
+            "collocation"
+        ] = "COLLOCATED"
+        keyword_args["request_body"]["groupPlacementPolicy"][
+            "maxDistance"
+        ] = max_distance
+    else:
+        logging.error(
+            "max_hops can not be set (slurm_gcp util.py must be imported by the caller of the plugin callback)"
+        )
+
+
+__all__ = [
+    "pre_placement_group_insert",
+]
+
+
+# This should be replaced if the job id becomes available in the context of this plugin hook
+def get_job_from_placement_group_name(pg_name):
+    # f"{cfg.slurm_cluster_name}-{nodeset_name}-{job_id}-{i}"
+
+    return pg_name.split("-")[2]
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/test_plugin/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/test_plugin/README.md
new file mode 100644
index 0000000000..c3a46ca420
--- /dev/null
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/test_plugin/README.md
@@ -0,0 +1,16 @@
+# Test slurm_gcp_plugin plugin
+
+## Overview
+
+This is a very basic but still useful test plugin that records the VM instance
+id of the nodes used for jobs (when dynamic nodes are used).
+
+## Callbacks used
+
+### post_main_resume_nodes
+
+Used to log the instance id of created VMs.
+
+### register_instance_information_fields
+
+Used to add the instance id to the information collected for VM instances.
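+
+## Usage
+
+Analogous to the `max_hops` plugin configuration shown above (a sketch of the
+expected setup, since the exact contents of config.yaml depend on the
+deployment), the plugin can be enabled by listing it under
+`enable_slurm_gcp_plugins` in the slurm-gcp config:
+
+```yaml
+enable_slurm_gcp_plugins:
+  #possibly other plugins
+  test_plugin:
+```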
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/test_plugin/__init__.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/test_plugin/__init__.py new file mode 100644 index 0000000000..deb53f7aa9 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/test_plugin/__init__.py @@ -0,0 +1,27 @@ +import logging + +instance_information_fields = ["resourceStatus", "id"] + + +def register_instance_information_fields(*pos_args, **keyword_args): + logging.debug("register_instance_information_fields called from test_plugin") + keyword_args["instance_information_fields"].extend(instance_information_fields) + + +def post_main_resume_nodes(*pos_args, **keyword_args): + logging.debug("post_main_resume_nodes called from test_plugin") + for node in keyword_args["nodelist"]: + logging.info( + ( + "test_plugin:" + + f"nodename:{node} " + + f"instance_id:{keyword_args['lkp'].instance(node)['id']} " + + f"physicalHost:{keyword_args['lkp'].instance(node)['resourceStatus']['physicalHost']}" + ) + ) + + +__all__ = [ + "register_instance_information_fields", + "post_main_resume_nodes", +] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/utils/__init__.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/utils/__init__.py new file mode 100644 index 0000000000..6977fb5c93 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/utils/__init__.py @@ -0,0 +1,56 @@ +import subprocess +import logging + +# Various plugin utility functions + +# Plugin helper function to get plugin settings in the following order: +# +# 1. from job features with +# 2. from slurm-gcp config +# 3. If provided, the default +# 4. None + + +def get_plugin_setting(plugin, setting, lkp, job, default=None): + features = get_job_features(job) + if f"{plugin}.{setting}" in features: + return features[f"{plugin}.{setting}"] + + if "enable_slurm_gcp_plugins" in lkp.cfg: + if plugin in lkp.cfg.enable_slurm_gcp_plugins: + try: + iter(lkp.cfg.enable_slurm_gcp_plugins[plugin]) + except TypeError: + # not iterable + 1 + else: + if setting in lkp.cfg.enable_slurm_gcp_plugins[plugin]: + return lkp.cfg.enable_slurm_gcp_plugins[plugin][setting] + + return default + + +# Plugin helper function to get job features +def get_job_features(job): + if job is None: + return {} + + features = {} + res, output = subprocess.getstatusoutput(f"squeue -h -o %f -j {job}") + if res == 0: + for feature in output.split("&"): + kv = feature.split("=", 1) + v = None + if len(kv) == 2: + v = kv[1] + features[kv[0]] = v + else: + logging.error("Unable to retrieve features of job:{job}") + + return features + + +__all__ = [ + "get_plugin_setting", + "get_job_features", +] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py new file mode 100755 index 0000000000..9104be7ea3 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -0,0 +1,575 @@ +#!/usr/bin/env python3 + +# Copyright (C) SchedMD LLC. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import datetime +import fcntl +import hashlib +import json +import logging +import re +import sys +from enum import Enum +from itertools import chain +from pathlib import Path +import yaml + +import util +from util import ( + batch_execute, + ensure_execute, + execute_with_futures, + fetch_config_yaml, + fetch_config_yaml_md5, + install_custom_scripts, + load_config_file, + run, + save_config, + separate, + to_hostlist_fast, + Lookup, + NSDict, + TPU, + chunked, +) +from util import lkp, cfg, compute, CONFIG_FILE +from suspend import delete_instances +from resume import start_tpu +from conf import ( + gen_cloud_conf, + gen_cloud_gres_conf, + gen_topology_conf, + install_slurm_conf, + install_slurmdbd_conf, + install_gres_conf, + install_cgroup_conf, + install_topology_conf, +) + +filename = Path(__file__).name +LOGFILE = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log") + +log = logging.getLogger(filename) + +TOT_REQ_CNT = 1000 + + +NodeStatus = Enum( + "NodeStatus", + ( + "orphan", + "power_down", + "preempted", + "restore", + "resume", + "terminated", + "unbacked", + "unchanged", + "unknown", + ), +) + + +def start_instance_op(inst, project=None): + project = project or lkp.project + return compute.instances().start( + project=project, + zone=lkp.instance(inst).zone, + instance=inst, + ) + + +def start_instances(node_list): + log.info("{} instances to start ({})".format(len(node_list), ",".join(node_list))) + + normal, tpu_nodes = separate(lkp.node_is_tpu, node_list) + invalid, valid = separate(lambda inst: bool(lkp.instance), normal) + + ops = {inst: start_instance_op(inst) for inst in valid} + + done, failed = batch_execute(ops) + + tpu_start_data = [] + for ns, nodes in util.groupby_unsorted(tpu_nodes, lkp.node_nodeset_name): + tpuobj = TPU(lkp.cfg.nodeset_tpu[ns]) + for snodes in chunked(nodes, n=tpuobj.vmcount): + tpu_start_data.append({"tpu": tpuobj, "node": snodes}) + execute_with_futures(start_tpu, tpu_start_data) + + +def _find_dynamic_node_status() -> NodeStatus: + # TODO: cover more cases: + # * delete dead dynamic nodes + # * delete orhpaned instances + return NodeStatus.unchanged # don't touch dynamic nodes + + +def _find_tpu_node_status(nodename, state): + ns = lkp.node_nodeset(nodename) + tpuobj = TPU(ns) + inst = tpuobj.get_node(nodename) + # If we do not find the node but it is from a Tpu that has multiple vms look for the master node + if inst is None and tpuobj.vmcount > 1: + # Get the tpu slurm nodelist of the nodes in the same tpu group as nodename + nodelist = run( + f"{lkp.scontrol} show topo {nodename}" + + " | awk -F'=' '/Level=0/ { print $NF }'", + shell=True, + ).stdout + l_nodelist = util.to_hostnames(nodelist) + group_names = set(l_nodelist) + # get the list of all the existing tpus in the nodeset + tpus_list = set(tpuobj.list_node_names()) + # In the intersection there must be only one node that is the master + tpus_int = list(group_names.intersection(tpus_list)) + if 
len(tpus_int) > 1: + log.error( + f"More than one cloud tpu node for tpu group {nodelist}, there should be only one that should be {l_nodelist[0]}, but we have found {tpus_int}" + ) + return NodeStatus.unknown + if len(tpus_int) == 1: + inst = tpuobj.get_node(tpus_int[0]) + # if len(tpus_int ==0) this case is not relevant as this would be the case always that a TPU group is not running + if inst is None: + if state.base == "DOWN" and "POWERED_DOWN" in state.flags: + return NodeStatus.restore + if "POWERING_DOWN" in state.flags: + return NodeStatus.restore + if "COMPLETING" in state.flags: + return NodeStatus.unbacked + if state.base != "DOWN" and not ( + set(("POWER_DOWN", "POWERING_UP", "POWERING_DOWN", "POWERED_DOWN")) + & state.flags + ): + return NodeStatus.unbacked + if nodename in find_node_status.static_nodeset: + return NodeStatus.resume + elif ( + state is not None + and "POWERED_DOWN" not in state.flags + and "POWERING_DOWN" not in state.flags + and inst.state == TPU.State.STOPPED + ): + if tpuobj.preemptible: + return NodeStatus.preempted + if not state.base.startswith("DOWN"): + return NodeStatus.terminated + elif ( + state is None or "POWERED_DOWN" in state.flags + ) and inst.state == TPU.State.READY: + return NodeStatus.orphan + elif state is None: + # if state is None here, the instance exists but it's not in Slurm + return NodeStatus.unknown + + return NodeStatus.unchanged + + +def allow_power_down(state): + config = run(f"{lkp.scontrol} show config").stdout.rstrip() + m = re.search(r"SuspendExcStates\s+=\s+(?P[\w\(\)]+)", config) + if not m: + log.warning("SuspendExcStates not found in Slurm config") + return True + states = set(m.group("states").split(",")) + if "(null)" in states or bool(state & state.flags.union(state.base)): + return False + return True + + +def find_node_status(nodename): + """Determine node/instance status that requires action""" + state = lkp.slurm_node(nodename) + + if lkp.node_is_dyn(nodename): + return _find_dynamic_node_status() + + if lkp.node_is_tpu(nodename): + return _find_tpu_node_status(nodename, state) + + # split below is workaround for VMs whose hostname is FQDN + inst = lkp.instance(nodename.split(".")[0]) + power_flags = frozenset( + ("POWER_DOWN", "POWERING_UP", "POWERING_DOWN", "POWERED_DOWN") + ) & (state.flags if state is not None else set()) + + if inst is None: + if "POWERING_UP" in state.flags: + return NodeStatus.unchanged + if state.base == "DOWN" and "POWERED_DOWN" in state.flags: + return NodeStatus.restore + if "POWERING_DOWN" in state.flags: + return NodeStatus.restore + if "COMPLETING" in state.flags: + return NodeStatus.unbacked + if state.base != "DOWN" and not power_flags: + return NodeStatus.unbacked + if state.base == "DOWN" and not power_flags and allow_power_down(state): + return NodeStatus.power_down + if "POWERED_DOWN" in state.flags and lkp.is_static_node(nodename): + return NodeStatus.resume + elif ( + state is not None + and "POWERED_DOWN" not in state.flags + and "POWERING_DOWN" not in state.flags + and inst.status == "TERMINATED" + ): + if inst.scheduling.preemptible: + return NodeStatus.preempted + if not state.base.startswith("DOWN"): + return NodeStatus.terminated + elif (state is None or "POWERED_DOWN" in state.flags) and inst.status == "RUNNING": + log.info("%s is potential orphan node", nodename) + age_threshold_seconds = 90 + inst_seconds_old = _seconds_since_timestamp(inst.creationTimestamp) + log.info("%s state: %s, age: %0.1fs", nodename, state, inst_seconds_old) + if inst_seconds_old < 
age_threshold_seconds: + log.info( + "%s not marked as orphan, it started less than %ds ago (%0.1fs)", + nodename, + age_threshold_seconds, + inst_seconds_old, + ) + return NodeStatus.unchanged + return NodeStatus.orphan + elif state is None: + # if state is None here, the instance exists but it's not in Slurm + return NodeStatus.unknown + + return NodeStatus.unchanged + + +def _seconds_since_timestamp(timestamp): + """Returns duration in seconds since a timestamp + Args: + timestamp: A formatted timestamp string (%Y-%m-%dT%H:%M:%S.%f%z) + Returns: + number of seconds that have past since the timestamp (float) + """ + if timestamp[-3] == ":": # python 36 datetime does not support the colon + timestamp = timestamp[:-3] + timestamp[-2:] + creation_dt = datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%f%z") + return datetime.datetime.now().timestamp() - creation_dt.timestamp() + + +def do_node_update(status, nodes): + """update node/instance based on node status""" + if status == NodeStatus.unchanged: + return + count = len(nodes) + hostlist = util.to_hostlist(nodes) + + def nodes_down(): + """down nodes""" + log.info( + f"{count} nodes set down due to node status '{status.name}' ({hostlist})" + ) + run( + f"{lkp.scontrol} update nodename={hostlist} state=down reason='Instance stopped/deleted'" + ) + + def nodes_restart(): + """start instances for nodes""" + log.info(f"{count} instances restarted ({hostlist})") + start_instances(nodes) + + def nodes_idle(): + """idle nodes""" + log.info(f"{count} nodes to idle ({hostlist})") + run(f"{lkp.scontrol} update nodename={hostlist} state=resume") + + def nodes_resume(): + """resume nodes via scontrol""" + log.info(f"{count} instances to resume ({hostlist})") + run(f"{lkp.scontrol} update nodename={hostlist} state=power_up") + + def nodes_delete(): + """delete instances for nodes""" + log.info(f"{count} instances to delete ({hostlist})") + delete_instances(nodes) + + def nodes_power_down(): + """power_down node in slurm""" + log.info(f"{count} instances to power down ({hostlist})") + run(f"{lkp.scontrol} update nodename={hostlist} state=power_down") + + def nodes_unknown(): + """Error status, nodes shouldn't get in this status""" + log.error(f"{count} nodes have unexpected status: ({hostlist})") + first = next(iter(nodes)) + state = lkp.slurm_node(first) + state = "{}+{}".format(state.base, "+".join(state.flags)) if state else "None" + inst = lkp.instance(first) + log.error(f"{first} state: {state}, instance status:{inst.status}") + + update = dict.get( + { + NodeStatus.orphan: nodes_delete, + NodeStatus.power_down: nodes_power_down, + NodeStatus.preempted: lambda: (nodes_down(), nodes_restart()), + NodeStatus.restore: nodes_idle, + NodeStatus.resume: nodes_resume, + NodeStatus.terminated: nodes_down, + NodeStatus.unbacked: nodes_down, + NodeStatus.unchanged: lambda: None, + NodeStatus.unknown: nodes_unknown, + }, + status, + ) + update() + + +def delete_placement_groups(placement_groups): + def delete_placement_request(pg_name, region): + return compute.resourcePolicies().delete( + project=lkp.project, region=region, resourcePolicy=pg_name + ) + + requests = { + pg.name: delete_placement_request(pg["name"], util.trim_self_link(pg["region"])) + for pg in placement_groups + } + + def swallow_err(_: str) -> None: + pass + + done, failed = batch_execute(requests, log_err=swallow_err) + if failed: + # Filter out resourceInUseByAnotherResource errors , they are expected to happen + def ignore_err(e) -> bool: + return 
"resourceInUseByAnotherResource" in str(e) + + failures = [f"{n}: {e}" for n, (_, e) in failed.items() if not ignore_err(e)] + if failures: + log.error(f"some placement groups failed to delete: {failures}") + log.info( + f"deleted {len(done)} of {len(placement_groups)} placement groups ({to_hostlist_fast(done.keys())})" + ) + + +def sync_placement_groups(): + """Delete placement policies that are for jobs that have completed/terminated""" + keep_states = frozenset( + [ + "RUNNING", + "CONFIGURING", + "STOPPED", + "SUSPENDED", + "COMPLETING", + ] + ) + + if lkp.instance_role_safe != "controller": + return + + keep_jobs = { + str(job["job_id"]) + for job in json.loads(run(f"{lkp.scontrol} show jobs --json").stdout)["jobs"] + if "job_state" in job and set(job["job_state"]) & keep_states + } + keep_jobs.add("0") # Job 0 is a placeholder for static node placement + + fields = "items.regions.resourcePolicies,nextPageToken" + flt = f"name={lkp.cfg.slurm_cluster_name}-*" + act = compute.resourcePolicies() + op = act.aggregatedList(project=lkp.project, fields=fields, filter=flt) + placement_groups = {} + pg_regex = re.compile( + rf"{lkp.cfg.slurm_cluster_name}-(?P[^\s\-]+)-(?P\d+)-(?P\d+)" + ) + while op is not None: + result = ensure_execute(op) + # merge placement group info from API and job_id,partition,index parsed from the name + pgs = ( + NSDict({**pg, **pg_regex.match(pg["name"]).groupdict()}) + for pg in chain.from_iterable( + item["resourcePolicies"] + for item in result.get("items", {}).values() + if item + ) + if pg_regex.match(pg["name"]) is not None + ) + placement_groups.update( + {pg["name"]: pg for pg in pgs if pg.get("job_id") not in keep_jobs} + ) + op = act.aggregatedList_next(op, result) + + if len(placement_groups) > 0: + delete_placement_groups(list(placement_groups.values())) + + +def sync_slurm(): + if lkp.instance_role_safe != "controller": + return + + compute_instances = [ + name for name, inst in lkp.instances().items() if inst.role == "compute" + ] + slurm_nodes = list(lkp.slurm_nodes().keys()) + + all_nodes = list( + set( + chain( + compute_instances, + slurm_nodes, + ) + ) + ) + log.debug( + f"reconciling {len(compute_instances)} ({len(all_nodes)-len(compute_instances)}) GCP instances and {len(slurm_nodes)} Slurm nodes ({len(all_nodes)-len(slurm_nodes)})." + ) + node_statuses = { + k: list(v) for k, v in util.groupby_unsorted(all_nodes, find_node_status) + } + if log.isEnabledFor(logging.DEBUG): + status_nodelist = { + status.name: to_hostlist_fast(nodes) + for status, nodes in node_statuses.items() + } + log.debug(f"node statuses: \n{yaml.safe_dump(status_nodelist).rstrip()}") + + for status, nodes in node_statuses.items(): + do_node_update(status, nodes) + + +def read_hash(filename): + filename = Path(filename) + if not filename.exists(): + return None + with open(filename, "r", encoding="utf-8") as file: + return file.readline() + + +def save_hash(filename, hash): + with open(filename, "w+", encoding="utf-8") as file: + file.write(hash) + + +def reconfigure_slurm(): + CONFIG_HASH = Path("/slurm/scripts/.config.hash") + update_msg = "*** slurm configuration was updated ***" + cfg_old = load_config_file(CONFIG_FILE) + + if cfg_old.hybrid: + # terraform handles generating the config.yaml, don't do it here + return + + hash_new: hashlib.md5 = fetch_config_yaml_md5() + hash_old: str = read_hash(CONFIG_HASH) + + if hash_new.hexdigest() != hash_old: + log.debug("Delta detected. 
Reconfiguring Slurm now.") + cfg_new = fetch_config_yaml() + save_hash(CONFIG_HASH, hash_new.hexdigest()) + save_config(cfg_new, CONFIG_FILE) + cfg_new = load_config_file(CONFIG_FILE) + lkp = Lookup(cfg_new) + util.lkp = lkp + if lkp.instance_role_safe == "controller": + install_slurm_conf(lkp) + install_slurmdbd_conf(lkp) + gen_cloud_conf(lkp) + gen_cloud_gres_conf(lkp) + gen_topology_conf(lkp) + install_gres_conf(lkp) + install_cgroup_conf(lkp) + install_topology_conf(lkp) + log.info("Restarting slurmctld to make changes take effect.") + try: + run("sudo systemctl restart slurmctld.service", check=False) + run(f"{lkp.scontrol} reconfigure", timeout=30) + except Exception as e: + log.error(e) + util.run(f"wall '{update_msg}'", timeout=30) + log.debug("Done.") + elif lkp.instance_role_safe in ["compute", "login"]: + log.info("Restarting slurmd to make changes take effect.") + run("systemctl restart slurmd") + util.run(f"wall '{update_msg}'", timeout=30) + log.debug("Done.") + + +def main(): + try: + reconfigure_slurm() + except Exception: + log.exception("failed to reconfigure slurm") + + try: + sync_slurm() + except Exception: + log.exception("failed to sync instances") + + try: + sync_placement_groups() + except Exception: + log.exception("failed to sync placement groups") + + try: + install_custom_scripts(check_hash=True) + except Exception: + log.exception("failed to sync custom scripts") + + +parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter +) +parser.add_argument( + "--debug", + "-d", + dest="loglevel", + action="store_const", + const=logging.DEBUG, + default=logging.INFO, + help="Enable debugging output", +) +parser.add_argument( + "--trace-api", + "-t", + action="store_true", + help="Enable detailed api request output", +) +parser.add_argument( + "--force", + "-f", + action="store_true", + help="Force tasks to run, regardless of lock.", +) + +if __name__ == "__main__": + args = parser.parse_args() + util.chown_slurm(LOGFILE, mode=0o600) + + if cfg.enable_debug_logging: + args.loglevel = logging.DEBUG + if args.trace_api: + cfg.extra_logging_flags = list(cfg.extra_logging_flags) + cfg.extra_logging_flags.append("trace_api") + util.config_root_logger(filename, level=args.loglevel, logfile=LOGFILE) + + sys.excepthook = util.handle_exception + + # only run one instance at a time unless --force + if args.force: + main() + else: + pid_file = (Path("/tmp") / Path(__file__).name).with_suffix(".pid") + with pid_file.open("w") as fp: + try: + fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB) + main() + except BlockingIOError: + sys.exit(0) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/startup.sh b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/startup.sh new file mode 100755 index 0000000000..a5ee3bc413 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/startup.sh @@ -0,0 +1,197 @@ +#!/bin/bash +# Copyright (C) SchedMD LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +SLURM_DIR=/slurm +FLAGFILE=$SLURM_DIR/slurm_configured_do_not_remove +SCRIPTS_DIR=$SLURM_DIR/scripts +if [[ -z "$HOME" ]]; then + # google-startup-scripts.service lacks environment variables + HOME="$(getent passwd "$(whoami)" | cut -d: -f6)" +fi + +METADATA_SERVER="metadata.google.internal" +URL="http://$METADATA_SERVER/computeMetadata/v1" +HEADER="Metadata-Flavor:Google" +CURL="curl -sS --fail --header $HEADER" +UNIVERSE_DOMAIN="$($CURL $URL/instance/attributes/universe_domain)" +STORAGE_CMD="CLOUDSDK_CORE_UNIVERSE_DOMAIN=$UNIVERSE_DOMAIN gcloud storage" + +function devel::zip() { + local BUCKET="$($CURL $URL/instance/attributes/slurm_bucket_path)" + if [[ -z $BUCKET ]]; then + echo "ERROR: No bucket path detected." + return 1 + fi + + local SLURM_ZIP_URL="$BUCKET/slurm-gcp-devel.zip" + local SLURM_ZIP_FILE="$HOME/slurm-gcp-devel.zip" + local SLURM_ZIP_DIR="$HOME/slurm-gcp-devel" + eval $(bash -c "$STORAGE_CMD cp $SLURM_ZIP_URL $SLURM_ZIP_FILE") + if ! [[ -f "$SLURM_ZIP_FILE" ]]; then + echo "INFO: No development files downloaded. Skipping." + return 0 + fi + unzip -o "$SLURM_ZIP_FILE" -d "$SCRIPTS_DIR" + rm -rf "$SLURM_ZIP_FILE" "$SLURM_ZIP_DIR" # Clean up + echo "INFO: Finished inflating '$SLURM_ZIP_FILE'." + + #temporary hack to not make the script fail on TPU vm + chown slurm:slurm -R "$SCRIPTS_DIR" || true + chmod 700 -R "$SCRIPTS_DIR" + echo "INFO: Updated permissions of files in '$SCRIPTS_DIR'." +} + +function config() { + local BUCKET="$($CURL $URL/instance/attributes/slurm_bucket_path)" + if [[ -z $BUCKET ]]; then + echo "ERROR: No bucket path detected." + return 1 + fi + + local SLURM_CONFIG_URL="$BUCKET/config.yaml" + local SLURM_CONFIG_FILE="$SCRIPTS_DIR/config.yaml" + eval $(bash -c "$STORAGE_CMD cp $SLURM_CONFIG_URL $SLURM_CONFIG_FILE") + if ! [[ -f "$SLURM_CONFIG_FILE" ]]; then + echo "INFO: No config file downloaded. Skipping." + return 0 + fi + + #temporary hack to not make the script fail on TPU vm + chown slurm:slurm -R "$SLURM_CONFIG_FILE" || true + chmod 600 -R "$SLURM_CONFIG_FILE" + echo "INFO: Updated permissions of '$SLURM_CONFIG_FILE'." +} + +PING_METADATA="ping -q -w1 -c1 $METADATA_SERVER" +echo "INFO: $PING_METADATA" +for i in $(seq 10); do + [ $i -gt 1 ] && sleep 5; + $PING_METADATA > /dev/null && s=0 && break || s=$?; + echo "ERROR: Failed to contact metadata server, will retry" +done +if [ $s -ne 0 ]; then + echo "ERROR: Unable to contact metadata server, aborting" + wall -n '*** Slurm setup failed in the startup script! 
see `journalctl -u google-startup-scripts` ***' + exit 1 +else + echo "INFO: Successfully contacted metadata server" +fi + +GOOGLE_DNS=8.8.8.8 +PING_GOOGLE="ping -q -w1 -c1 $GOOGLE_DNS" +echo "INFO: $PING_GOOGLE" +for i in $(seq 5); do + [ $i -gt 1 ] && sleep 2; + $PING_GOOGLE > /dev/null && s=0 && break || s=$?; + echo "failed to ping Google DNS, will retry" +done +if [ $s -ne 0 ]; then + echo "WARNING: No internet access detected" +else + echo "INFO: Internet access detected" +fi + +mkdir -p $SCRIPTS_DIR + +SETUP_SCRIPT_FILE=$SCRIPTS_DIR/setup.py +UTIL_SCRIPT_FILE=$SCRIPTS_DIR/util.py + +devel::zip +config + +if [ -f $FLAGFILE ]; then + echo "WARNING: Slurm was previously configured, quitting" + exit 0 +fi +touch $FLAGFILE + +function tpu_setup { + #allow the following command to fail, as this attribute does not exist for regular nodes + docker_image=$($CURL $URL/instance/attributes/slurm_docker_image 2> /dev/null || true) + if [ -z $docker_image ]; then #Not a tpu node, do not do anything + return + fi + if [ "$OS_ENV" == "slurm_container" ]; then #Already inside the slurm container, we should continue starting + return + fi + + #given a input_string like "WORKER_0:Joseph;WORKER_1:richard;WORKER_2:edward;WORKER_3:john" and a number 1, this function will print richard + parse_metadata() { + local number=$1 + local input_string=$2 + local word=$(echo "$input_string" | awk -v n="$number" -F ':|;' '{ for (i = 1; i <= NF; i+=2) if ($(i) == "WORKER_"n) print $(i+1) }') + echo "$word" + } + + input_string=$($CURL $URL/instance/attributes/slurm_names) + worker_id=$($CURL $URL/instance/attributes/tpu-env | awk '/WORKER_ID/ {print $2}' | tr -d \') + real_name=$(parse_metadata $worker_id $input_string) + + #Prepare to docker pull with gcloud + mkdir -p /root/.docker + cat << EOF > /root/.docker/config.json +{ + "credHelpers": { + "gcr.io": "gcloud", + "us-docker.pkg.dev": "gcloud" + } +} +EOF + #cgroup detection + CGV=1 + CGROUP_FLAGS="-v /sys/fs/cgroup:/sys/fs/cgroup:rw" + if [ -f /sys/fs/cgroup/cgroup.controllers ]; then #CGV2 + CGV=2 + fi + if [ $CGV == 2 ]; then + CGROUP_FLAGS="--cgroup-parent=docker.slice --cgroupns=private --tmpfs /run --tmpfs /run/lock --tmpfs /tmp" + if [ ! 
-f /etc/systemd/system/docker.slice ]; then #In case that there is no slice prepared for hosting the containers create it + printf "[Unit]\nDescription=docker slice\nBefore=slices.target\n[Slice]\nCPUAccounting=true\nMemoryAccounting=true" > /etc/systemd/system/docker.slice + systemctl start docker.slice + fi + fi + #for the moment always use --privileged, as systemd might not work properly otherwise + TPU_FLAGS="--privileged" + # TPU_FLAGS="--cap-add SYS_RESOURCE --device /dev/accel0 --device /dev/accel1 --device /dev/accel2 --device /dev/accel3" + # if [ $CGV == 2 ]; then #In case that we are in CGV2 for systemd to work correctly for the moment we go with privileged + # TPU_FLAGS="--privileged" + # fi + + docker run -d $CGROUP_FLAGS $TPU_FLAGS --net=host --name=slurmd --hostname=$real_name --entrypoint=/usr/bin/systemd --restart unless-stopped $docker_image + exit 0 +} + +tpu_setup #will do nothing for normal nodes or the container spawned inside TPU + +function fetch_feature { + if slurmd_feature="$($CURL $URL/instance/attributes/slurmd_feature)"; then + echo "$slurmd_feature" + else + echo "" + fi +} +SLURMD_FEATURE="$(fetch_feature)" + +echo "INFO: Running python cluster setup script" +chmod +x $SETUP_SCRIPT_FILE +python3 $SCRIPTS_DIR/util.py +if [[ -n "$SLURMD_FEATURE" ]]; then + echo "INFO: Running dynamic node setup." + exec $SETUP_SCRIPT_FILE --slurmd-feature="$SLURMD_FEATURE" +else + exec $SETUP_SCRIPT_FILE +fi diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py new file mode 100755 index 0000000000..af70d97679 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 + +# Copyright (C) SchedMD LLC. +# Copyright 2015 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +import argparse +import logging +import sys +from pathlib import Path + +import util +from util import ( + groupby_unsorted, + log_api_request, + batch_execute, + to_hostlist_fast, + wait_for_operations, + separate, + execute_with_futures, +) +from util import lkp, cfg, compute, TPU + +import slurm_gcp_plugins + +filename = Path(__file__).name +LOGFILE = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log") +log = logging.getLogger(filename) + +TOT_REQ_CNT = 1000 + + +def truncate_iter(iterable, max_count): + end = "..." 
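+    # Yields at most `max_count` items from `iterable`; once `max_count` items are
+    # reached, the final yielded element is replaced by "..." as a truncation
+    # marker, e.g. truncate_iter(range(5), 3) yields 0, 1, "...".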
+ _iter = iter(iterable) + for i, el in enumerate(_iter, start=1): + if i >= max_count: + yield end + break + yield el + + +def delete_instance_request(instance, project=None, zone=None): + project = project or lkp.project + request = compute.instances().delete( + project=project, + zone=(zone or lkp.instance(instance).zone), + instance=instance, + ) + log_api_request(request) + return request + + +def stop_tpu(data): + tpu_nodeset = data["nodeset"] + node = data["node"] + tpu = data["tpu"] + if tpu_nodeset.preserve_tpu and tpu.vmcount == 1: + log.info(f"stopping node {node}") + if tpu.stop_node(node): + return + log.error("Error stopping node {node} will delete instead") + log.info(f"deleting node {node}") + if not tpu.delete_node(node): + log.error("Error deleting node {node}") + + +def delete_tpu_instances(instances): + stop_data = [] + for prefix, nodes in util.groupby_unsorted(instances, lkp.node_prefix): + log.info(f"Deleting TPU nodes from prefix {prefix}") + lnodes = list(nodes) + tpu_nodeset = lkp.node_nodeset(lnodes[0]) + tpu = TPU(tpu_nodeset) + stop_data.extend( + [{"tpu": tpu, "node": node, "nodeset": tpu_nodeset} for node in lnodes] + ) + execute_with_futures(stop_tpu, stop_data) + + +def delete_instances(instances): + """delete instances individually""" + invalid, valid = separate(lambda inst: bool(lkp.instance(inst)), instances) + if len(invalid) > 0: + log.debug("instances do not exist: {}".format(",".join(invalid))) + if len(valid) == 0: + log.debug("No instances to delete") + return + + requests = {inst: delete_instance_request(inst) for inst in valid} + + log.info(f"delete {len(valid)} instances ({to_hostlist_fast(valid)})") + done, failed = batch_execute(requests) + if failed: + for err, nodes in groupby_unsorted(lambda n: failed[n][1], failed.keys()): + log.error(f"instances failed to delete: {err} ({to_hostlist_fast(nodes)})") + wait_for_operations(done.values()) + # TODO do we need to check each operation for success? 
That is a lot more API calls + log.info(f"deleted {len(done)} instances {to_hostlist_fast(done.keys())}") + + +def suspend_nodes(nodes: List[str]) -> None: + tpu_nodes, other_nodes = [], [] + for node in nodes[:]: + if lkp.node_is_tpu(node): + tpu_nodes.append(node) + else: + other_nodes.append(node) + + delete_instances(other_nodes) + delete_tpu_instances(tpu_nodes) + + +def main(nodelist): + """main called when run as script""" + log.debug(f"SuspendProgram {nodelist}") + + # Filter out nodes not in config.yaml + other_nodes, pm_nodes = separate( + lkp.is_power_managed_node, util.to_hostnames(nodelist) + ) + if other_nodes: + log.debug( + f"Ignoring non-power-managed nodes '{to_hostlist_fast(other_nodes)}' from '{nodelist}'" + ) + if pm_nodes: + log.debug(f"Suspending nodes '{to_hostlist_fast(pm_nodes)}' from '{nodelist}'") + else: + log.debug("No cloud nodes to suspend") + return + + log.info(f"suspend {nodelist}") + if lkp.cfg.enable_slurm_gcp_plugins: + slurm_gcp_plugins.pre_main_suspend_nodes(lkp=lkp, nodelist=nodelist) + suspend_nodes(pm_nodes) + + +parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter +) +parser.add_argument("nodelist", help="list of nodes to suspend") +parser.add_argument( + "--debug", + "-d", + dest="loglevel", + action="store_const", + const=logging.DEBUG, + default=logging.INFO, + help="Enable debugging output", +) +parser.add_argument( + "--trace-api", + "-t", + action="store_true", + help="Enable detailed api request output", +) + + +if __name__ == "__main__": + args = parser.parse_args() + + if cfg.enable_debug_logging: + args.loglevel = logging.DEBUG + if args.trace_api: + cfg.extra_logging_flags = list(cfg.extra_logging_flags) + cfg.extra_logging_flags.append("trace_api") + util.chown_slurm(LOGFILE, mode=0o600) + util.config_root_logger(filename, level=args.loglevel, logfile=LOGFILE) + log = logging.getLogger(Path(__file__).name) + sys.excepthook = util.handle_exception + + main(args.nodelist) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/README.md new file mode 100644 index 0000000000..8452813f25 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/README.md @@ -0,0 +1,6 @@ +# Unit tests + +```sh +# cwd is scripts/tests +$ pytest -W ignore::DeprecationWarning +``` diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py new file mode 100644 index 0000000000..fc1f249cf0 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py @@ -0,0 +1,120 @@ +from typing import Optional +import mock +import sys + +if ".." 
not in sys.path: + sys.path.append("..") # TODO: make this more robust +import util +import conf + +from dataclasses import dataclass, field +import tempfile + + +# TODO: use "real" classes once they are defined (instead of NSDict) +@dataclass +class TstNodeset: + nodeset_name: str + node_count_static: int = 0 + node_count_dynamic_max: int = 0 + + +@dataclass +class TstCfg: + slurm_cluster_name: str = "m22" + nodeset: dict[str, TstNodeset] = field(default_factory=dict) + nodeset_tpu: dict[str, TstNodeset] = field(default_factory=dict) + output_dir: Optional[str] = None + + +@dataclass +class TstTPU: # to prevent client initialization durint "TPU.__init__" + vmcount: int + + +def make_to_hostnames_mock(tbl: Optional[dict[str, list[str]]]): + tbl = tbl or {} + + def se(k: str) -> list[str]: + if k not in tbl: + raise AssertionError(f"to_hostnames mock: unexpected nodelist: '{k}'") + return tbl[k] + + return se + + +def test_gen_topology_conf_empty(): + cfg = TstCfg(output_dir=tempfile.mkdtemp()) + conf.gen_topology_conf(util.Lookup(cfg)) + assert ( + open(cfg.output_dir + "/cloud_topology.conf").read() + == """ +# Warning: +# This file is managed by a script. Manual modifications will be overwritten. + + +""" + ) + + +@mock.patch("util.TPU") +@mock.patch( + "util.to_hostnames", + side_effect=make_to_hostnames_mock( + { + "m22-bold-[0-3]": ["m22-bold-0", "m22-bold-1", "m22-bold-2", "m22-bold-3"], + "m22-bold-[4-8]": [ + "m22-bold-4", + "m22-bold-5", + "m22-bold-6", + "m22-bold-7", + "m22-bold-8", + ], + "m22-slim-[0-2]": ["m22-slim-0", "m22-slim-1", "m22-slim-2"], + } + ), +) +def test_gen_topology_conf(to_hostnames_mock, tpu_mock): + cfg = TstCfg( + nodeset_tpu={ + "a": TstNodeset("bold", node_count_static=4, node_count_dynamic_max=5), + "b": TstNodeset("slim", node_count_dynamic_max=3), + }, + nodeset={ + "c": TstNodeset("green", node_count_static=2, node_count_dynamic_max=3), + "d": TstNodeset("blue", node_count_static=7), + "e": TstNodeset("pink", node_count_dynamic_max=4), + }, + output_dir=tempfile.mkdtemp(), + ) + + def tpu_se(ns: TstNodeset) -> TstTPU: + if ns.nodeset_name == "bold": + return TstTPU(vmcount=3) + if ns.nodeset_name == "slim": + return TstTPU(vmcount=1) + raise AssertionError(f"unexpected TPU name: '{ns.nodeset_name}'") + + tpu_mock.side_effect = tpu_se + + conf.gen_topology_conf(util.Lookup(cfg)) + assert ( + open(cfg.output_dir + "/cloud_topology.conf").read() + == """ +# Warning: +# This file is managed by a script. Manual modifications will be overwritten. + +SwitchName=nodeset-root Switches=blue,green,pink +SwitchName=blue Nodes=m22-blue-[0-6] +SwitchName=green Nodes=m22-green-[0-4] +SwitchName=pink Nodes=m22-pink-[0-3] +SwitchName=nodeset_tpu-root Switches=bold,slim +SwitchName=bold Switches=bold-[0-3] +SwitchName=bold-0 Nodes=m22-bold-[0-2] +SwitchName=bold-1 Nodes=m22-bold-3 +SwitchName=bold-2 Nodes=m22-bold-[4-6] +SwitchName=bold-3 Nodes=m22-bold-[7-8] +SwitchName=slim Nodes=m22-slim-[0-2] + +""" + ) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py new file mode 100644 index 0000000000..9c3a03c210 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -0,0 +1,148 @@ +import sys +import pytest + +if ".." 
not in sys.path: + sys.path.append("..") # TODO: make this more robust +import util +from google.api_core.client_options import ClientOptions # noqa: E402 + +# Note: need to install pytest-mock + + +@pytest.mark.parametrize( + "name,expected", + [ + ( + "az-buka-23", + { + "cluster": "az", + "nodeset": "buka", + "node": "23", + "prefix": "az-buka", + "range": None, + "suffix": "23", + }, + ), + ( + "az-buka-xyzf", + { + "cluster": "az", + "nodeset": "buka", + "node": "xyzf", + "prefix": "az-buka", + "range": None, + "suffix": "xyzf", + }, + ), + ( + "az-buka-[2-3]", + { + "cluster": "az", + "nodeset": "buka", + "node": "[2-3]", + "prefix": "az-buka", + "range": "[2-3]", + "suffix": None, + }, + ), + ], +) +def test_node_desc(name, expected): + assert util.lkp._node_desc(name) == expected + + +@pytest.mark.parametrize( + "name", + [ + "az-buka", + ], +) +def test_node_desc_fail(name): + with pytest.raises(Exception): + util.lkp._node_desc(name) + + +@pytest.mark.parametrize( + "names,expected", + [ + ("pedro,pedro-1,pedro-2,pedro-01,pedro-02", "pedro,pedro-[1-2,01-02]"), + ("pedro,,pedro-1,,pedro-2", "pedro,pedro-[1-2]"), + ("pedro-8,pedro-9,pedro-10,pedro-11", "pedro-[8-9,10-11]"), + ("pedro-08,pedro-09,pedro-10,pedro-11", "pedro-[08-11]"), + ("pedro-08,pedro-09,pedro-8,pedro-9", "pedro-[8-9,08-09]"), + ("pedro-10,pedro-08,pedro-09,pedro-8,pedro-9", "pedro-[8-9,08-10]"), + ("pedro-8,pedro-9,juan-10,juan-11", "juan-[10-11],pedro-[8-9]"), + ("az,buki,vedi", "az,buki,vedi"), + ("a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12", "a[0-9,10-12]"), + ("a0,a2,a4,a6,a7,a8,a11,a12", "a[0,2,4,6-8,11-12]"), + ("seas7-0,seas7-1", "seas7-[0-1]"), + ], +) +def test_to_hostlist_fast(names, expected): + assert util.to_hostlist_fast(names.split(",")) == expected + + +@pytest.mark.parametrize( + "api,ep_ver,expected", + [ + ( + util.ApiEndpoint.BQ, + "v1", + ClientOptions( + api_endpoint="https://bq.googleapis.com/v1/", + universe_domain="googleapis.com", + ), + ), + ( + util.ApiEndpoint.COMPUTE, + "staging_v1", + ClientOptions( + api_endpoint="https://compute.googleapis.com/staging_v1/", + universe_domain="googleapis.com", + ), + ), + ( + util.ApiEndpoint.SECRET, + "v1", + ClientOptions( + api_endpoint="https://secret_manager.googleapis.com/v1/", + universe_domain="googleapis.com", + ), + ), + ( + util.ApiEndpoint.STORAGE, + "beta", + ClientOptions( + api_endpoint="https://storage.googleapis.com/beta/", + universe_domain="googleapis.com", + ), + ), + ( + util.ApiEndpoint.TPU, + "alpha", + ClientOptions( + api_endpoint="https://tpu.googleapis.com/alpha/", + universe_domain="googleapis.com", + ), + ), + ], +) +def test_create_client_options( + api: util.ApiEndpoint, ep_ver: str, expected: ClientOptions, mocker +): + ud_mock = mocker.patch("util.universe_domain") + ep_mock = mocker.patch("util.endpoint_version") + ud_mock.return_value = "googleapis.com" + ep_mock.return_value = ep_ver + co = util.create_client_options(api) + assert ( + co.api_endpoint == expected.api_endpoint + and co.universe_domain == expected.universe_domain + ) + ud_mock.return_value = None + ep_mock.return_value = None + co = util.create_client_options(api) + assert ( + co.api_endpoint != expected.api_endpoint + and co.universe_domain != expected.universe_domain + ) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py new file mode 100755 index 0000000000..c77bcd9932 --- /dev/null +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -0,0 +1,2083 @@ +#!/usr/bin/env python3 + +# Copyright (C) SchedMD LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Iterable, List, Tuple, Optional +import argparse +import base64 +import collections +import hashlib +import importlib.util +import inspect +import json +import logging +import logging.config +import math +import os +import re +import shelve +import shlex +import shutil +import socket +import subprocess +import sys +import tempfile +from enum import Enum +from collections import defaultdict, namedtuple +from concurrent.futures import ThreadPoolExecutor, as_completed +from contextlib import contextmanager +from functools import lru_cache, reduce, wraps +from itertools import chain, compress, islice +from pathlib import Path +from time import sleep, time + +import slurm_gcp_plugins + +required_modules = [ + ("googleapiclient", "google-api-python-client"), + ("requests", "requests"), + ("yaml", "yaml"), + ("addict", "addict"), + ("httplib2", "httplib2"), + ("google.cloud.tpu_v2", "google-cloud-tpu"), +] +missing_imports = False +can_tpu = True +for module, name in required_modules: + if importlib.util.find_spec(module) is None: + if module == "google.cloud.tpu_v2": + can_tpu = False + print( + f"WARNING: Missing Python module '{module} (pip:{name})', TPU support will not work." + ) + else: + missing_imports = True + print(f"ERROR: Missing Python module '{module} (pip:{name})'") +if missing_imports: + print("Aborting due to missing Python modules") + exit(1) + +import google.auth # noqa: E402 +from google.oauth2 import service_account # noqa: E402 +import googleapiclient.discovery # noqa: E402 +import google_auth_httplib2 # noqa: E402 +from googleapiclient.http import set_user_agent # noqa: E402 +from google.api_core.client_options import ClientOptions # noqa: E402 +import httplib2 # noqa: E402 + +if can_tpu: + from google.cloud import tpu_v2 as tpu # noqa: E402 +import google.api_core.exceptions as gExceptions # noqa: E402 + +from requests import get as get_url # noqa: E402 +from requests.exceptions import RequestException # noqa: E402 + +import yaml # noqa: E402 +from addict import Dict as NSDict # noqa: E402 + +optional_modules = [ + ("google.cloud.secretmanager", "google-cloud-secret-manager"), +] +for module, name in optional_modules: + if importlib.util.find_spec(module) is None: + print(f"WARNING: Missing Python module '{module}' (pip:{name}) ") + +USER_AGENT = "Slurm_GCP_Scripts/1.5 (GPN:SchedMD)" +ENV_CONFIG_YAML = os.getenv("SLURM_CONFIG_YAML") +if ENV_CONFIG_YAML: + CONFIG_FILE = Path(ENV_CONFIG_YAML) +else: + CONFIG_FILE = Path(__file__).with_name("config.yaml") +API_REQ_LIMIT = 2000 +URI_REGEX = r"[a-z]([-a-z0-9]*[a-z0-9])?" 
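+# Note (illustrative): CONFIG_FILE defaults to the config.yaml that sits next to
+# this script; exporting SLURM_CONFIG_YAML before this module is imported points
+# it at a different file instead, e.g.
+#
+#   SLURM_CONFIG_YAML=/tmp/test-config.yaml python3 -c "import util; print(util.CONFIG_FILE)"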
+ + +def mkdirp(path: Path) -> None: + path.mkdir(parents=True, exist_ok=True) + + +scripts_dir = next( + p for p in (Path(__file__).parent, Path("/slurm/scripts")) if p.is_dir() +) + +# readily available compute api handle +compute = None +# slurm-gcp config object, could be empty if not available +cfg = NSDict() +# caching Lookup object +lkp = None + +# load all directories as Paths into a dict-like namespace +dirs = NSDict( + { + n: Path(p) + for n, p in dict.items( + { + "home": "/home", + "apps": "/opt/apps", + "slurm": "/slurm", + "scripts": scripts_dir, + "custom_scripts": "/slurm/custom_scripts", + "munge": "/etc/munge", + "secdisk": "/mnt/disks/sec", + "log": "/var/log/slurm", + } + ) + } +) + +slurmdirs = NSDict( + { + n: Path(p) + for n, p in dict.items( + { + "prefix": "/usr/local", + "etc": "/usr/local/etc/slurm", + "state": "/var/spool/slurm", + } + ) + } +) + + +yaml.SafeDumper.yaml_representers[ + None +] = lambda self, data: yaml.representer.SafeRepresenter.represent_str(self, str(data)) + + +class ApiEndpoint(Enum): + COMPUTE = "compute" + BQ = "bq" + STORAGE = "storage" + TPU = "tpu" + SECRET = "secret_manager" + + +@lru_cache(maxsize=1) +def default_credentials(): + return google.auth.default()[0] + + +@lru_cache(maxsize=1) +def authentication_project(): + return google.auth.default()[1] + + +def universe_domain() -> str: + return instance_metadata("attributes/universe_domain") + + +def endpoint_version(api: ApiEndpoint) -> Optional[str]: + if api and api.value in lkp.endpoint_versions: + return lkp.endpoint_versions[api.value] + return None + + +@lru_cache(maxsize=1) +def get_credentials() -> Optional[service_account.Credentials]: + """Get credentials for service account""" + key_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") + if key_path is not None: + credentials = service_account.Credentials.from_service_account_file( + key_path, scopes=[f"https://www.{universe_domain()}/auth/cloud-platform"] + ) + else: + credentials = default_credentials() + + return credentials + + +def create_client_options(api: ApiEndpoint = None) -> ClientOptions: + """Create client options for cloud endpoints""" + ep = None + ver = endpoint_version(api) + ud = universe_domain() + if ver: + ep = f"https://{api.value}.{ud}/{ver}/" + log.debug( + f"Using universe domain: {ud}. 
" + + ( + f"For API: {api.value} using API endpoint: " f"{ep if ep else 'default'}" + if api + else "" + ) + ) + return ClientOptions( + universe_domain=ud, + api_endpoint=ep, + ) + + +class LogFormatter(logging.Formatter): + """adds logging flags to the levelname in log records""" + + def format(self, record): + new_fmt = self._fmt + flag = getattr(record, "flag", None) + if flag is not None: + start, level, end = new_fmt.partition("%(levelname)s") + if level: + new_fmt = f"{start}{level}(%(flag)s){end}" + # insert function name if record level is DEBUG + if record.levelno < logging.INFO: + prefix, msg, suffix = new_fmt.partition("%(message)s") + new_fmt = f"{prefix}%(funcName)s: {msg}{suffix}" + self._style._fmt = new_fmt + return super().format(record) + + +class FlagLogAdapter(logging.LoggerAdapter): + """creates log adapters that add a flag to the log record, + allowing it to be filtered""" + + def __init__(self, logger, flag, extra=None): + if extra is None: + extra = {} + self.flag = flag + super().__init__(logger, extra) + + @property + def enabled(self): + return cfg.extra_logging_flags.get(self.flag, False) + + def process(self, msg, kwargs): + extra = kwargs.setdefault("extra", {}) + extra.update(self.extra) + extra["flag"] = self.flag + return msg, kwargs + + +logging.basicConfig(level=logging.INFO, stream=sys.stdout) +log = logging.getLogger(__name__) +logging_flags = [ + "trace_api", + "subproc", + "hostlists", +] +log_trace_api = FlagLogAdapter(log, "trace_api") +log_subproc = FlagLogAdapter(log, "subproc") +log_hostlists = FlagLogAdapter(log, "hostlists") + + +def access_secret_version(project_id, secret_id, version_id="latest"): + """ + Access the payload for the given secret version if one exists. The version + can be a version number as a string (e.g. "5") or an alias (e.g. "latest"). + """ + from google.cloud import secretmanager + from google.api_core import exceptions + + co = create_client_options(ApiEndpoint.SECRET) + client = secretmanager.SecretManagerServiceClient(client_options=co) + name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}" + try: + response = client.access_secret_version(request={"name": name}) + log.debug(f"Secret '{name}' was found.") + payload = response.payload.data.decode("UTF-8") + except exceptions.NotFound: + log.debug(f"Secret '{name}' was not found!") + payload = None + + return payload + + +def parse_self_link(self_link: str): + """Parse a selfLink url, extracting all useful values + https://.../v1/projects//regions//... + {'project': , 'region': , ...} + can also extract zone, instance (name), image, etc + """ + link_patt = re.compile(r"(?P[^\/\s]+)s\/(?P[^\s\/]+)") + return NSDict(link_patt.findall(self_link)) + + +def parse_bucket_uri(uri: str): + """ + Parse a bucket url + E.g. gs:/// + """ + pattern = re.compile(r"gs://(?P[^/\s]+)/(?P([^/\s]+)(/[^/\s]+)*)") + matches = pattern.match(uri) + return matches.group("bucket"), matches.group("path") + + +def trim_self_link(link: str): + """get resource name from self link url, eg. 
+ https://.../v1/projects//regions/ + -> + """ + try: + return link[link.rindex("/") + 1 :] + except ValueError: + raise Exception(f"'/' not found, not a self link: '{link}' ") + + +def execute_with_futures(func, seq): + with ThreadPoolExecutor() as exe: + futures = [] + for i in seq: + future = exe.submit(func, i) + futures.append(future) + for future in as_completed(futures): + result = future.exception() + if result is not None: + raise result + + +def map_with_futures(func, seq): + with ThreadPoolExecutor() as exe: + futures = [] + for i in seq: + future = exe.submit(func, i) + futures.append(future) + for future in futures: + # Will be result or raise Exception + res = None + try: + res = future.result() + except Exception as e: + res = e + yield res + + +def blob_get(file, project=None): + from google.cloud import storage + + if project is None: + project = lkp.project + uri = instance_metadata("attributes/slurm_bucket_path") + bucket_name, path = parse_bucket_uri(uri) + blob_name = f"{path}/{file}" + co = create_client_options(ApiEndpoint.STORAGE) + storage_client = storage.Client(project=project, client_options=co) + return storage_client.get_bucket(bucket_name).blob(blob_name) + + +def blob_list(prefix="", delimiter=None, project=None): + from google.cloud import storage + + if project is None: + project = lkp.project + uri = instance_metadata("attributes/slurm_bucket_path") + bucket_name, path = parse_bucket_uri(uri) + blob_prefix = f"{path}/{prefix}" + co = create_client_options(ApiEndpoint.STORAGE) + storage_client = storage.Client(project=project, client_options=co) + # Note: The call returns a response only when the iterator is consumed. + blobs = storage_client.list_blobs( + bucket_name, prefix=blob_prefix, delimiter=delimiter + ) + return [blob for blob in blobs] + + +def _hash_file(fullpath): + with open(fullpath, "rb") as f: + file_hash = hashlib.md5() + chunk = f.read(8192) + while chunk: + file_hash.update(chunk) + chunk = f.read(8192) + return base64.b64encode(file_hash.digest()).decode("utf-8") + + +def install_custom_scripts(check_hash=False): + """download custom scripts from gcs bucket""" + + compute_tokens = ["compute", "prolog", "epilog"] + if lkp.instance_role == "compute": + try: + compute_tokens.append(f"nodeset-{lkp.node_nodeset_name()}") + except Exception as e: + log.error(f"Failed to lookup nodeset: {e}") + + prefix_tokens = dict.get( + { + "login": ["login"], + "compute": compute_tokens, + "controller": ["controller", "prolog", "epilog"], + }, + lkp.instance_role, + [], + ) + prefixes = [f"slurm-{tok}-script" for tok in prefix_tokens] + blobs = list(chain.from_iterable(blob_list(prefix=p) for p in prefixes)) + + script_pattern = re.compile(r"slurm-(?P\S+)-script-(?P\S+)") + for blob in blobs: + m = script_pattern.match(Path(blob.name).name) + if not m: + log.warning(f"found blob that doesn't match expected pattern: {blob.name}") + continue + path_parts = m["path"].split("-") + path_parts[0] += ".d" + stem, _, ext = m["name"].rpartition("_") + filename = ".".join((stem, ext)) + + path = Path(*path_parts, filename) + fullpath = (dirs.custom_scripts / path).resolve() + mkdirp(fullpath.parent) + + for par in path.parents: + chown_slurm(dirs.custom_scripts / par) + need_update = True + if check_hash and fullpath.exists(): + need_update = _hash_file(fullpath) != blob.md5_hash + if need_update: + log.info(f"installing custom script: {path} from {blob.name}") + with fullpath.open("wb") as f: + blob.download_to_file(f) + chown_slurm(fullpath, mode=0o755) + + +def 
reservation_resource_policies(reservation): + """ + Inspects reservation object, returns list of resource policies names. + Converts policy URLs to names, e.g.: + projects/111111/regions/us-central1/resourcePolicies/zebra -> zebra + """ + return [u.split("/")[-1] for u in reservation.get("resourcePolicies", {}).values()] + + +def compute_service(credentials=None, user_agent=USER_AGENT, version="beta"): + """Make thread-safe compute service handle + creates a new Http for each request + """ + + credentials = get_credentials() + + def build_request(http, *args, **kwargs): + new_http = httplib2.Http() + if user_agent is not None: + new_http = set_user_agent(new_http, user_agent) + if credentials is not None: + new_http = google_auth_httplib2.AuthorizedHttp(credentials, http=new_http) + return googleapiclient.http.HttpRequest(new_http, *args, **kwargs) + + ver = endpoint_version(ApiEndpoint.COMPUTE) + disc_url = googleapiclient.discovery.DISCOVERY_URI + if ver: + version = ver + disc_url = disc_url.replace("googleapis.com", universe_domain()) + + log.debug(f"Using version={version} of Google Compute Engine API") + return googleapiclient.discovery.build( + "compute", + version, + requestBuilder=build_request, + credentials=credentials, + discoveryServiceUrl=disc_url, + ) + + +def load_config_data(config): + """load dict-like data into a config object""" + cfg = NSDict(config) + if not cfg.slurm_log_dir: + cfg.slurm_log_dir = dirs.log + if not cfg.slurm_bin_dir: + cfg.slurm_bin_dir = slurmdirs.prefix / "bin" + if not cfg.slurm_control_host: + cfg.slurm_control_host = f"{cfg.slurm_cluster_name}-controller" + if not cfg.slurm_control_host_port: + cfg.slurm_control_host_port = "6820-6830" + if not cfg.munge_mount: + # NOTE: should only happen with cloud controller + cfg.munge_mount = NSDict( + { + "server_ip": cfg.slurm_control_addr or cfg.slurm_control_host, + "remote_mount": "/etc/munge", + "fs_type": "nfs", + "mount_options": "defaults,hard,intr,_netdev", + } + ) + + if not cfg.enable_debug_logging and isinstance(cfg.enable_debug_logging, NSDict): + cfg.enable_debug_logging = False + cfg.extra_logging_flags = NSDict( + {flag: cfg.extra_logging_flags.get(flag, False) for flag in logging_flags} + ) + return cfg + + +def new_config(config): + """initialize a new config object + necessary defaults are handled here + """ + cfg = load_config_data(config) + + network_storage_iter = filter( + None, + ( + *cfg.network_storage, + *cfg.login_network_storage, + *chain.from_iterable(ns.network_storage for ns in cfg.nodeset.values()), + *chain.from_iterable(ns.network_storage for ns in cfg.nodeset_dyn.values()), + *chain.from_iterable(ns.network_storage for ns in cfg.nodeset_tpu.values()), + ), + ) + for netstore in network_storage_iter: + if netstore != "gcsfuse" and ( + netstore.server_ip is None or netstore.server_ip == "$controller" + ): + netstore.server_ip = cfg.slurm_control_host + return cfg + + +def fetch_config_yaml(): + """Fetch config.yaml from bucket""" + config_yaml = blob_get("config.yaml").download_as_text() + cfg = new_config(yaml.safe_load(config_yaml)) + return cfg + + +def fetch_config_yaml_md5(): + """Fetch config.yaml blob md5 from bucket""" + import hashlib + + blob = blob_get("config.yaml") + blob.reload() # Populate blob with metadata + hash_str = str(blob.md5_hash).encode(encoding="utf-8") + return hashlib.md5(hash_str) + + +def load_config_file(path): + """load config from file""" + content = None + try: + content = yaml.safe_load(Path(path).read_text()) + except FileNotFoundError: 
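+        # A missing config.yaml is not fatal here: an empty NSDict is returned so
+        # callers (e.g. reconfigure_slurm in slurmsync.py) can fall back to
+        # fetching the configuration from the bucket.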
+ log.warning(f"config file not found: {path}") + return NSDict() + return load_config_data(content) + + +def save_config(cfg, path): + """save given config to file at path""" + Path(path).write_text(yaml.dump(cfg, Dumper=Dumper)) + + +def filter_logging_flags(record): + """logging filter for flags + if there are no flags, always pass. If there are flags, only pass if a flag + matches an enabled flag in cfg.extra_logging_flags""" + flag = getattr(record, "flag", None) + if flag is None: + return True + return cfg.extra_logging_flags.get(flag, False) + + +def owned_file_handler(filename): + """create file handler""" + if filename is None: + return None + chown_slurm(filename) + return logging.handlers.WatchedFileHandler(filename, delay=True) + + +def config_root_logger(caller_logger, level="DEBUG", stdout=True, logfile=None): + """configure the root logger, disabling all existing loggers""" + handlers = list(compress(("stdout_handler", "file_handler"), (stdout, logfile))) + + config = { + "version": 1, + "disable_existing_loggers": True, + "formatters": { + "standard": { + "()": LogFormatter, + "fmt": "%(levelname)s: %(message)s", + }, + "stamp": { + "()": LogFormatter, + "fmt": "%(asctime)s %(levelname)s: %(message)s", + }, + }, + "filters": { + "logging_flags": {"()": lambda: filter_logging_flags}, + }, + "handlers": { + "stdout_handler": { + "level": logging.DEBUG, + "formatter": "standard", + "class": "logging.StreamHandler", + "stream": sys.stdout, + "filters": ["logging_flags"], + }, + "file_handler": { + "()": owned_file_handler, + "level": logging.DEBUG, + "formatter": "stamp", + "filters": ["logging_flags"], + "filename": logfile, + }, + }, + "root": { + "handlers": handlers, + "level": level, + }, + } + if not logfile: + del config["handlers"]["file_handler"] + logging.config.dictConfig(config) + loggers = ( + __name__, + "resume", + "suspend", + "slurmsync", + "setup", + caller_logger, + ) + for logger in map(logging.getLogger, loggers): + logger.disabled = False + + +def log_api_request(request): + """log.trace info about a compute API request""" + if log_trace_api.enabled: + # output the whole request object as pretty yaml + # the body is nested json, so load it as well + rep = json.loads(request.to_json()) + if rep.get("body", None) is not None: + rep["body"] = json.loads(rep["body"]) + pretty_req = yaml.safe_dump(rep).rstrip() + # label log message with the calling function + log_trace_api.debug(f"{inspect.stack()[1].function}:\n{pretty_req}") + + +def handle_exception(exc_type, exc_value, exc_trace): + """log exceptions other than KeyboardInterrupt""" + # TODO does this work? 
+ if not issubclass(exc_type, KeyboardInterrupt): + log.exception("Fatal exception", exc_info=(exc_type, exc_value, exc_trace)) + sys.__excepthook__(exc_type, exc_value, exc_trace) + + +def run( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=False, + timeout=None, + check=True, + universal_newlines=True, + **kwargs, +): + """Wrapper for subprocess.run() with convenient defaults""" + if isinstance(args, list): + args = list(filter(lambda x: x is not None, args)) + args = " ".join(args) + if not shell and isinstance(args, str): + args = shlex.split(args) + log_subproc.debug(f"run: {args}") + result = subprocess.run( + args, + stdout=stdout, + stderr=stderr, + shell=shell, + timeout=timeout, + check=check, + universal_newlines=universal_newlines, + **kwargs, + ) + return result + + +def spawn(cmd, quiet=False, shell=False, **kwargs): + """nonblocking spawn of subprocess""" + if not quiet: + log_subproc.debug(f"spawn: {cmd}") + args = cmd if shell else shlex.split(cmd) + return subprocess.Popen(args, shell=shell, **kwargs) + + +def chown_slurm(path: Path, mode=None) -> None: + if path.exists(): + if mode: + path.chmod(mode) + else: + mkdirp(path.parent) + if mode: + path.touch(mode=mode) + else: + path.touch() + try: + shutil.chown(path, user="slurm", group="slurm") + except LookupError: + log.warning(f"User 'slurm' does not exist. Cannot 'chown slurm:slurm {path}'.") + except PermissionError: + log.warning(f"Not authorized to 'chown slurm:slurm {path}'.") + except Exception as err: + log.error(err) + + +@contextmanager +def cd(path): + """Change working directory for context""" + prev = Path.cwd() + os.chdir(path) + try: + yield + finally: + os.chdir(prev) + + +def cached_property(f): + return property(lru_cache()(f)) + + +def retry(max_retries: int, init_wait_time: float, warn_msg: str, exc_type: Exception): + """Retries functions that raises the exception exc_type. + Retry time is increased by a factor of two for every iteration. 
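+    Example (illustrative only; the wrapped function is arbitrary):
+        @retry(max_retries=3, init_wait_time=1.0,
+               warn_msg="transient failure", exc_type=OSError)
+        def flaky_operation():
+            ...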
+ + Args: + max_retries (int): Maximum number of retries + init_wait_time (float): Initial wait time in secs + warn_msg (str): Message to print during retries + exc_type (Exception): Exception type to check for + """ + + if max_retries <= 0: + raise ValueError("Incorrect value for max_retries, must be >= 1") + if init_wait_time <= 0.0: + raise ValueError("Invalid value for init_wait_time, must be > 0.0") + + def decorator(f): + @wraps(f) + def wrapper(*args, **kwargs): + retry = 0 + secs = init_wait_time + captured_exc = None + while retry < max_retries: + try: + return f(*args, **kwargs) + except exc_type as e: + captured_exc = e + log.warn(f"{warn_msg}, retrying in {secs}") + sleep(secs) + retry += 1 + secs *= 2 + raise captured_exc + + return wrapper + + return decorator + + +def separate(pred, coll): + """filter into 2 lists based on pred returning True or False + returns ([False], [True]) + """ + return reduce(lambda acc, el: acc[pred(el)].append(el) or acc, coll, ([], [])) + + +def chunked(iterable, n=API_REQ_LIMIT): + """group iterator into chunks of max size n""" + it = iter(iterable) + while True: + chunk = list(islice(it, n)) + if not chunk: + return + yield chunk + + +def groupby_unsorted(seq, key): + indices = defaultdict(list) + for i, el in enumerate(seq): + indices[key(el)].append(i) + for k, idxs in indices.items(): + yield k, (seq[i] for i in idxs) + + +@lru_cache(maxsize=32) +def find_ratio(a, n, s, r0=None): + """given the start (a), count (n), and sum (s), find the ratio required""" + if n == 2: + return s / a - 1 + an = a * n + if n == 1 or s == an: + return 1 + if r0 is None: + # we only need to know which side of 1 to guess, and the iteration will work + r0 = 1.1 if an < s else 0.9 + + # geometric sum formula + def f(r): + return a * (1 - r**n) / (1 - r) - s + + # derivative of f + def df(r): + rm1 = r - 1 + rn = r**n + return (a * (rn * (n * rm1 - r) + r)) / (r * rm1**2) + + MIN_DR = 0.0001 # negligible change + r = r0 + # print(f"r(0)={r0}") + MAX_TRIES = 64 + for i in range(1, MAX_TRIES + 1): + try: + dr = f(r) / df(r) + except ZeroDivisionError: + log.error(f"Failed to find ratio due to zero division! Returning r={r0}") + return r0 + r = r - dr + # print(f"r({i})={r}") + # if the change in r is small, we are close enough + if abs(dr) < MIN_DR: + break + else: + log.error(f"Could not find ratio after {MAX_TRIES}! Returning r={r0}") + return r0 + return r + + +def backoff_delay(start, timeout=None, ratio=None, count: int = 0): + """generates `count` waits starting at `start` + sum of waits is `timeout` or each one is `ratio` bigger than the last + the last wait is always 0""" + # timeout or ratio must be set but not both + assert (timeout is None) ^ (ratio is None) + assert ratio is None or ratio > 0 + assert timeout is None or timeout >= start + assert (count > 1 or timeout is not None) and isinstance(count, int) + assert start > 0 + + if count == 0: + # Equation for auto-count is tuned to have a max of + # ~int(timeout) counts with a start wait of <0.01. + # Increasing start wait decreases count eg. 
+ # backoff_delay(10, timeout=60) -> count = 5 + count = int( + (timeout / ((start + 0.05) ** (1 / 2)) + 2) // math.log(timeout + 2) + ) + + yield start + # if ratio is set: + # timeout = start * (1 - ratio**(count - 1)) / (1 - ratio) + if ratio is None: + ratio = find_ratio(start, count - 1, timeout) + + wait = start + # we have start and 0, so we only need to generate count - 2 + for _ in range(count - 2): + wait *= ratio + yield wait + yield 0 + return + + +ROOT_URL = "http://metadata.google.internal/computeMetadata/v1" + + +def get_metadata(path, root=ROOT_URL): + """Get metadata relative to metadata/computeMetadata/v1""" + HEADERS = {"Metadata-Flavor": "Google"} + url = f"{root}/{path}" + try: + resp = get_url(url, headers=HEADERS) + resp.raise_for_status() + return resp.text + except RequestException: + log.debug(f"metadata not found ({url})") + raise Exception(f"failed to get_metadata from {url}") + + +@lru_cache(maxsize=None) +def instance_metadata(path): + """Get instance metadata""" + return get_metadata(path, root=f"{ROOT_URL}/instance") + + +@lru_cache(maxsize=None) +def project_metadata(key): + """Get project metadata project/attributes/-""" + return get_metadata(key, root=f"{ROOT_URL}/project/attributes") + + +def bucket_blob_download(bucket_name, blob_name): + from google.cloud import storage + + co = create_client_options("storage") + storage_client = storage.Client(client_options=co) + bucket = storage_client.bucket(bucket_name) + blob = bucket.blob(blob_name) + contents = None + with tempfile.NamedTemporaryFile(mode="w+t") as tmp: + blob.download_to_filename(tmp.name) + with open(tmp.name, "r") as f: + contents = f.read() + return contents + + +def natural_sort(text): + def atoi(text): + return int(text) if text.isdigit() else text + + return [atoi(w) for w in re.split(r"(\d+)", text)] + + +# TODO: replace with to_hostlist_fast +def to_hostlist(nodenames) -> str: + """make hostlist from list of node names""" + # use tmp file because list could be large + tmp_file = tempfile.NamedTemporaryFile(mode="w+t", delete=False) + tmp_file.writelines("\n".join(sorted(nodenames, key=natural_sort))) + tmp_file.close() + + hostlist = run(f"{lkp.scontrol} show hostlist {tmp_file.name}").stdout.rstrip() + log_hostlists.debug(f"hostlist({len(nodenames)}): {hostlist}".format(hostlist)) + os.remove(tmp_file.name) + return hostlist + + +def to_hostlist_fast(names: Iterable[str]) -> str: + """ + Fast implementation of to_hostlist that doesn't invoke `scontrol` + IMPORTANT: + * Acts as `scontrol show hostlistsorted`, i.e. 
original order is not preserved + * Achieves worse compression than `to_hostlist` for some cases + """ + pref = defaultdict(list) + tokenizer = re.compile(r"^(.*?)(\d*)$") + for name in filter(None, names): + p, s = tokenizer.match(name).groups() + pref[p].append(s) + + def _compress_suffixes(ss: List[str]) -> List[str]: + cur, res = None, [] + + def cur_repr(): + nums, strs = cur + if nums[0] == nums[1]: + return strs[0] + return f"{strs[0]}-{strs[1]}" + + for s in sorted(ss, key=int): + n = int(s) + if cur is None: + cur = ((n, n), (s, s)) + continue + + nums, strs = cur + if n == nums[1] + 1: + cur = ((nums[0], n), (strs[0], s)) + else: + res.append(cur_repr()) + cur = ((n, n), (s, s)) + if cur: + res.append(cur_repr()) + return res + + res = [] + for p in sorted(pref.keys()): + sl = defaultdict(list) + for s in pref[p]: + sl[len(s)].append(s) + cs = [] + for ln in sorted(sl.keys()): + if ln == 0: + res.append(p) + else: + cs.extend(_compress_suffixes(sl[ln])) + if not cs: + continue + if len(cs) == 1 and "-" not in cs[0]: + res.append(f"{p}{cs[0]}") + else: + res.append(f"{p}[{','.join(cs)}]") + return ",".join(res) + + +def part_is_tpu(part): + """check if partition with name part contains a nodeset of type tpu""" + return len(lkp.cfg.partitions[part].partition_nodeset_tpu) > 0 + + +def get_vmcount_of_tpu_part(part): + res = 0 + for ns in lkp.cfg.partitions[part].partition_nodeset_tpu: + tpu_obj = TPU(lkp.cfg.nodeset_tpu[ns]) + if res == 0: + res = tpu_obj.vmcount + else: + if res != tpu_obj.vmcount: + # this should not happen, that in the same partition there are different vmcount nodesets + return -1 + return res + + +def to_hostnames(nodelist: str) -> List[str]: + """make list of hostnames from hostlist expression""" + if not nodelist: + return [] # avoid degenerate invocation of scontrol + if isinstance(nodelist, str): + hostlist = nodelist + else: + hostlist = ",".join(nodelist) + hostnames = run(f"{lkp.scontrol} show hostnames {hostlist}").stdout.splitlines() + log_hostlists.debug(f"hostnames({len(hostnames)}) from {hostlist}") + return hostnames + + +def retry_exception(exc): + """return true for exceptions that should always be retried""" + retry_errors = ( + "Rate Limit Exceeded", + "Quota Exceeded", + ) + return any(e in str(exc) for e in retry_errors) + + +def ensure_execute(request): + """Handle rate limits and socket time outs""" + + for retry, wait in enumerate(backoff_delay(0.5, timeout=10 * 60, count=20)): + try: + return request.execute() + except googleapiclient.errors.HttpError as e: + if retry_exception(e): + log.error(f"retry:{retry} '{e}'") + sleep(wait) + continue + raise + + except socket.timeout as e: + # socket timed out, try again + log.debug(e) + + except Exception as e: + log.error(e, exc_info=True) + raise + + break + + +def batch_execute(requests, retry_cb=None, log_err=log.error): + """execute list or dict as batch requests + retry if retry_cb returns true + """ + + compute = globals()["compute"] + BATCH_LIMIT = 1000 + if not isinstance(requests, dict): + requests = {str(k): v for k, v in enumerate(requests)} # rid generated here + done = {} + failed = {} + timestamps = [] + rate_limited = False + + def batch_callback(rid, resp, exc): + nonlocal rate_limited + if exc is not None: + log_err(f"compute request exception {rid}: {exc}") + if retry_exception(exc): + rate_limited = True + else: + req = requests.pop(rid) + failed[rid] = (req, exc) + else: + # if retry_cb is set, don't move to done until it returns false + if retry_cb is None or not 
retry_cb(resp): + requests.pop(rid) + done[rid] = resp + + def batch_request(reqs): + batch = compute.new_batch_http_request(callback=batch_callback) + for rid, req in reqs: + batch.add(req, request_id=rid) + return batch + + while requests: + if timestamps: + timestamps = [stamp for stamp in timestamps if stamp > time()] + if rate_limited and timestamps: + stamp = next(iter(timestamps)) + sleep(max(stamp - time(), 0)) + rate_limited = False + # up to API_REQ_LIMIT (2000) requests + # in chunks of up to BATCH_LIMIT (1000) + batches = [ + batch_request(chunk) + for chunk in chunked(islice(requests.items(), API_REQ_LIMIT), BATCH_LIMIT) + ] + timestamps.append(time() + 100) + with ThreadPoolExecutor() as exe: + futures = [] + for batch in batches: + future = exe.submit(ensure_execute, batch) + futures.append(future) + for future in futures: + result = future.exception() + if result is not None: + raise result + + return done, failed + + +def wait_request(operation, project=None, compute=None): + """makes the appropriate wait request for a given operation""" + if not compute: + compute = globals()["compute"] + if project is None: + project = lkp.project + if "zone" in operation: + req = compute.zoneOperations().wait( + project=project, + zone=trim_self_link(operation["zone"]), + operation=operation["name"], + ) + elif "region" in operation: + req = compute.regionOperations().wait( + project=project, + region=trim_self_link(operation["region"]), + operation=operation["name"], + ) + else: + req = compute.globalOperations().wait( + project=project, operation=operation["name"] + ) + return req + + +def wait_for_operation(operation, project=None, compute=None): + """wait for given operation""" + if not compute: + compute = globals()["compute"] + if project is None: + project = parse_self_link(operation["selfLink"]).project + wait_req = wait_request(operation, project=project, compute=compute) + + while True: + result = ensure_execute(wait_req) + if result["status"] == "DONE": + log_errors = " with errors" if "error" in result else "" + log.debug( + f"operation complete{log_errors}: type={result['operationType']}, name={result['name']}" + ) + return result + + +def wait_for_operations(operations, project=None, compute=None): + if not compute: + compute = globals()["compute"] + return [ + wait_for_operation(op, project=project, compute=compute) for op in operations + ] + + +def get_filtered_operations( + op_filter, + zone=None, + region=None, + only_global=False, + project=None, + compute=None, +): + """get list of operations associated with group id""" + + if not compute: + compute = globals()["compute"] + if project is None: + project = lkp.project + operations = [] + + def get_aggregated_operations(items): + # items is a dict of location key to value: dict(operations=) or an empty dict + operations.extend( + chain.from_iterable( + ops["operations"] for ops in items.values() if "operations" in ops + ) + ) + + def get_list_operations(items): + operations.extend(items) + + handle_items = get_list_operations + if only_global: + act = compute.globalOperations() + op = act.list(project=project, filter=op_filter) + nxt = act.list_next + elif zone is not None: + act = compute.zoneOperations() + op = act.list(project=project, zone=zone, filter=op_filter) + nxt = act.list_next + elif region is not None: + act = compute.regionOperations() + op = act.list(project=project, region=region, filter=op_filter) + nxt = act.list_next + else: + act = compute.globalOperations() + op = act.aggregatedList( + 
project=project, filter=op_filter, fields="items.*.operations,nextPageToken" + ) + nxt = act.aggregatedList_next + handle_items = get_aggregated_operations + while op is not None: + result = ensure_execute(op) + handle_items(result["items"]) + op = nxt(op, result) + return operations + + +def get_insert_operations(group_ids, flt=None, project=None, compute=None): + """get all insert operations from a list of operationGroupId""" + if not compute: + compute = globals()["compute"] + if project is None: + project = lkp.project + if isinstance(group_ids, str): + group_ids = group_ids.split(",") + filters = [ + "operationType=insert", + flt, + " OR ".join(f"(operationGroupId={id})" for id in group_ids), + ] + return get_filtered_operations(" AND ".join(f"({f})" for f in filters if f)) + + +def machine_type_sockets(template): + pattern = re.compile("^(?P[^-]+)") + m = pattern.match(template.machineType) + if not m: + raise Exception(f"template {template} does not match expected regex") + family = m.group("family") + guestCpus: int = int(template.machine_info.guestCpus) + socket_count = dict.get( + { + "h3": 2, + "c2d": 2 if guestCpus > 56 else 1, + "a3": 2, + }, + family, + 1, # assume 1 socket for all other families + ) + return socket_count + + +def isSmt(template): + machineType: str = template.machineType + guestCpus: int = int(template.machine_info.guestCpus) + + pattern = re.compile("^(?P[^-]+)") + matches = pattern.match(machineType) + machineTypeFamily: str = matches["family"] + + # https://cloud.google.com/compute/docs/cpu-platforms + noSmtFamily = [ + "t2a", + "t2d", + "h3", + ] + if machineTypeFamily in noSmtFamily: + return False + elif guestCpus == 1: + return False + return True + + +def getThreadsPerCore(template): + threadsPerCore: int = template.advancedMachineFeatures.threadsPerCore + + if not isSmt(template): + return 1 + elif threadsPerCore: + return threadsPerCore + else: + return 2 + + +@retry( + max_retries=9, + init_wait_time=1, + warn_msg="Temporary failure in name resolution", + exc_type=socket.gaierror, +) +def host_lookup(host_name: str) -> str: + return socket.gethostbyname(host_name) + + +class Dumper(yaml.SafeDumper): + """Add representers for pathlib.Path and NSDict for yaml serialization""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.add_representer(NSDict, self.represent_nsdict) + self.add_multi_representer(Path, self.represent_path) + + @staticmethod + def represent_nsdict(dumper, data): + return dumper.represent_mapping("tag:yaml.org,2002:map", data.items()) + + @staticmethod + def represent_path(dumper, path): + return dumper.represent_scalar("tag:yaml.org,2002:str", str(path)) + + +class TPU: + """Class for handling the TPU-vm nodes""" + + if can_tpu: + State = tpu.types.cloud_tpu.Node.State + TPUS_PER_VM = 4 + __expected_states = { + "create": State.READY, + "start": State.READY, + "stop": State.STOPPED, + } + + __tpu_version_mapping = { + "V2": tpu.AcceleratorConfig().Type.V2, + "V3": tpu.AcceleratorConfig().Type.V3, + "V4": tpu.AcceleratorConfig().Type.V4, + } + + def __init__(self, nodeset): + if not can_tpu: + raise Exception("TPU pip package not installed") + self._nodeset = nodeset + self._parent = f"projects/{lkp.project}/locations/{nodeset.zone}" + co = create_client_options(ApiEndpoint.TPU) + self._client = tpu.TpuClient(client_options=co) + self.data_disks = [] + for data_disk in nodeset.data_disks: + ad = tpu.AttachedDisk() + ad.source_disk = data_disk + ad.mode = 
tpu.AttachedDisk.DiskMode.DISK_MODE_UNSPECIFIED + self.data_disks.append(ad) + ns_ac = nodeset.accelerator_config + if ns_ac.topology != "" and ns_ac.version != "": + ac = tpu.AcceleratorConfig() + ac.topology = ns_ac.topology + ac.type_ = self.__tpu_version_mapping[ns_ac.version] + self.ac = ac + else: + req = tpu.GetAcceleratorTypeRequest( + name=f"{self._parent}/acceleratorTypes/{nodeset.node_type}" + ) + self.ac = self._client.get_accelerator_type(req).accelerator_configs[0] + self.vmcount = self.__calc_vm_from_topology(self.ac.topology) + + @property + def nodeset(self): + return self._nodeset + + @property + def preserve_tpu(self): + return self._nodeset.preserve_tpu + + @property + def node_type(self): + return self._nodeset.node_type + + @property + def tf_version(self): + return self._nodeset.tf_version + + @property + def enable_public_ip(self): + return self._nodeset.enable_public_ip + + @property + def preemptible(self): + return self._nodeset.preemptible + + @property + def reserved(self): + return self._nodeset.reserved + + @property + def service_account(self): + return self._nodeset.service_account + + @property + def zone(self): + return self._nodeset.zone + + def check_node_type(self): + if self.node_type is None: + return False + try: + request = tpu.GetAcceleratorTypeRequest( + name=f"{self._parent}/acceleratorTypes/{self.node_type}" + ) + return self._client.get_accelerator_type(request=request) is not None + except Exception: + return False + + def check_tf_version(self): + try: + request = tpu.GetRuntimeVersionRequest( + name=f"{self._parent}/runtimeVersions/{self.tf_version}" + ) + return self._client.get_runtime_version(request=request) is not None + except Exception: + return False + + def __calc_vm_from_topology(self, topology): + topo = topology.split("x") + tot = 1 + for num in topo: + tot = tot * int(num) + return tot // self.TPUS_PER_VM + + def __check_resp(self, response, op_name): + des_state = self.__expected_states.get(op_name) + # If the state is not in the table just print the response + if des_state is None: + return False + if response.__class__.__name__ != "Node": # If the response is not a node fail + return False + if response.state == des_state: + return True + return False + + def list_nodes(self): + try: + request = tpu.ListNodesRequest(parent=self._parent) + res = self._client.list_nodes(request=request) + except gExceptions.NotFound: + res = None + return res + + def list_node_names(self): + return [node.name.split("/")[-1] for node in self.list_nodes()] + + def start_node(self, nodename): + request = tpu.StartNodeRequest(name=f"{self._parent}/nodes/{nodename}") + resp = self._client.start_node(request=request).result() + return self.__check_resp(resp, "start") + + def stop_node(self, nodename): + request = tpu.StopNodeRequest(name=f"{self._parent}/nodes/{nodename}") + resp = self._client.stop_node(request=request).result() + return self.__check_resp(resp, "stop") + + def get_node(self, nodename): + try: + request = tpu.GetNodeRequest(name=f"{self._parent}/nodes/{nodename}") + res = self._client.get_node(request=request) + except gExceptions.NotFound: + res = None + return res + + def _register_node(self, nodename, ip_addr): + dns_name = socket.getnameinfo((ip_addr, 0), 0)[0] + run( + f"{lkp.scontrol} update nodename={nodename} nodeaddr={ip_addr} nodehostname={dns_name}" + ) + + def create_node(self, nodename): + if self.vmcount > 1 and not isinstance(nodename, list): + log.error( + f"Tried to create a {self.vmcount} node TPU on nodeset 
{self._nodeset.nodeset_name} but only received one nodename {nodename}" + ) + return False + if self.vmcount > 1 and ( + isinstance(nodename, list) and len(nodename) != self.vmcount + ): + log.error( + f"Expected to receive a list of {self.vmcount} nodenames for TPU node creation in nodeset {self._nodeset.nodeset_name}, but received this list {nodename}" + ) + return False + + node = tpu.Node() + node.accelerator_config = self.ac + node.runtime_version = f"tpu-vm-tf-{self.tf_version}" + startup_script = """ + #!/bin/bash + echo "startup script not found > /var/log/startup_error.log" + """ + with open( + Path(cfg.slurm_scripts_dir or dirs.scripts) / "startup.sh", "r" + ) as script: + startup_script = script.read() + if isinstance(nodename, list): + node_id = nodename[0] + slurm_names = [] + wid = 0 + for node_wid in nodename: + slurm_names.append(f"WORKER_{wid}:{node_wid}") + wid += 1 + else: + node_id = nodename + slurm_names = [f"WORKER_0:{nodename}"] + node.metadata = { + "slurm_docker_image": self.nodeset.docker_image, + "startup-script": startup_script, + "slurm_instance_role": "compute", + "slurm_cluster_name": lkp.cfg.slurm_cluster_name, + "slurm_bucket_path": lkp.cfg.bucket_path, + "slurm_names": ";".join(slurm_names), + } + node.tags = [lkp.cfg.slurm_cluster_name] + if self.nodeset.service_account: + node.service_account.email = self.nodeset.service_account.email + node.service_account.scope = self.nodeset.service_account.scopes + node.scheduling_config.preemptible = self.preemptible + node.scheduling_config.reserved = self.reserved + node.network_config.subnetwork = self.nodeset.subnetwork + node.network_config.enable_external_ips = self.enable_public_ip + if self.data_disks: + node.data_disks = self.data_disks + + request = tpu.CreateNodeRequest(parent=self._parent, node=node, node_id=node_id) + resp = self._client.create_node(request=request).result() + if not self.__check_resp(resp, "create"): + return False + if isinstance(nodename, list): + for node_id, net_endpoint in zip(nodename, resp.network_endpoints): + self._register_node(node_id, net_endpoint.ip_address) + else: + ip_add = resp.network_endpoints[0].ip_address + self._register_node(nodename, ip_add) + return True + + def delete_node(self, nodename): + request = tpu.DeleteNodeRequest(name=f"{self._parent}/nodes/{nodename}") + try: + resp = self._client.delete_node(request=request).result() + if resp: + return self.get_node(nodename=nodename) is None + return False + except gExceptions.NotFound: + # log only error if vmcount is 1 as for other tpu vm count, this could be "phantom" nodes + if self.vmcount == 1: + log.error(f"Tpu single node {nodename} not found") + else: + # for the TPU nodes that consist in more than one vm, only the first node of the TPU a.k.a. the master node will + # exist as real TPU nodes, so the other ones are expected to not be found, check the hostname of the node that has + # not been found, and if it ends in 0, it means that is the master node and it should have been found, and in consequence + # log an error + nodehostname = yaml.safe_load( + run(f"{lkp.scontrol} --yaml show node {nodename}").stdout.rstrip() + )["nodes"][0]["hostname"] + if nodehostname.split("-")[-1] == "0": + log.error(f"TPU master node {nodename} not found") + else: + log.info(f"Deleted TPU 'phantom' node {nodename}") + # If the node is not found it is tecnichally deleted, so return success. 
+ return True + + +class Lookup: + """Wrapper class for cached data access""" + + def __init__(self, cfg=None): + self._cfg = cfg or NSDict() + self.template_cache_path = Path(__file__).parent / "template_info.cache" + + @property + def cfg(self): + return self._cfg + + @property + def project(self): + return self.cfg.project or authentication_project() + + @property + def control_addr(self): + return self.cfg.slurm_control_addr + + @property + def control_host(self): + return self.cfg.slurm_control_host + + @cached_property + def control_host_addr(self): + return host_lookup(self.cfg.slurm_control_host) + + @property + def control_host_port(self): + return self.cfg.slurm_control_host_port + + @property + def endpoint_versions(self): + return self.cfg.endpoint_versions + + @property + def scontrol(self): + return Path(self.cfg.slurm_bin_dir if cfg else "") / "scontrol" + + @cached_property + def instance_role(self): + return instance_metadata("attributes/slurm_instance_role") + + @cached_property + def instance_role_safe(self): + try: + role = self.instance_role + except Exception as e: + log.error(e) + role = None + return role + + @cached_property + def compute(self): + # TODO evaluate when we need to use google_app_cred_path + if self.cfg.google_app_cred_path: + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.cfg.google_app_cred_path + return compute_service() + + @cached_property + def hostname(self): + return socket.gethostname() + + @cached_property + def hostname_fqdn(self): + return socket.getfqdn() + + @cached_property + def zone(self): + return instance_metadata("zone") + + node_desc_regex = re.compile( + r"^(?P(?P[^\s\-]+)-(?P\S+))-(?P(?P\w+)|(?P\[[\d,-]+\]))$" + ) + + @lru_cache(maxsize=None) + def _node_desc(self, node_name): + """Get parts from node name""" + if not node_name: + node_name = self.hostname + # workaround below is for VMs whose hostname is FQDN + node_name_short = node_name.split(".")[0] + m = self.node_desc_regex.match(node_name_short) + if not m: + raise Exception(f"node name {node_name} is not valid") + return m.groupdict() + + def node_prefix(self, node_name=None): + return self._node_desc(node_name)["prefix"] + + def node_nodeset_name(self, node_name=None): + return self._node_desc(node_name)["nodeset"] + + def node_nodeset(self, node_name=None): + nodeset_name = self.node_nodeset_name(node_name) + ns = self.cfg.nodeset.get(nodeset_name) + if ns: + return ns + return self.cfg.nodeset_tpu.get(nodeset_name) + + def node_is_tpu(self, node_name=None): + nodeset_name = self.node_nodeset_name(node_name) + return self.cfg.nodeset_tpu.get(nodeset_name) is not None + + def node_is_dyn(self, node_name=None) -> bool: + nodeset = self.node_nodeset_name(node_name) + return self.cfg.nodeset_dyn.get(nodeset) is not None + + def chunk_tpu_nodes(self, tpu_nodes): + model = tpu_nodes[0] + tpu = TPU(self.node_nodeset(model)) + return chunked(tpu_nodes, n=tpu.vmcount) + + def node_template(self, node_name=None): + return self.node_nodeset(node_name).instance_template + + def node_template_info(self, node_name=None): + return self.template_info(self.node_template(node_name)) + + def node_region(self, node_name=None): + nodeset = self.node_nodeset(node_name) + return parse_self_link(nodeset.subnetwork).region + + def nodeset_prefix(self, nodeset_name): + return f"{self.cfg.slurm_cluster_name}-{nodeset_name}" + + def nodelist_range(self, nodeset_name: str, start: int, count: int) -> str: + assert 0 <= start and 0 < count + pref = self.nodeset_prefix(nodeset_name) + if count 
== 1: + return f"{pref}-{start}" + return f"{pref}-[{start}-{start + count - 1}]" + + def static_dynamic_sizes(self, nodeset: object) -> int: + return (nodeset.node_count_static or 0, nodeset.node_count_dynamic_max or 0) + + def nodelist(self, nodeset) -> str: + cnt = sum(self.static_dynamic_sizes(nodeset)) + if cnt == 0: + return "" + return self.nodelist_range(nodeset.nodeset_name, 0, cnt) + + def nodenames(self, nodeset) -> Tuple[Iterable[str], Iterable[str]]: + pref = self.nodeset_prefix(nodeset.nodeset_name) + s_count, d_count = self.static_dynamic_sizes(nodeset) + return ( + (f"{pref}-{i}" for i in range(s_count)), + (f"{pref}-{i}" for i in range(s_count, s_count + d_count)), + ) + + def power_managed_nodesets(self) -> Iterable[object]: + return chain(self.cfg.nodeset.values(), self.cfg.nodeset_tpu.values()) + + def is_power_managed_node(self, node_name: str) -> bool: + try: + ns = self.node_nodeset(node_name) + if ns is None: + return False + idx = int(self._node_desc(node_name)["suffix"]) + return idx < sum(self.static_dynamic_sizes(ns)) + except Exception: + return False + + def is_static_node(self, node_name: str) -> bool: + if not self.is_power_managed_node(node_name): + return False + idx = int(self._node_desc(node_name)["suffix"]) + return idx < self.node_nodeset(node_name).node_count_static + + @lru_cache(maxsize=None) + def slurm_nodes(self): + StateTuple = namedtuple("StateTuple", "base,flags") + + def make_node_tuple(node_line): + """turn node,state line to (node, StateTuple(state))""" + # state flags include: CLOUD, COMPLETING, DRAIN, FAIL, POWERED_DOWN, + # POWERING_DOWN + node, fullstate = node_line.split(",") + state = fullstate.split("+") + state_tuple = StateTuple(state[0], set(state[1:])) + return (node, state_tuple) + + cmd = ( + f"{self.scontrol} show nodes | " + r"grep -oP '^NodeName=\K(\S+)|\s+State=\K(\S+)' | " + r"paste -sd',\n'" + ) + node_lines = run(cmd, shell=True).stdout.rstrip().splitlines() + nodes = { + node: state + for node, state in map(make_node_tuple, node_lines) + if "CLOUD" in state.flags or "DYNAMIC_NORM" in state.flags + } + return nodes + + def slurm_node(self, nodename): + return self.slurm_nodes().get(nodename) + + @lru_cache(maxsize=1) + def instances(self, project=None, slurm_cluster_name=None): + slurm_cluster_name = slurm_cluster_name or self.cfg.slurm_cluster_name + project = project or self.project + instance_information_fields = [ + "advancedMachineFeatures", + "cpuPlatform", + "creationTimestamp", + "disks", + "disks", + "fingerprint", + "guestAccelerators", + "hostname", + "id", + "kind", + "labelFingerprint", + "labels", + "lastStartTimestamp", + "lastStopTimestamp", + "lastSuspendedTimestamp", + "machineType", + "metadata", + "name", + "networkInterfaces", + "resourceStatus", + "scheduling", + "selfLink", + "serviceAccounts", + "shieldedInstanceConfig", + "shieldedInstanceIntegrityPolicy", + "sourceMachineImage", + "status", + "statusMessage", + "tags", + "zone", + # "deletionProtection", + # "startRestricted", + ] + if lkp.cfg.enable_slurm_gcp_plugins: + slurm_gcp_plugins.register_instance_information_fields( + lkp=lkp, + project=project, + slurm_cluster_name=slurm_cluster_name, + instance_information_fields=instance_information_fields, + ) + instance_information_fields = sorted(set(instance_information_fields)) + instance_fields = ",".join(instance_information_fields) + fields = f"items.zones.instances({instance_fields}),nextPageToken" + flt = f"labels.slurm_cluster_name={slurm_cluster_name} AND name:{slurm_cluster_name}-*" + 
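+ # Example (hypothetical cluster name, for illustration): with slurm_cluster_name="mycluster"
+ # the filter built above expands to
+ #   labels.slurm_cluster_name=mycluster AND name:mycluster-*
+ # so the aggregated listing below only returns instances that belong to this cluster.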
act = self.compute.instances() + op = act.aggregatedList(project=project, fields=fields, filter=flt) + + def properties(inst): + """change instance properties to a preferred format""" + inst["zone"] = trim_self_link(inst["zone"]) + inst["machineType"] = trim_self_link(inst["machineType"]) + # metadata is fetched as a dict of dicts like: + # {'key': key, 'value': value}, kinda silly + metadata = {i["key"]: i["value"] for i in inst["metadata"].get("items", [])} + if "slurm_instance_role" not in metadata: + return None + inst["role"] = metadata["slurm_instance_role"] + inst["metadata"] = metadata + # del inst["metadata"] # no need to store all the metadata + return NSDict(inst) + + instances = {} + while op is not None: + result = ensure_execute(op) + instance_iter = ( + (inst["name"], properties(inst)) + for inst in chain.from_iterable( + m["instances"] for m in result.get("items", {}).values() + ) + ) + instances.update( + {name: props for name, props in instance_iter if props is not None} + ) + op = act.aggregatedList_next(op, result) + return instances + + def instance(self, instance_name, project=None, slurm_cluster_name=None): + instances = self.instances( + project=project, slurm_cluster_name=slurm_cluster_name + ) + return instances.get(instance_name) + + @lru_cache() + def reservation(self, name: str, zone: str) -> object: + """See https://cloud.google.com/compute/docs/reference/rest/v1/reservations""" + try: + _, project, _, short_name = name.split("/") + except ValueError: + raise ValueError( + f"Invalid reservation name: '{name}', expected format is 'projects/PROJECT/reservations/NAME'" + ) + + return ( + self.compute.reservations() + .get(project=project, zone=zone, reservation=short_name) + .execute() + ) + + @lru_cache(maxsize=1) + def machine_types(self, project=None): + project = project or self.project + field_names = "name,zone,guestCpus,memoryMb,accelerators" + fields = f"items.zones.machineTypes({field_names}),nextPageToken" + + machines = defaultdict(dict) + act = self.compute.machineTypes() + op = act.aggregatedList(project=project, fields=fields) + while op is not None: + result = ensure_execute(op) + machine_iter = chain.from_iterable( + m["machineTypes"] + for m in result["items"].values() + if "machineTypes" in m + ) + for machine in machine_iter: + name = machine["name"] + zone = machine["zone"] + machines[name][zone] = machine + + op = act.aggregatedList_next(op, result) + return machines + + def machine_type(self, machine_type, project=None, zone=None): + """ """ + custom_patt = re.compile( + r"((?P\w+)-)?custom-(?P\d+)-(?P\d+)" + ) + custom_match = custom_patt.match(machine_type) + if zone: + project = project or self.project + machine_info = ensure_execute( + self.compute.machineTypes().get( + project=project, zone=zone, machineType=machine_type + ) + ) + elif custom_match is not None: + groups = custom_match.groupdict() + cpus, mem = (groups[k] for k in ["cpus", "mem"]) + machine_info = { + "guestCpus": int(cpus), + "memoryMb": int(mem), + } + else: + machines = self.machine_types(project=project) + machine_info = next(iter(machines[machine_type].values()), None) + if machine_info is None: + raise Exception(f"machine type {machine_type} not found") + return NSDict(machine_info) + + def template_machine_conf(self, template_link, project=None, zone=None): + template = self.template_info(template_link) + if not template.machineType: + temp_name = trim_self_link(template_link) + raise Exception(f"instance template {temp_name} has no machine type") + 
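+ # Worked example (assumed machine shape, for illustration): a node with 60 vCPUs and
+ # 240 GB (245760 MB) of memory gets 400 + 30 * 240 = 7600 MB reserved by the heuristic
+ # below, so machine_conf.memory is reported to Slurm as 245760 - 7600 = 238160 MB.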
template.machine_info = self.machine_type(template.machineType, zone=zone) + machine = template.machine_info + + machine_conf = NSDict() + machine_conf.boards = 1 # No information, assume 1 + machine_conf.sockets = machine_type_sockets(template) + # the value below for SocketsPerBoard must be type int + machine_conf.sockets_per_board = machine_conf.sockets // machine_conf.boards + machine_conf.threads_per_core = 1 + _div = 2 if getThreadsPerCore(template) == 1 else 1 + machine_conf.cpus = ( + int(machine.guestCpus / _div) if isSmt(template) else machine.guestCpus + ) + machine_conf.cores_per_socket = int(machine_conf.cpus / machine_conf.sockets) + # Because the actual memory on the host will be different than + # what is configured (e.g. kernel will take it). From + # experiments, about 16 MB per GB are used (plus about 400 MB + # buffer for the first couple of GB's. Using 30 MB to be safe. + gb = machine.memoryMb // 1024 + machine_conf.memory = machine.memoryMb - (400 + (30 * gb)) + return machine_conf + + @contextmanager + def template_cache(self, writeback=False): + flag = "c" if writeback else "r" + err = None + for wait in backoff_delay(0.125, timeout=60, count=20): + try: + cache = shelve.open( + str(self.template_cache_path), flag=flag, writeback=writeback + ) + break + except OSError as e: + err = e + log.debug(f"Failed to access template info cache: {e}") + sleep(wait) + continue + else: + # reached max_count of waits + raise Exception(f"Failed to access cache file. latest error: {err}") + try: + yield cache + finally: + cache.close() + + @lru_cache(maxsize=None) + def template_info(self, template_link, project=None): + project = project or self.project + template_name = trim_self_link(template_link) + # split read and write access to minimize write-lock. This might be a + # bit slower? TODO measure + if self.template_cache_path.exists(): + with self.template_cache() as cache: + if template_name in cache: + return NSDict(cache[template_name]) + + template = ensure_execute( + self.compute.instanceTemplates().get( + project=project, instanceTemplate=template_name + ) + ).get("properties") + template = NSDict(template) + # name and link are not in properties, so stick them in + template.name = template_name + template.link = template_link + # TODO delete metadata to reduce memory footprint? 
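+ # Note: the block below normalizes GPU information so callers can always read
+ # template.gpu_type / template.gpu_count, whether the GPUs come from the machine type
+ # itself (machine_info.accelerators) or from the template's guestAccelerators.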
+ # del template.metadata + + # translate gpus into an easier-to-read format + machine_info = self.machine_type(template.machineType, project=project) + if machine_info.accelerators: + template.gpu_type = machine_info.accelerators[0].guestAcceleratorType + template.gpu_count = machine_info.accelerators[0].guestAcceleratorCount + elif template.guestAccelerators: + template.gpu_type = template.guestAccelerators[0].acceleratorType + template.gpu_count = template.guestAccelerators[0].acceleratorCount + else: + template.gpu_type = None + template.gpu_count = 0 + + # keep write access open for minimum time + with self.template_cache(writeback=True) as cache: + cache[template_name] = template.to_dict() + # cache should be owned by slurm + chown_slurm(self.template_cache_path) + + return template + + def nodeset_map(self, hostnames: list): + """Convert a list of nodes into a map of nodeset_name to hostnames""" + nodeset_map = collections.defaultdict(list) + for node in hostnames: + nodeset_map[self.node_nodeset_name(node)].append(node) + return nodeset_map + + +# Define late globals +lkp = Lookup() +cfg = load_config_file(CONFIG_FILE) +if not cfg: + try: + cfg = fetch_config_yaml() + except Exception as e: + log.warning(f"config not found in bucket: {e}") + if cfg: + save_config(cfg, CONFIG_FILE) + +lkp = Lookup(cfg) + +# Needs to be run after the lookup is complete to get endpoint versions +compute = compute_service() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "--partitions", + "-p", + help="The partition(s) to retrieve the TPU vmcount value for.", + ) + args = parser.parse_args() + if args.partitions: + # useful exit code + # partition does not exists in config.yaml, thus do not exist in slurm + PART_INVALID = -1 + # in the same partition there are nodesets with different vmcounts + DIFF_VMCOUNTS_SAME_PART = -2 + # partition is a list of partitions in which at least two of them have different vmcount + DIFF_PART_DIFFERENT_VMCOUNTS = -3 + vmcounts = [] + # valid equals to 0 means that we are ok, otherwise it will be set to one of the previously defined exit codes + valid = 0 + for part in args.partitions.split(","): + if part not in lkp.cfg.partitions: + valid = PART_INVALID + break + else: + if part_is_tpu(part): + vmcount = get_vmcount_of_tpu_part(part) + if vmcount == -1: + valid = DIFF_VMCOUNTS_SAME_PART + break + vmcounts.append(vmcount) + else: + vmcounts.append(0) + # this means that there are different vmcounts for these partitions + if valid == 0 and len(set(vmcounts)) != 1: + valid = DIFF_PART_DIFFERENT_VMCOUNTS + if valid != 0: + print(f"VMCOUNT:{valid}") + else: + print(f"VMCOUNT:{vmcounts[0]}") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf new file mode 100644 index 0000000000..1cffbb307f --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -0,0 +1,467 @@ +/** + * Copyright (C) SchedMD LLC. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "bucket_name" { + description = <<-EOD + Name of GCS bucket to use. + EOD + type = string +} + +variable "bucket_dir" { + description = "Bucket directory for cluster files to be put into." + type = string + default = null +} + +variable "enable_devel" { + type = bool + description = "Enables development mode. Not for production use." + default = false +} + +variable "enable_debug_logging" { + type = bool + description = "Enables debug logging mode. Not for production use." + default = false +} + +variable "extra_logging_flags" { + type = map(bool) + description = "The list of extra flags for the logging system to use. See the logging_flags variable in scripts/util.py to get the list of supported log flags." + default = {} +} + +variable "project_id" { + description = "The GCP project ID." + type = string +} + +######### +# SLURM # +######### + +variable "slurm_cluster_name" { + type = string + description = "The cluster name, used for resource naming and slurm accounting." + + validation { + condition = can(regex("^[a-z](?:[a-z0-9]{0,9})$", var.slurm_cluster_name)) + error_message = "Variable 'slurm_cluster_name' must be a match of regex '^[a-z](?:[a-z0-9]{0,9})$'." + } +} + +variable "enable_slurm_gcp_plugins" { + description = < Date: Tue, 2 Jul 2024 23:12:46 +0000 Subject: [PATCH 017/118] Fix pre-commit, add `/etc` --- .../schedmd-slurm-gcp-v6-controller/README.md | 16 +- .../modules/slurm_files/README.md | 317 +++++++----------- .../modules/slurm_files/README_TF.md | 119 ------- .../modules/slurm_files/etc/cgroup.conf.tpl | 7 + .../slurm_files/etc/job_submit.lua.tpl | 102 ++++++ .../modules/slurm_files/etc/slurm.conf.tpl | 67 ++++ .../modules/slurm_files/etc/slurmdbd.conf.tpl | 31 ++ .../modules/slurm_files/main.tf | 6 +- .../modules/slurm_files/scripts/load_bq.py | 14 + .../scripts/slurm_gcp_plugins/README.md | 2 +- .../scripts/slurm_gcp_plugins/__init__.py | 14 + .../slurm_gcp_plugins/max_hops/__init__.py | 14 + .../slurm_gcp_plugins/test_plugin/__init__.py | 14 + .../slurm_gcp_plugins/utils/__init__.py | 14 + .../modules/slurm_files/scripts/startup.sh | 197 ----------- .../scripts/tests/test_topology.py | 14 + .../slurm_files/scripts/tests/test_util.py | 14 + .../modules/slurm_files/variables.tf | 18 +- .../slurm_files.tf | 2 +- 19 files changed, 444 insertions(+), 538 deletions(-) delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README_TF.md create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/cgroup.conf.tpl create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/slurm.conf.tpl create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/slurmdbd.conf.tpl delete mode 100755 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/startup.sh diff --git 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index de0621b9b1..7e9a5ee96e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -196,14 +196,14 @@ limitations under the License. |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.13 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.13 | -| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.5.13 | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.13 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.13 | -| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.5.13 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.13 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.5.13 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.9 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.9 | +| [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.9 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.9 | +| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.5.9 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.9 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.5.9 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index 12a8861937..7599f28af9 100644 --- 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -1,200 +1,117 @@ -# Module: Slurm Cluster - -[FAQ](../../docs/faq.md) | [Troubleshooting](../../docs/troubleshooting.md) | -[Glossary](../../docs/glossary.md) - - - -- [Module: Slurm Cluster](#module-slurm-cluster) - - [Overview](#overview) - - [Usage](#usage) - - [Dependencies](#dependencies) - - [Software](#software) - - [Required](#required) - - [Optional](#optional) - - [TerraformUser](#terraformuser) - - [Required](#required-1) - - [Optional](#optional-1) - - [Controller SA](#controller-sa) - - [Required](#required-2) - - [Optional](#optional-2) - - [Compute SA](#compute-sa) - - [Optional](#optional-3) - - [Login SA](#login-sa) - - [Optional](#optional-4) - - [Module API](#module-api) - - - -## Overview - -This module creates a [Slurm](../../docs/glossary.md#slurm) cluster on -[GCP](../../docs/glossary.md#gcp). There are two modes of operation: cloud; and -hybrid. Cloud mode will create a VM controller. Hybrid mode will generate -`cloud.conf` and `cloud_gres.conf` files to be included in the on-prem -configuration files, while managing a `config.yaml` file for internal module -use. - -Partitions define what compute resources are available to the controller so it -may allocate jobs. Slurm will resume/create compute instances as needed to run -allocated jobs and will suspend/terminate the instances after they are no longer -needed (e.g. IDLE for SuspendTimeout duration). Static nodes are persistent; -they are exempt from being suspended/terminated under normal conditions. Dynamic -nodes are burstable; they will scale up and down with workload. - -> **WARNING:** Destroying the controller before it has suspended/terminated all -> static and dynamic node instances and supporting resources (e.g. placement -> groups, subscription) will leave those resources orphaned unless cleanup -> options are enabled (.e.g `enable_cleanup_compute`, -> `enable_cleanup_subscriptions`). - -## Usage - -See [examples](./examples/slurm_cluster/) directory for sample usages. - -See below for a simple inclusion within your own terraform project. - -```hcl -module "slurm_cluster" { - source = "git@github.com:SchedMD/slurm-gcp.git//terraform/slurm_cluster?ref=v5.0.0" - - project_id = "" - - slurm_cluster_name = "" - - # ... omitted ... -} -``` - -> **NOTE:** Because this module is not hosted on -> [Terraform Registry](../../docs/glossary.md#terraform-registry), the version -> must be strictly controlled via -> [revision](https://www.terraform.io/language/modules/sources#selecting-a-revision) -> syntax on the source line. - -## Dependencies - -### Software - -Certain software must be installed on the local machine or APIs enabled in -[GCP](../../docs/glossary.md#gcp) for -[TerraformUser](../../docs/glossary.md#terraformuser) to be able to use this -module. - -#### Required - -- [Terraform](https://www.terraform.io/downloads.html) is installed. -- [GCP Cloud SDK](https://cloud.google.com/sdk/downloads) is installed. -- [Compute Engine API](../../docs/glossary.md#compute-engine) is enabled. - -#### Optional - -- [Python](../../docs/glossary.md#python) is installed. - - Required Version: `>= 3.6.0, < 4.0.0` - - Required when any of: - - `enable_hybrid=true` - - `enable_cleanup_compute=true` - - `enable_cleanup_subscriptions=true` - - `enable_reconfigure=true` -- [Pip](../../../docs/glossary.md#pip) packages are installed. 
- - Required when any of: - - `enable_hybrid=true` - - `enable_cleanup_compute=true` - - `enable_cleanup_subscriptions=true` - - `enable_reconfigure=true` - - `pip3 install -r ../../scripts/requirements.txt --user` -- [Private Google Access](../../docs/glossary.md#private-google-access) is - enabled. - - Required when any instances only have internal IPs. -- [Secret Manager API](../../docs/glossary.md#secret-manager) is enabled. - - Required when `cloudsql != null`. -- [Pub/Sub API](../../docs/glossary.md#pubsub) is enabled. - - Required when any of: - - `enable_cleanup_subscriptions=true` - - `enable_reconfigure=true` -- [Bigquery API](../../docs/glossary.md#bigquery) is enabled. - - Required when `enable_bigquery_load=true`. - -### TerraformUser - -[TerraformUser](../../docs/glossary.md#terraformuser) authenticates with -credentials to [Google Cloud](../../docs/glossary.md#gcp). It is recommended to -create a principal [IAM](../../docs/glossary.md#iam) for this user and associate -[roles](../../docs/glossary.md#iam-roles) to them. Optionally, the TerraformUser -can operate through a [service account](../../docs/glossary.md#service-account). - -#### Required - -- Compute Instance Admin (v1) (`roles/compute.instanceAdmin.v1`) - -#### Optional - -- Pub/Sub Admin (`roles/pubsub.admin`) - - Required when `enable_reconfigure=true`. -- Secret Manager Admin (`roles/secretmanager.admin`) - - Required when `cloudsql != null`. -- Service Account User (`roles/iam.serviceAccountUser`) - - Required when [TerraformUser](../../docs/glossary.md#terraformuser) is using - an [service account](../../docs/glossary.md#service-account) to - authenticate. - -### Controller SA - -[Service account](../../docs/glossary.md#service-account) intended to be -associated with the controller -[instance template](../../docs/glossary.md#instance-template) for -[slurm_controller_instance](../slurm_controller_instance/). - -#### Required - -- Compute Instance Admin (v1) (`roles/compute.instanceAdmin.v1`) -- Compute Instance Admin (beta) (`roles/compute.instanceAdmin`) -- Service Account User (`roles/iam.serviceAccountUser`) - -#### Optional - -- BigQuery Data Editor (`roles/bigquery.dataEditor`) - - Required when `enable_bigquery_load=true`. -- Cloud SQL Editor (`roles/cloudsql.editor`) - - Required when all of: - - `cloudsql != null` - - Communicating to CloudSQL instance -- Logs Writer (`roles/logging.logWriter`) - - Recommended. -- Monitoring Metric Writer (`roles/monitoring.metricWriter`) - - Recommended. -- Pub/Sub Admin (`roles/pubsub.admin`) - - Required when `enable_reconfigure=true`. - -### Compute SA - -[Service account](../../docs/glossary.md#service-account) intended to be -associated with the compute -[instance templates](../../docs/glossary.md#instance-template) created by -[slurm_partition](../slurm_partition/). - -#### Optional - -- Logs Writer (`roles/logging.logWriter`) - - Recommended. -- Monitoring Metric Writer (`roles/monitoring.metricWriter`) - - Recommended. - -### Login SA - -[Service account](../../docs/glossary.md#service-account) intended to be -associated with the login -[instance templates](../../docs/glossary.md#instance-template) created by -[slurm_partition](../slurm_partition/). - -#### Optional - -- Logs Writer (`roles/logging.logWriter`) - - Recommended. -- Monitoring Metric Writer (`roles/monitoring.metricWriter`) - - Recommended. - -## Module API - -For the terraform module API reference, please see -[README_TF.md](./README_TF.md). + +Copyright (C) SchedMD LLC. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | ~> 1.3 | +| [archive](#requirement\_archive) | ~> 2.0 | +| [google](#requirement\_google) | >= 3.53 | +| [local](#requirement\_local) | ~> 2.0 | +| [random](#requirement\_random) | ~> 3.0 | + +## Providers + +| Name | Version | +|------|---------| +| [archive](#provider\_archive) | ~> 2.0 | +| [google](#provider\_google) | >= 3.53 | +| [local](#provider\_local) | ~> 2.0 | +| [random](#provider\_random) | ~> 3.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [google_storage_bucket_object.compute_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.controller_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.devel](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.epilog_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.login_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.nodeset_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.prolog_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [random_uuid.cluster_id](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/uuid) | resource | +| [archive_file.slurm_gcp_devel_zip](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | +| [google_storage_bucket.this](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/storage_bucket) | data source | +| [local_file.external_epilog](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | +| [local_file.external_prolog](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | +| [local_file.setup_external](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. 
| `string` | `null` | no | +| [bucket\_name](#input\_bucket\_name) | Name of GCS bucket to use. | `string` | n/a | yes | +| [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
tree_width = optional(number)
})
| `{}` | no | +| [cloudsql\_secret](#input\_cloudsql\_secret) | Secret URI to cloudsql secret. | `string` | `null` | no | +| [compute\_startup\_scripts](#input\_compute\_startup\_scripts) | List of scripts to be ran on compute VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | +| [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | +| [controller\_startup\_scripts](#input\_controller\_startup\_scripts) | List of scripts to be ran on controller VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | +| [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | +| [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
- /usr/local/etc/slurm
- /etc/munge
- /home
- /apps
If these are disabled, the slurm etc and munge dirs must be added manually,
or some other mechanism must be used to synchronize the slurm conf files
and the munge key across the cluster. | `bool` | `false` | no | +| [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into big query.

NOTE: Requires Google Bigquery API. | `bool` | `false` | no | +| [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. Not for production use. | `bool` | `false` | no | +| [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no | +| [enable\_external\_prolog\_epilog](#input\_enable\_external\_prolog\_epilog) | Automatically enable a script that will execute prolog and epilog scripts
shared by NFS from the controller to compute nodes. Find more details at:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/v5/tools/prologs-epilogs/README.md | `bool` | `false` | no | +| [enable\_hybrid](#input\_enable\_hybrid) | Enables use of hybrid controller mode. When true, controller\_hybrid\_config will
be used instead of controller\_instance\_config and will disable login instances. | `bool` | `false` | no | +| [enable\_slurm\_gcp\_plugins](#input\_enable\_slurm\_gcp\_plugins) | Enables calling hooks in scripts/slurm\_gcp\_plugins during cluster resume and suspend. | `any` | `false` | no | +| [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
|
{
"compute": null
}
| no | +| [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | +| [extra\_logging\_flags](#input\_extra\_logging\_flags) | The list of extra flags for the logging system to use. See the logging\_flags variable in scripts/util.py to get the list of supported log flags. | `map(bool)` | `{}` | no | +| [google\_app\_cred\_path](#input\_google\_app\_cred\_path) | Path to Google Application Credentials. | `string` | `null` | no | +| [install\_dir](#input\_install\_dir) | Directory where the hybrid configuration directory will be installed on the
on-premise controller (e.g. /etc/slurm/hybrid). This updates the prefix path
for the resume and suspend scripts in the generated `cloud.conf` file.

This variable should be used when the TerraformHost and the SlurmctldHost
are different.

This will default to var.output\_dir if null. | `string` | `null` | no | +| [job\_submit\_lua\_tpl](#input\_job\_submit\_lua\_tpl) | Slurm job\_submit.lua template file path. | `string` | `null` | no | +| [login\_network\_storage](#input\_login\_network\_storage) | Storage to be mounted on login and controller instances
* server\_ip : Address of the storage server.
* remote\_mount : The location in the remote instance filesystem to mount from.
* local\_mount : The location on the instance filesystem to mount to.
* fs\_type : Filesystem type (e.g. "nfs").
* mount\_options : Options to mount with. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | +| [login\_startup\_scripts](#input\_login\_startup\_scripts) | List of scripts to be run on login VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | +| [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | +| [munge\_mount](#input\_munge\_mount) | Remote munge mount for compute and login nodes to acquire the munge.key.

By default, the munge mount server will be assumed to be the
`var.slurm_control_host` (or `var.slurm_control_addr` if non-null) when
`server_ip=null`. |
object({
server_ip = string
remote_mount = string
fs_type = string
mount_options = string
})
|
{
"fs_type": "nfs",
"mount_options": "",
"remote_mount": "/etc/munge/",
"server_ip": null
}
| no | +| [network\_storage](#input\_network\_storage) | Storage to be mounted on all instances.
- server\_ip : Address of the storage server.
- remote\_mount : The location in the remote instance filesystem to mount from.
- local\_mount : The location on the instance filesystem to mount to.
- fs\_type : Filesystem type (e.g. "nfs").
- mount\_options : Options to mount with. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Cluster nodesets, as a list. | `list(any)` | `[]` | no | +| [nodeset\_dyn](#input\_nodeset\_dyn) | Cluster nodesets (dynamic), as a list. | `list(any)` | `[]` | no | +| [nodeset\_startup\_scripts](#input\_nodeset\_startup\_scripts) | List of scripts to be run on compute VM startup in the specific nodeset. |
map(list(object({
filename = string
content = string
})))
| `{}` | no | +| [nodeset\_tpu](#input\_nodeset\_tpu) | Cluster nodesets (TPU), as a list. | `list(any)` | `[]` | no | +| [output\_dir](#input\_output\_dir) | Directory where this module will write its files to. These files include:
cloud.conf; cloud\_gres.conf; config.yaml; resume.py; suspend.py; and util.py. | `string` | `null` | no | +| [partitions](#input\_partitions) | Cluster partitions as a list. | `list(any)` | `[]` | no | +| [project\_id](#input\_project\_id) | The GCP project ID. | `string` | n/a | yes | +| [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | +| [slurm\_bin\_dir](#input\_slurm\_bin\_dir) | Path to directory of Slurm binary commands (e.g. scontrol, sinfo). If 'null',
then it will be assumed that binaries are in $PATH. | `string` | `null` | no | +| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | The cluster name, used for resource naming and slurm accounting. | `string` | n/a | yes | +| [slurm\_conf\_tpl](#input\_slurm\_conf\_tpl) | Slurm slurm.conf template file path. | `string` | `null` | no | +| [slurm\_control\_addr](#input\_slurm\_control\_addr) | The IP address or a name by which the address can be identified.

This value is passed to slurm.conf such that:
SlurmctldHost={var.slurm\_control\_host}\({var.slurm\_control\_addr}\)

See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldHost | `string` | `null` | no | +| [slurm\_control\_host](#input\_slurm\_control\_host) | The short, or long, hostname of the machine where Slurm control daemon is
executed (i.e. the name returned by the command "hostname -s").

This value is passed to slurm.conf such that:
SlurmctldHost={var.slurm\_control\_host}\({var.slurm\_control\_addr}\)

See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldHost | `string` | `null` | no | +| [slurm\_control\_host\_port](#input\_slurm\_control\_host\_port) | The port number that the Slurm controller, slurmctld, listens to for work.

See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldPort | `string` | `"6818"` | no | +| [slurm\_log\_dir](#input\_slurm\_log\_dir) | Directory where Slurm logs to. | `string` | `"/var/log/slurm"` | no | +| [slurmdbd\_conf\_tpl](#input\_slurmdbd\_conf\_tpl) | Slurm slurmdbd.conf template file path. | `string` | `null` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [checksum](#output\_checksum) | Checksum of all files written to the bucket. | +| [config](#output\_config) | Cluster configuration. | +| [nodeset](#output\_nodeset) | Cluster nodesets. | +| [nodeset\_dyn](#output\_nodeset\_dyn) | Cluster nodesets (dynamic). | +| [nodeset\_tpu](#output\_nodeset\_tpu) | Cluster nodesets (TPU). | +| [partitions](#output\_partitions) | Cluster partitions. | +| [slurm\_bucket\_path](#output\_slurm\_bucket\_path) | GCS Bucket URI of Slurm cluster file storage. | + diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README_TF.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README_TF.md deleted file mode 100644 index f30666aec0..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README_TF.md +++ /dev/null @@ -1,119 +0,0 @@ -# bucket_files - - -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | ~> 1.3 | -| [archive](#requirement\_archive) | ~> 2.0 | -| [google](#requirement\_google) | >= 3.53 | -| [local](#requirement\_local) | ~> 2.0 | -| [random](#requirement\_random) | ~> 3.0 | - -## Providers - -| Name | Version | -|------|---------| -| [archive](#provider\_archive) | ~> 2.0 | -| [google](#provider\_google) | >= 3.53 | -| [local](#provider\_local) | ~> 2.0 | -| [random](#provider\_random) | ~> 3.0 | - -## Modules - -No modules. 
- -## Resources - -| Name | Type | -|------|------| -| [google_storage_bucket_object.compute_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.controller_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.devel](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.epilog_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.login_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.nodeset_startup_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.prolog_scripts](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [random_uuid.cluster_id](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/uuid) | resource | -| [archive_file.slurm_gcp_devel_zip](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | -| [google_storage_bucket.this](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/storage_bucket) | data source | -| [local_file.external_epilog](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | -| [local_file.external_prolog](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | -| [local_file.setup_external](https://registry.terraform.io/providers/hashicorp/local/latest/docs/data-sources/file) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. | `string` | `null` | no | -| [bucket\_name](#input\_bucket\_name) | Name of GCS bucket to use. | `string` | n/a | yes | -| [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
tree_width = optional(number)
})
| `{}` | no | -| [cloudsql\_secret](#input\_cloudsql\_secret) | Secret URI to cloudsql secret. | `string` | `null` | no | -| [compute\_startup\_scripts](#input\_compute\_startup\_scripts) | List of scripts to be ran on compute VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | -| [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | -| [controller\_startup\_scripts](#input\_controller\_startup\_scripts) | List of scripts to be ran on controller VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | -| [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | -| [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
* /usr/local/etc/slurm
* /etc/munge
* /home
* /apps
If these are disabled, the slurm etc and munge dirs must be added manually,
or some other mechanism must be used to synchronize the slurm conf files
and the munge key across the cluster. | `bool` | `false` | no | -| [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into big query.

NOTE: Requires Google Bigquery API. | `bool` | `false` | no | -| [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. Not for production use. | `bool` | `false` | no | -| [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no | -| [enable\_external\_prolog\_epilog](#input\_enable\_external\_prolog\_epilog) | Automatically enable a script that will execute prolog and epilog scripts
shared by NFS from the controller to compute nodes. Find more details at:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/v5/tools/prologs-epilogs/README.md | `bool` | `false` | no | -| [enable\_hybrid](#input\_enable\_hybrid) | Enables use of hybrid controller mode. When true, controller\_hybrid\_config will
be used instead of controller\_instance\_config and will disable login instances. | `bool` | `false` | no | -| [enable\_slurm\_gcp\_plugins](#input\_enable\_slurm\_gcp\_plugins) | Enables calling hooks in scripts/slurm\_gcp\_plugins during cluster resume and suspend. | `any` | `false` | no | -| [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
|
{
"compute": null
}
| no | -| [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | -| [extra\_logging\_flags](#input\_extra\_logging\_flags) | The list of extra flags for the logging system to use. See the logging\_flags variable in scripts/util.py to get the list of supported log flags. | `map(bool)` | `{}` | no | -| [google\_app\_cred\_path](#input\_google\_app\_cred\_path) | Path to Google Application Credentials. | `string` | `null` | no | -| [install\_dir](#input\_install\_dir) | Directory where the hybrid configuration directory will be installed on the
on-premise controller (e.g. /etc/slurm/hybrid). This updates the prefix path
for the resume and suspend scripts in the generated `cloud.conf` file.

This variable should be used when the TerraformHost and the SlurmctldHost
are different.

This will default to var.output\_dir if null. | `string` | `null` | no | -| [job\_submit\_lua\_tpl](#input\_job\_submit\_lua\_tpl) | Slurm job\_submit.lua template file path. | `string` | `null` | no | -| [login\_network\_storage](#input\_login\_network\_storage) | Storage to mounted on login and controller instances
* server\_ip : Address of the storage server.
* remote\_mount : The location in the remote instance filesystem to mount from.
* local\_mount : The location on the instance filesystem to mount to.
* fs\_type : Filesystem type (e.g. "nfs").
* mount\_options : Options to mount with. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | -| [login\_startup\_scripts](#input\_login\_startup\_scripts) | List of scripts to be ran on login VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | -| [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | -| [munge\_mount](#input\_munge\_mount) | Remote munge mount for compute and login nodes to acquire the munge.key.

By default, the munge mount server will be assumed to be the
`var.slurm_control_host` (or `var.slurm_control_addr` if non-null) when
`server_ip=null`. |
object({
server_ip = string
remote_mount = string
fs_type = string
mount_options = string
})
|
{
"fs_type": "nfs",
"mount_options": "",
"remote_mount": "/etc/munge/",
"server_ip": null
}
| no | -| [network\_storage](#input\_network\_storage) | Storage to mounted on all instances.
* server\_ip : Address of the storage server.
* remote\_mount : The location in the remote instance filesystem to mount from.
* local\_mount : The location on the instance filesystem to mount to.
* fs\_type : Filesystem type (e.g. "nfs").
* mount\_options : Options to mount with. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Cluster nodenets, as a list. | `list(any)` | `[]` | no | -| [nodeset\_dyn](#input\_nodeset\_dyn) | Cluster nodenets (dynamic), as a list. | `list(any)` | `[]` | no | -| [nodeset\_startup\_scripts](#input\_nodeset\_startup\_scripts) | List of scripts to be ran on compute VM startup in the specific nodeset. |
map(list(object({
filename = string
content = string
})))
| `{}` | no | -| [nodeset\_tpu](#input\_nodeset\_tpu) | Cluster nodenets (TPU), as a list. | `list(any)` | `[]` | no | -| [output\_dir](#input\_output\_dir) | Directory where this module will write its files to. These files include:
cloud.conf; cloud\_gres.conf; config.yaml; resume.py; suspend.py; and util.py. | `string` | `null` | no | -| [partitions](#input\_partitions) | Cluster partitions as a list. | `list(any)` | `[]` | no | -| [project\_id](#input\_project\_id) | The GCP project ID. | `string` | n/a | yes | -| [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | -| [slurm\_bin\_dir](#input\_slurm\_bin\_dir) | Path to directory of Slurm binary commands (e.g. scontrol, sinfo). If 'null',
then it will be assumed that binaries are in $PATH. | `string` | `null` | no | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | The cluster name, used for resource naming and slurm accounting. | `string` | n/a | yes | -| [slurm\_conf\_tpl](#input\_slurm\_conf\_tpl) | Slurm slurm.conf template file path. | `string` | `null` | no | -| [slurm\_control\_addr](#input\_slurm\_control\_addr) | The IP address or a name by which the address can be identified.

This value is passed to slurm.conf such that:
SlurmctldHost={var.slurm\_control\_host}\({var.slurm\_control\_addr}\)

See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldHost | `string` | `null` | no | -| [slurm\_control\_host](#input\_slurm\_control\_host) | The short, or long, hostname of the machine where Slurm control daemon is
executed (i.e. the name returned by the command "hostname -s").

This value is passed to slurm.conf such that:
SlurmctldHost={var.slurm\_control\_host}\({var.slurm\_control\_addr}\)

See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldHost | `string` | `null` | no | -| [slurm\_control\_host\_port](#input\_slurm\_control\_host\_port) | The port number that the Slurm controller, slurmctld, listens to for work.

See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldPort | `string` | `"6818"` | no | -| [slurm\_log\_dir](#input\_slurm\_log\_dir) | Directory where Slurm logs to. | `string` | `"/var/log/slurm"` | no | -| [slurmdbd\_conf\_tpl](#input\_slurmdbd\_conf\_tpl) | Slurm slurmdbd.conf template file path. | `string` | `null` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [checksum](#output\_checksum) | Checksum of all files written to the bucket. | -| [config](#output\_config) | Cluster configuration. | -| [nodeset](#output\_nodeset) | Cluster nodesets. | -| [nodeset\_dyn](#output\_nodeset\_dyn) | Cluster nodesets (dynamic). | -| [nodeset\_tpu](#output\_nodeset\_tpu) | Cluster nodesets (TPU). | -| [partitions](#output\_partitions) | Cluster partitions. | -| [slurm\_bucket\_path](#output\_slurm\_bucket\_path) | GCS Bucket URI of Slurm cluster file storage. | - diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/cgroup.conf.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/cgroup.conf.tpl new file mode 100644 index 0000000000..ffeb167cfc --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/cgroup.conf.tpl @@ -0,0 +1,7 @@ +# cgroup.conf +# https://slurm.schedmd.com/cgroup.conf.html + +ConstrainCores=yes +ConstrainRamSpace=yes +ConstrainSwapSpace=no +ConstrainDevices=yes diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl new file mode 100644 index 0000000000..5cf8ddb7e9 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl @@ -0,0 +1,102 @@ +SCRIPTS_DIR = "{scripts_dir}" +NO_VAL = 4294967294 +--util.py exit code +PART_INVALID = -1 --partition does not exists in config.yaml, thus do not exist in slurm +DIFF_VMCOUNTS_SAME_PART = -2 --in the same partition there are nodesets with different vmcounts +DIFF_PART_DIFFERENT_VMCOUNTS = -3 --partition is a list of partitions in which at least two of them have different vmcount +UNKWOWN_ERROR = -4 --util.py did not return a valid response + +function get_part(job_desc,part_list) + if job_desc.partition then + return job_desc.partition + end + for name,val in pairs(part_list) do + if val.flag_default == 1 then + return name + end + end + return nil +end + +function os.capture(cmd, raw) + local handle = assert(io.popen(cmd, 'r')) + local output = assert(handle:read('*a')) + handle:close() + return output +end + +function get_vmcount(part) + local cmd = SCRIPTS_DIR .. "/util.py -p " .. part + local out = os.capture(cmd,true) + for line in out:gmatch("(.-)\r?\n") do + local tag, val = line:match("([^:]+):([^:]+)") + if tag == "VMCOUNT" then + return tonumber(val) + end + end + return UNKWOWN_ERROR +end + + +function slurm_job_submit(job_desc, part_list, submit_uid) + local part = get_part(job_desc,part_list) + local vmcount = get_vmcount(part) + --Only do something if the job is in a TPU partition, if vmcount is 0, it implies that the partition(s) specified are not TPU ones + if vmcount == 0 then + return slurm.SUCCESS + end + --This is a TPU job, but as the vmcount is 1 it can he handled the same way + if vmcount == 1 then + return slurm.SUCCESS + end + --Check for errors + if vmcount == PART_INVALID then + slurm.log_user("Invalid partition specified " .. 
part) + return slurm.FAILURE + end + if vmcount == DIFF_VMCOUNTS_SAME_PART then + slurm.log_user("In partition(s) " .. part .. " there are more than one tpu nodeset vmcount, this should not happen.") + return slurm.ERROR + end + if vmcount == DIFF_PART_DIFFERENT_VMCOUNTS then + slurm.log_user("In partition list " .. part .. " there are more than one TPU types, cannot determine which is the correct vmcount to use, please retry with only one partition.") + return slurm.FAILURE + end + if vmcount == UNKWOWN_ERROR then + slurm.log_user("Something went wrong while executing util.py to get the vmcount.") + return slurm.ERROR + end + --This is surely a TPU node + if vmcount > 1 then + local min_nodes = job_desc.min_nodes + local max_nodes = job_desc.max_nodes + --if not specified assume it is one, this should be improved taking into account the cpus, mem, and other factors + if min_nodes == NO_VAL then + min_nodes = 1 + max_nodes = 1 + end + --as max_nodes can be higher than the nodes in the partition, we are not able to calculate with certainty the nodes that this job will have if this value is set to something + --different than min_nodes + if min_nodes ~= max_nodes then + slurm.log_user("Max nodes cannot be set different than min nodes for the TPU partitions.") + return slurm.ERROR + end + --Set the number of switches to the number of nodes originally requested by the job, as the job requests "TPU groups" + job_desc.req_switch = min_nodes + + --Apply the node increase into the job description. + job_desc.min_nodes = min_nodes * vmcount + job_desc.max_nodes = max_nodes * vmcount + --if job_desc.features then + --slurm.log_user("Features: %s",job_desc.features) + --end + end + + return slurm.SUCCESS +end + +function slurm_job_modify(job_desc, job_rec, part_list, modify_uid) + return slurm.SUCCESS +end + +return slurm.SUCCESS diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/slurm.conf.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/slurm.conf.tpl new file mode 100644 index 0000000000..7d32bed8fc --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/slurm.conf.tpl @@ -0,0 +1,67 @@ +# slurm.conf +# https://slurm.schedmd.com/slurm.conf.html +# https://slurm.schedmd.com/configurator.html + +ProctrackType=proctrack/cgroup +SlurmctldPidFile=/var/run/slurm/slurmctld.pid +SlurmdPidFile=/var/run/slurm/slurmd.pid +TaskPlugin=task/affinity,task/cgroup +MaxNodeCount=64000 + +# +# +# SCHEDULING +SchedulerType=sched/backfill +SelectType=select/cons_tres +SelectTypeParameters=CR_Core_Memory + +# +# +# LOGGING AND ACCOUNTING +AccountingStoreFlags=job_comment +JobAcctGatherFrequency=30 +JobAcctGatherType=jobacct_gather/cgroup +SlurmctldDebug=info +SlurmdDebug=info +DebugFlags=Power + +# +# +# TIMERS +MessageTimeout=60 + +################################################################################ +# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv # +################################################################################ + +SlurmctldHost={control_host}({control_addr}) + +AuthType=auth/munge +AuthInfo=cred_expire=120 +AuthAltTypes=auth/jwt +CredType=cred/munge +MpiDefault={mpi_default} +ReturnToService=2 +SlurmctldPort={control_host_port} +SlurmdPort=6818 +SlurmdSpoolDir=/var/spool/slurmd +SlurmUser=slurm +StateSaveLocation={state_save} + +# +# +# LOGGING AND ACCOUNTING +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageHost={control_host} 
+ClusterName={name} +SlurmctldLogFile={slurmlog}/slurmctld.log +SlurmdLogFile={slurmlog}/slurmd-%n.log + +# +# +# GENERATED CLOUD CONFIGURATIONS +include cloud.conf + +################################################################################ +# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ # +################################################################################ diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/slurmdbd.conf.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/slurmdbd.conf.tpl new file mode 100644 index 0000000000..ba06f28bf3 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/slurmdbd.conf.tpl @@ -0,0 +1,31 @@ +# slurmdbd.conf +# https://slurm.schedmd.com/slurmdbd.conf.html + +DebugLevel=info +PidFile=/var/run/slurm/slurmdbd.pid + +################################################################################ +# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv # +################################################################################ + +AuthType=auth/munge +AuthAltTypes=auth/jwt +AuthAltParameters=jwt_key={state_save}/jwt_hs256.key + +DbdHost={control_host} + +LogFile={slurmlog}/slurmdbd.log + +SlurmUser=slurm + +StorageLoc={db_name} + +StorageType=accounting_storage/mysql +StorageHost={db_host} +StoragePort={db_port} +StorageUser={db_user} +StoragePass={db_pass} + +################################################################################ +# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ # +################################################################################ diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index 896c17dc36..8963baab17 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -15,7 +15,7 @@ */ locals { - scripts_dir = abspath("${path.module}/../../../../scripts") + scripts_dir = abspath("${path.module}/scripts") bucket_dir = coalesce(var.bucket_dir, format("%s-files", var.slurm_cluster_name)) } @@ -106,7 +106,7 @@ locals { x_nodeset_tpu = toset([for k, v in local.nodeset_tpu : v.nodeset_name]) x_nodeset_overlap = setintersection([], local.x_nodeset, local.x_nodeset_dyn, local.x_nodeset_tpu) - etc_dir = abspath("${path.module}/../../../../etc") + etc_dir = abspath("${path.module}/etc") bucket_path = format("%s/%s", data.google_storage_bucket.this.url, local.bucket_dir) @@ -138,7 +138,7 @@ resource "google_storage_bucket_object" "config" { ######### locals { - build_dir = abspath("${path.module}/../../../../build") + build_dir = abspath("${path.module}/build") slurm_gcp_devel_zip = "slurm-gcp-devel.zip" slurm_gcp_devel_zip_bucket = format("%s/%s", local.bucket_dir, local.slurm_gcp_devel_zip) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index 70dfa04d81..f876827a4c 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -1,4 +1,18 @@ #!/usr/bin/env python3 +# Copyright 
2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import os diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/README.md index 7c73936327..57664beb14 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/README.md @@ -32,7 +32,7 @@ def post_main_resume_nodes(*pos_args, **keyword_args): and extract arguments from `keyword_args`. Check the callback sites to understand which values that are available. -### Current callback sites: +### Current callback sites Callbacks are currently performed from the following places: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/__init__.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/__init__.py index a4f11079b1..c56793c4be 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/__init__.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import importlib import pkgutil import logging diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/__init__.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/__init__.py index 6505f8f47e..6e1f8dfae7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/__init__.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/max_hops/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import logging import sys import slurm_gcp_plugins.utils as sgp_utils diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/test_plugin/__init__.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/test_plugin/__init__.py index deb53f7aa9..67dbd5d408 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/test_plugin/__init__.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/test_plugin/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import logging instance_information_fields = ["resourceStatus", "id"] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/utils/__init__.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/utils/__init__.py index 6977fb5c93..d24e38aa25 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/utils/__init__.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurm_gcp_plugins/utils/__init__.py @@ -1,3 +1,17 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import subprocess import logging diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/startup.sh b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/startup.sh deleted file mode 100755 index a5ee3bc413..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/startup.sh +++ /dev/null @@ -1,197 +0,0 @@ -#!/bin/bash -# Copyright (C) SchedMD LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e - -SLURM_DIR=/slurm -FLAGFILE=$SLURM_DIR/slurm_configured_do_not_remove -SCRIPTS_DIR=$SLURM_DIR/scripts -if [[ -z "$HOME" ]]; then - # google-startup-scripts.service lacks environment variables - HOME="$(getent passwd "$(whoami)" | cut -d: -f6)" -fi - -METADATA_SERVER="metadata.google.internal" -URL="http://$METADATA_SERVER/computeMetadata/v1" -HEADER="Metadata-Flavor:Google" -CURL="curl -sS --fail --header $HEADER" -UNIVERSE_DOMAIN="$($CURL $URL/instance/attributes/universe_domain)" -STORAGE_CMD="CLOUDSDK_CORE_UNIVERSE_DOMAIN=$UNIVERSE_DOMAIN gcloud storage" - -function devel::zip() { - local BUCKET="$($CURL $URL/instance/attributes/slurm_bucket_path)" - if [[ -z $BUCKET ]]; then - echo "ERROR: No bucket path detected." - return 1 - fi - - local SLURM_ZIP_URL="$BUCKET/slurm-gcp-devel.zip" - local SLURM_ZIP_FILE="$HOME/slurm-gcp-devel.zip" - local SLURM_ZIP_DIR="$HOME/slurm-gcp-devel" - eval $(bash -c "$STORAGE_CMD cp $SLURM_ZIP_URL $SLURM_ZIP_FILE") - if ! [[ -f "$SLURM_ZIP_FILE" ]]; then - echo "INFO: No development files downloaded. Skipping." - return 0 - fi - unzip -o "$SLURM_ZIP_FILE" -d "$SCRIPTS_DIR" - rm -rf "$SLURM_ZIP_FILE" "$SLURM_ZIP_DIR" # Clean up - echo "INFO: Finished inflating '$SLURM_ZIP_FILE'." - - #temporary hack to not make the script fail on TPU vm - chown slurm:slurm -R "$SCRIPTS_DIR" || true - chmod 700 -R "$SCRIPTS_DIR" - echo "INFO: Updated permissions of files in '$SCRIPTS_DIR'." -} - -function config() { - local BUCKET="$($CURL $URL/instance/attributes/slurm_bucket_path)" - if [[ -z $BUCKET ]]; then - echo "ERROR: No bucket path detected." - return 1 - fi - - local SLURM_CONFIG_URL="$BUCKET/config.yaml" - local SLURM_CONFIG_FILE="$SCRIPTS_DIR/config.yaml" - eval $(bash -c "$STORAGE_CMD cp $SLURM_CONFIG_URL $SLURM_CONFIG_FILE") - if ! [[ -f "$SLURM_CONFIG_FILE" ]]; then - echo "INFO: No config file downloaded. Skipping." - return 0 - fi - - #temporary hack to not make the script fail on TPU vm - chown slurm:slurm -R "$SLURM_CONFIG_FILE" || true - chmod 600 -R "$SLURM_CONFIG_FILE" - echo "INFO: Updated permissions of '$SLURM_CONFIG_FILE'." -} - -PING_METADATA="ping -q -w1 -c1 $METADATA_SERVER" -echo "INFO: $PING_METADATA" -for i in $(seq 10); do - [ $i -gt 1 ] && sleep 5; - $PING_METADATA > /dev/null && s=0 && break || s=$?; - echo "ERROR: Failed to contact metadata server, will retry" -done -if [ $s -ne 0 ]; then - echo "ERROR: Unable to contact metadata server, aborting" - wall -n '*** Slurm setup failed in the startup script! 
see `journalctl -u google-startup-scripts` ***' - exit 1 -else - echo "INFO: Successfully contacted metadata server" -fi - -GOOGLE_DNS=8.8.8.8 -PING_GOOGLE="ping -q -w1 -c1 $GOOGLE_DNS" -echo "INFO: $PING_GOOGLE" -for i in $(seq 5); do - [ $i -gt 1 ] && sleep 2; - $PING_GOOGLE > /dev/null && s=0 && break || s=$?; - echo "failed to ping Google DNS, will retry" -done -if [ $s -ne 0 ]; then - echo "WARNING: No internet access detected" -else - echo "INFO: Internet access detected" -fi - -mkdir -p $SCRIPTS_DIR - -SETUP_SCRIPT_FILE=$SCRIPTS_DIR/setup.py -UTIL_SCRIPT_FILE=$SCRIPTS_DIR/util.py - -devel::zip -config - -if [ -f $FLAGFILE ]; then - echo "WARNING: Slurm was previously configured, quitting" - exit 0 -fi -touch $FLAGFILE - -function tpu_setup { - #allow the following command to fail, as this attribute does not exist for regular nodes - docker_image=$($CURL $URL/instance/attributes/slurm_docker_image 2> /dev/null || true) - if [ -z $docker_image ]; then #Not a tpu node, do not do anything - return - fi - if [ "$OS_ENV" == "slurm_container" ]; then #Already inside the slurm container, we should continue starting - return - fi - - #given a input_string like "WORKER_0:Joseph;WORKER_1:richard;WORKER_2:edward;WORKER_3:john" and a number 1, this function will print richard - parse_metadata() { - local number=$1 - local input_string=$2 - local word=$(echo "$input_string" | awk -v n="$number" -F ':|;' '{ for (i = 1; i <= NF; i+=2) if ($(i) == "WORKER_"n) print $(i+1) }') - echo "$word" - } - - input_string=$($CURL $URL/instance/attributes/slurm_names) - worker_id=$($CURL $URL/instance/attributes/tpu-env | awk '/WORKER_ID/ {print $2}' | tr -d \') - real_name=$(parse_metadata $worker_id $input_string) - - #Prepare to docker pull with gcloud - mkdir -p /root/.docker - cat << EOF > /root/.docker/config.json -{ - "credHelpers": { - "gcr.io": "gcloud", - "us-docker.pkg.dev": "gcloud" - } -} -EOF - #cgroup detection - CGV=1 - CGROUP_FLAGS="-v /sys/fs/cgroup:/sys/fs/cgroup:rw" - if [ -f /sys/fs/cgroup/cgroup.controllers ]; then #CGV2 - CGV=2 - fi - if [ $CGV == 2 ]; then - CGROUP_FLAGS="--cgroup-parent=docker.slice --cgroupns=private --tmpfs /run --tmpfs /run/lock --tmpfs /tmp" - if [ ! 
-f /etc/systemd/system/docker.slice ]; then #In case that there is no slice prepared for hosting the containers create it - printf "[Unit]\nDescription=docker slice\nBefore=slices.target\n[Slice]\nCPUAccounting=true\nMemoryAccounting=true" > /etc/systemd/system/docker.slice - systemctl start docker.slice - fi - fi - #for the moment always use --privileged, as systemd might not work properly otherwise - TPU_FLAGS="--privileged" - # TPU_FLAGS="--cap-add SYS_RESOURCE --device /dev/accel0 --device /dev/accel1 --device /dev/accel2 --device /dev/accel3" - # if [ $CGV == 2 ]; then #In case that we are in CGV2 for systemd to work correctly for the moment we go with privileged - # TPU_FLAGS="--privileged" - # fi - - docker run -d $CGROUP_FLAGS $TPU_FLAGS --net=host --name=slurmd --hostname=$real_name --entrypoint=/usr/bin/systemd --restart unless-stopped $docker_image - exit 0 -} - -tpu_setup #will do nothing for normal nodes or the container spawned inside TPU - -function fetch_feature { - if slurmd_feature="$($CURL $URL/instance/attributes/slurmd_feature)"; then - echo "$slurmd_feature" - else - echo "" - fi -} -SLURMD_FEATURE="$(fetch_feature)" - -echo "INFO: Running python cluster setup script" -chmod +x $SETUP_SCRIPT_FILE -python3 $SCRIPTS_DIR/util.py -if [[ -n "$SLURMD_FEATURE" ]]; then - echo "INFO: Running dynamic node setup." - exec $SETUP_SCRIPT_FILE --slurmd-feature="$SLURMD_FEATURE" -else - exec $SETUP_SCRIPT_FILE -fi diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py index fc1f249cf0..0b114e03e7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py @@ -1,3 +1,17 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Optional import mock import sys diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index 9c3a03c210..c1c9c7182d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -1,3 +1,17 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import pytest diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index 1cffbb307f..9872f8f5d6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -244,10 +244,10 @@ EOD variable "disable_default_mounts" { description = <<-EOD Disable default global network storage from the controller - * /usr/local/etc/slurm - * /etc/munge - * /home - * /apps + - /usr/local/etc/slurm + - /etc/munge + - /home + - /apps If these are disabled, the slurm etc and munge dirs must be added manually, or some other mechanism must be used to synchronize the slurm conf files and the munge key across the cluster. @@ -259,11 +259,11 @@ variable "disable_default_mounts" { variable "network_storage" { description = < Date: Tue, 9 Jul 2024 18:11:31 +0000 Subject: [PATCH 018/118] Sync scripts changes --- .../modules/slurm_files/scripts/slurmsync.py | 2 +- .../modules/slurm_files/scripts/util.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 9104be7ea3..53af894c32 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -155,7 +155,7 @@ def _find_tpu_node_status(nodename, state): & state.flags ): return NodeStatus.unbacked - if nodename in find_node_status.static_nodeset: + if lkp.is_static_node(nodename): return NodeStatus.resume elif ( state is not None diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index c77bcd9932..831748a201 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -175,8 +175,14 @@ def authentication_project(): return google.auth.default()[1] +DEFAULT_UNIVERSE_DOMAIN = "googleapis.com" + + def universe_domain() -> str: - return instance_metadata("attributes/universe_domain") + try: + return instance_metadata("attributes/universe_domain") + except Exception: + return DEFAULT_UNIVERSE_DOMAIN def endpoint_version(api: ApiEndpoint) -> Optional[str]: @@ -204,6 +210,8 @@ def create_client_options(api: ApiEndpoint = None) -> ClientOptions: ep = None ver = endpoint_version(api) ud = universe_domain() + if ud == DEFAULT_UNIVERSE_DOMAIN: + ud = None if ver: ep = f"https://{api.value}.{ud}/{ver}/" log.debug( @@ -469,7 +477,7 @@ def build_request(http, *args, **kwargs): disc_url 
= googleapiclient.discovery.DISCOVERY_URI if ver: version = ver - disc_url = disc_url.replace("googleapis.com", universe_domain()) + disc_url = disc_url.replace(DEFAULT_UNIVERSE_DOMAIN, universe_domain()) log.debug(f"Using version={version} of Google Compute Engine API") return googleapiclient.discovery.build( @@ -1519,6 +1527,7 @@ def create_node(self, nodename): "slurm_cluster_name": lkp.cfg.slurm_cluster_name, "slurm_bucket_path": lkp.cfg.bucket_path, "slurm_names": ";".join(slurm_names), + "universe_domain": universe_domain(), } node.tags = [lkp.cfg.slurm_cluster_name] if self.nodeset.service_account: From 97cdcc534bba6c4253ee2be542d8d09c18aef1ac Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Sat, 13 Jul 2024 00:55:27 +0000 Subject: [PATCH 019/118] fix integration test --- tools/cloud-build/daily-tests/builds/gke.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/builds/gke.yaml b/tools/cloud-build/daily-tests/builds/gke.yaml index 6ef10c5859..709a2b5c1b 100644 --- a/tools/cloud-build/daily-tests/builds/gke.yaml +++ b/tools/cloud-build/daily-tests/builds/gke.yaml @@ -47,7 +47,7 @@ steps: echo ' zone: us-central1-a' >> $${SG_EXAMPLE} echo ' - id: ubuntu_pool' >> $${SG_EXAMPLE} - echo ' source: compute/gke-node-pool' >> $${SG_EXAMPLE} + echo ' source: modules/compute/gke-node-pool' >> $${SG_EXAMPLE} echo ' use: [gke_cluster]' >> $${SG_EXAMPLE} echo ' settings: {name: ubuntu, image_type: UBUNTU_CONTAINERD}' >> $${SG_EXAMPLE} From 518628402a977b2ba9602be6354d6948e5809120 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Sat, 13 Jul 2024 01:11:02 +0000 Subject: [PATCH 020/118] fix test --- modules/scheduler/pre-existing-gke-cluster/README.md | 2 +- tools/cloud-build/daily-tests/builds/ml-gke.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md index 73a75b1117..e4a40c2315 100644 --- a/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/modules/scheduler/pre-existing-gke-cluster/README.md @@ -24,7 +24,7 @@ GKE node pool will be created. 
region: us-central1 - id: compute_pool - source: community/modules/compute/gke-node-pool + source: modules/compute/gke-node-pool use: [existing-gke-cluster] ``` diff --git a/tools/cloud-build/daily-tests/builds/ml-gke.yaml b/tools/cloud-build/daily-tests/builds/ml-gke.yaml index 83da847769..4336929f16 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-gke.yaml @@ -47,7 +47,7 @@ steps: echo ' zone: asia-southeast1-b' >> $${SG_EXAMPLE} echo ' - id: ubuntu_pool' >> $${SG_EXAMPLE} - echo ' source: community/modules/compute/gke-node-pool' >> $${SG_EXAMPLE} + echo ' source: modules/compute/gke-node-pool' >> $${SG_EXAMPLE} echo ' use: [gke_cluster]' >> $${SG_EXAMPLE} echo ' settings: {name: ubuntu, image_type: UBUNTU_CONTAINERD}' >> $${SG_EXAMPLE} From 5e6f65914e2aa9c5b0740cb1fc8ed9ca63c58dc9 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 12 Jul 2024 23:58:27 +0000 Subject: [PATCH 021/118] Sync latest scripts --- .../schedmd-slurm-gcp-v6-controller/README.md | 14 +++++------ .../modules/slurm_files/scripts/util.py | 23 ++++++------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 7e9a5ee96e..53d1ad8d41 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -196,14 +196,14 @@ limitations under the License. |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.9 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.9 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.13 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.13 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.9 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.9 | -| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.5.9 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.9 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.5.9 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | 
github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.13 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.13 | +| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.5.13 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.13 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.5.13 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 831748a201..e2d9c7103c 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -207,25 +207,16 @@ def get_credentials() -> Optional[service_account.Credentials]: def create_client_options(api: ApiEndpoint = None) -> ClientOptions: """Create client options for cloud endpoints""" - ep = None ver = endpoint_version(api) ud = universe_domain() - if ud == DEFAULT_UNIVERSE_DOMAIN: - ud = None + options = {} + if ud and ud != DEFAULT_UNIVERSE_DOMAIN: + options["universe_domain"] = ud if ver: - ep = f"https://{api.value}.{ud}/{ver}/" - log.debug( - f"Using universe domain: {ud}. " - + ( - f"For API: {api.value} using API endpoint: " f"{ep if ep else 'default'}" - if api - else "" - ) - ) - return ClientOptions( - universe_domain=ud, - api_endpoint=ep, - ) + options["api_endpoint"] = f"https://{api.value}.{ud}/{ver}/" + co = ClientOptions(**options) + log.debug(f"Using ClientOptions = {co} for API: {api.value}") + return co class LogFormatter(logging.Formatter): From 2a48c8ff9914395b7530c5884e4468485dfcd024 Mon Sep 17 00:00:00 2001 From: Tom Kelly Date: Mon, 15 Jul 2024 09:23:14 +0100 Subject: [PATCH 022/118] Pass 'spot' and 'termination_action' partition variables to the instance template in schedmd-slurm-gcp-v6-controller --- .../scheduler/schedmd-slurm-gcp-v6-controller/partition.tf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 11de8a21cd..c51188859b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -53,6 +53,8 @@ module "slurm_nodeset_template" { name_prefix = each.value.nodeset_name on_host_maintenance = each.value.on_host_maintenance preemptible = each.value.preemptible + spot = each.value.spot + termination_action = each.value.termination_action service_account = each.value.service_account shielded_instance_config = each.value.shielded_instance_config source_image_family = each.value.source_image_family From 0c42c80989e30d9ff74a6ce03540838613662b7e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:42:26 +0000 Subject: [PATCH 023/118] Bump google.golang.org/api from 0.187.0 to 0.188.0 Bumps 
[google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.187.0 to 0.188.0. - [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.187.0...v0.188.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 20 ++++++++++---------- go.sum | 44 ++++++++++++++++++++++---------------------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/go.mod b/go.mod index b2c7a02410..735db691a7 100644 --- a/go.mod +++ b/go.mod @@ -15,7 +15,7 @@ require ( github.com/spf13/cobra v1.8.1 github.com/zclconf/go-cty v1.14.4 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa - google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d // indirect + google.golang.org/genproto v0.0.0-20240708141625-4ad9e859172b // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) @@ -27,11 +27,11 @@ require ( github.com/hashicorp/terraform-exec v0.21.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940 - google.golang.org/api v0.187.0 + google.golang.org/api v0.188.0 ) require ( - cloud.google.com/go/auth v0.6.1 // indirect + cloud.google.com/go/auth v0.7.0 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect dario.cat/mergo v1.0.0 // indirect github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect @@ -54,14 +54,14 @@ require ( golang.org/x/sync v0.7.0 // indirect golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240701130421-f6361c86f094 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240708141625-4ad9e859172b // indirect ) require ( cloud.google.com/go v0.115.0 // indirect - cloud.google.com/go/compute/metadata v0.3.0 // indirect - cloud.google.com/go/iam v1.1.8 // indirect + cloud.google.com/go/compute/metadata v0.4.0 // indirect + cloud.google.com/go/iam v1.1.10 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect github.com/ProtonMail/go-crypto v1.1.0-alpha.2 // indirect github.com/agext/levenshtein v1.2.3 @@ -95,12 +95,12 @@ require ( github.com/ulikunitz/xz v0.5.10 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.24.0 // indirect - golang.org/x/net v0.26.0 // indirect + golang.org/x/crypto v0.25.0 // indirect + golang.org/x/net v0.27.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect golang.org/x/sys v0.22.0 golang.org/x/text v0.16.0 // indirect - google.golang.org/grpc v1.64.0 // indirect + google.golang.org/grpc v1.64.1 // indirect google.golang.org/protobuf v1.34.2 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect ) diff --git a/go.sum b/go.sum index 4860b8756b..78bd6dffb1 100644 --- a/go.sum +++ b/go.sum @@ -46,8 +46,8 @@ cloud.google.com/go/asset v1.8.0/go.mod h1:mUNGKhiqIdbr8X7KNayoYvyc4HbbFO9URsjby cloud.google.com/go/assuredworkloads v1.5.0/go.mod h1:n8HOZ6pff6re5KYfBXcFvSViQjDwxFkAkmUFffJRbbY= cloud.google.com/go/assuredworkloads v1.6.0/go.mod 
h1:yo2YOk37Yc89Rsd5QMVECvjaMKymF9OP+QXWlKXUkXw= cloud.google.com/go/assuredworkloads v1.7.0/go.mod h1:z/736/oNmtGAyU47reJgGN+KVoYoxeLBoj4XkKYscNI= -cloud.google.com/go/auth v0.6.1 h1:T0Zw1XM5c1GlpN2HYr2s+m3vr1p2wy+8VN+Z1FKxW38= -cloud.google.com/go/auth v0.6.1/go.mod h1:eFHG7zDzbXHKmjJddFG/rBlcGp6t25SwRUiEQSlO4x4= +cloud.google.com/go/auth v0.7.0 h1:kf/x9B3WTbBUHkC+1VS8wwwli9TzhSt0vSTVBmMR8Ts= +cloud.google.com/go/auth v0.7.0/go.mod h1:D+WqdrpcjmiCgWrXmLLxOVq1GACoE36chW6KXoEvuIw= cloud.google.com/go/auth/oauth2adapt v0.2.2 h1:+TTV8aXpjeChS9M+aTtN/TjdQnzJvmzKFt//oWu7HX4= cloud.google.com/go/auth/oauth2adapt v0.2.2/go.mod h1:wcYjgpZI9+Yu7LyYBg4pqSiaRkfEK3GQcpb7C/uyF1Q= cloud.google.com/go/automl v1.5.0/go.mod h1:34EjfoFGMZ5sgJ9EoLsRtdPSNZLcfflJR39VbVNS2M0= @@ -72,8 +72,8 @@ cloud.google.com/go/compute v1.6.0/go.mod h1:T29tfhtVbq1wvAPo0E3+7vhgmkOYeXjhFvz cloud.google.com/go/compute v1.6.1/go.mod h1:g85FgpzFvNULZ+S8AYq87axRKuf2Kh7deLqV/jJ3thU= cloud.google.com/go/compute v1.7.0/go.mod h1:435lt8av5oL9P3fv1OEzSbSUe+ybHXGMPQHHZWZxy9U= cloud.google.com/go/compute v1.10.0/go.mod h1:ER5CLbMxl90o2jtNbGSbtfOpQKR0t15FOtRsugnLrlU= -cloud.google.com/go/compute/metadata v0.3.0 h1:Tz+eQXMEqDIKRsmY3cHTL6FVaynIjX2QxYC4trgAKZc= -cloud.google.com/go/compute/metadata v0.3.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k= +cloud.google.com/go/compute/metadata v0.4.0 h1:vHzJCWaM4g8XIcm8kopr3XmDA4Gy/lblD3EhhSux05c= +cloud.google.com/go/compute/metadata v0.4.0/go.mod h1:SIQh1Kkb4ZJ8zJ874fqVkslA29PRXuleyj6vOzlbK7M= cloud.google.com/go/containeranalysis v0.5.1/go.mod h1:1D92jd8gRR/c0fGMlymRgxWD3Qw9C1ff6/T7mLgVL8I= cloud.google.com/go/containeranalysis v0.6.0/go.mod h1:HEJoiEIu+lEXM+k7+qLCci0h33lX3ZqoYFdmPcoO7s4= cloud.google.com/go/datacatalog v1.3.0/go.mod h1:g9svFY6tuR+j+hrTw3J2dNcmI0dzmSiyOzm8kpLq0a0= @@ -111,8 +111,8 @@ cloud.google.com/go/gkehub v0.10.0/go.mod h1:UIPwxI0DsrpsVoWpLB0stwKCP+WFVG9+y97 cloud.google.com/go/grafeas v0.2.0/go.mod h1:KhxgtF2hb0P191HlY5besjYm6MqTSTj3LSI+M+ByZHc= cloud.google.com/go/iam v0.3.0/go.mod h1:XzJPvDayI+9zsASAFO68Hk07u3z+f+JrT2xXNdp4bnY= cloud.google.com/go/iam v0.5.0/go.mod h1:wPU9Vt0P4UmCux7mqtRu6jcpPAb74cP1fh50J3QpkUc= -cloud.google.com/go/iam v1.1.8 h1:r7umDwhj+BQyz0ScZMp4QrGXjSTI3ZINnpgU2nlB/K0= -cloud.google.com/go/iam v1.1.8/go.mod h1:GvE6lyMmfxXauzNq8NbgJbeVQNspG+tcdL/W8QO1+zE= +cloud.google.com/go/iam v1.1.10 h1:ZSAr64oEhQSClwBL670MsJAW5/RLiC6kfw3Bqmd5ZDI= +cloud.google.com/go/iam v1.1.10/go.mod h1:iEgMq62sg8zx446GCaijmA2Miwg5o3UbO+nI47WHJps= cloud.google.com/go/language v1.4.0/go.mod h1:F9dRpNFQmJbkaop6g0JhSBXCNlO90e1KWx5iDdxbWic= cloud.google.com/go/language v1.6.0/go.mod h1:6dJ8t3B+lUYfStgls25GusK04NLh3eDLQnWM3mdEbhI= cloud.google.com/go/lifesciences v0.5.0/go.mod h1:3oIKy8ycWGPUyZDR/8RNnTOYevhaMLqh5vLUXs9zvT8= @@ -529,8 +529,8 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI= -golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= +golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30= +golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M= 
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -619,8 +619,8 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug golang.org/x/net v0.0.0-20220909164309-bea034e7d591/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/net v0.0.0-20221014081412-f15817d10f9b/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= -golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= -golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= +golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys= +golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -737,8 +737,8 @@ golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= -golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= +golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -868,8 +868,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.187.0 h1:Mxs7VATVC2v7CY+7Xwm4ndkX71hpElcvx0D1Ji/p1eo= -google.golang.org/api v0.187.0/go.mod h1:KIHlTc4x7N7gKKuVsdmfBXN13yEEWXWFURWY6SBp2gk= +google.golang.org/api v0.188.0 h1:51y8fJ/b1AaaBRJr4yWm96fPcuxSo0JcegXE3DaHQHw= +google.golang.org/api v0.188.0/go.mod h1:VR0d+2SIiWOYG3r/jdm7adPW9hI2aRv9ETOSCQ9Beag= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -978,12 +978,12 @@ google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqw google.golang.org/genproto v0.0.0-20221014173430-6e2ab493f96b/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= 
google.golang.org/genproto v0.0.0-20221014213838-99cd37c6964a/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71/go.mod h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s= -google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d h1:PksQg4dV6Sem3/HkBX+Ltq8T0ke0PKIRBNBatoDTVls= -google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:s7iA721uChleev562UJO2OYB0PPT9CMFjV+Ce7VJH5M= -google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 h1:MuYw1wJzT+ZkybKfaOXKp5hJiZDn2iHaXRw0mRYdHSc= -google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4/go.mod h1:px9SlOOZBg1wM1zdnr8jEL4CNGUBZ+ZKYtNPApNQc4c= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d h1:k3zyW3BYYR30e8v3x0bTDdE9vpYFjZHK+HcyqkrppWk= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= +google.golang.org/genproto v0.0.0-20240708141625-4ad9e859172b h1:dSTjko30weBaMj3eERKc0ZVXW4GudCswM3m+P++ukU0= +google.golang.org/genproto v0.0.0-20240708141625-4ad9e859172b/go.mod h1:FfBgJBJg9GcpPvKIuHSZ/aE1g2ecGL74upMzGZjiGEY= +google.golang.org/genproto/googleapis/api v0.0.0-20240701130421-f6361c86f094 h1:0+ozOGcrp+Y8Aq8TLNN2Aliibms5LEzsq99ZZmAGYm0= +google.golang.org/genproto/googleapis/api v0.0.0-20240701130421-f6361c86f094/go.mod h1:fJ/e3If/Q67Mj99hin0hMhiNyCRmt6BQ2aWIJshUSJw= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240708141625-4ad9e859172b h1:04+jVzTs2XBnOZcPsLnmrTGqltqJbZQ1Ey26hjYdQQ0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240708141625-4ad9e859172b/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -1019,8 +1019,8 @@ google.golang.org/grpc v1.48.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACu google.golang.org/grpc v1.49.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.1/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= -google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY= -google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg= +google.golang.org/grpc v1.64.1 h1:LKtvyfbX3UGVPFcGqJ9ItpVWW6oN/2XqTxfAnwRRXiA= +google.golang.org/grpc v1.64.1/go.mod h1:hiQF4LFZelK2WKaP6W0L92zGHtiQdZxk8CrSdvyjeP0= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= From d66dd80f1abfb1f086bb3c5707097d2329fb5577 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Tue, 2 Jul 2024 20:15:38 +0000 Subject: [PATCH 024/118] Merge configs from blueprint guest_accelerator with standard guest_accelerator * Updated node_pool README to reflect the guest_accelerator changes for gpu families --- .../modules/compute/gke-node-pool/README.md | 52 +++++++++++-------- .../modules/compute/gke-node-pool/main.tf | 10 ++-- .../compute/gke-node-pool/variables.tf | 18 
+++---- examples/README.md | 19 +++----
 4 files changed, 51 insertions(+), 48 deletions(-)

diff --git a/community/modules/compute/gke-node-pool/README.md b/community/modules/compute/gke-node-pool/README.md
index fa2b31a761..153e53f186 100644
--- a/community/modules/compute/gke-node-pool/README.md
+++ b/community/modules/compute/gke-node-pool/README.md
@@ -78,8 +78,8 @@ kernel modules to be loaded.
 There are several ways to add GPUs to a GKE node pool. See
 [docs](https://cloud.google.com/compute/docs/gpus) for more info on GPUs.

-The following is a node pool that uses `a2` or `g2` machine types which has a
-fixed number of attached GPUs:
+The following is a node pool that uses `a2`, `a3` or `g2` machine types, which have a
+fixed number of attached GPUs; we refer to these machine types as "pre-defined GPU machine families":

 ```yaml
   - id: simple-a2-pool
@@ -90,8 +90,10 @@ fixed number of attached GPUs:
 ```

 > **Note**: It is not necessary to define the [`guest_accelerator`] setting when
-> using `a2` or `g2` machines as information about GPUs, such as type and count,
-> is automatically inferred from the machine type.
+> using pre-defined GPU machine families as information about GPUs, such as type, count and
+> `gpu_driver_installation_config`, is automatically inferred from the machine type.
+> Optional fields such as `gpu_partition_size` need to be specified only if they have
+> non-default values.

 The following scenarios require the [`guest_accelerator`] block is specified:

@@ -103,6 +105,8 @@ The following is an example of
 [partitioning](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus-multi)
 an A100 GPU:

+> **Note**: In the following example, `type`, `count` and `gpu_driver_installation_config` are picked up automatically.
+
 ```yaml
   - id: multi-instance-gpu-pool
     source: community/modules/compute/gke-node-pool
@@ -110,16 +114,9 @@ an A100 GPU:
     settings:
       machine_type: a2-highgpu-1g
       guest_accelerator:
-      - type: nvidia-tesla-a100
-        count: 1
-        gpu_partition_size: 1g.5gb
-        gpu_sharing_config: null
-        gpu_driver_installation_config: null
+      - gpu_partition_size: 1g.5gb
 ```

-> **Note**: Once we define the [`guest_accelerator`] block, all fields must be
-> defined. Use `null` for optional fields. 
- [`guest_accelerator`]: https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#nested_guest_accelerator The following is an example of @@ -133,16 +130,13 @@ The following is an example of settings: machine_type: a2-highgpu-1g guest_accelerator: - - type: nvidia-tesla-a100 - count: 1 - gpu_partition_size: 1g.5gb + - gpu_partition_size: 1g.5gb gpu_sharing_config: - gpu_sharing_strategy: TIME_SHARING max_shared_clients_per_gpu: 3 - gpu_driver_installation_config: null ``` -Finally, the following is an example of using a GPU attached to an `n1` machine: +Following is an example of using a GPU attached to an `n1` machine: ```yaml - id: t4-pool @@ -153,9 +147,25 @@ Finally, the following is an example of using a GPU attached to an `n1` machine: guest_accelerator: - type: nvidia-tesla-t4 count: 2 - gpu_partition_size: null - gpu_sharing_config: null - gpu_driver_installation_config: null +``` + +Finally, the following is an example of using a GPU (with sharing config) attached to an `n1` machine: + +```yaml + - id: n1-t4-pool + source: community/modules/compute/gke-node-pool + use: [gke_cluster] + settings: + name: n1-t4-pool + machine_type: n1-standard-1 + guest_accelerator: + - type: nvidia-tesla-t4 + count: 2 + gpu_driver_installation_config: + - gpu_driver_version: "LATEST" + gpu_sharing_config: + - max_shared_clients_per_gpu: 2 + gpu_sharing_strategy: "TIME_SHARING" ``` ## License @@ -220,7 +230,7 @@ No modules. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `"pd-standard"` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string
count = number
gpu_driver_installation_config = list(object({
gpu_driver_version = string
}))
gpu_partition_size = string
gpu_sharing_config = list(object({
gpu_sharing_strategy = string
max_shared_clients_per_gpu = number
}))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | diff --git a/community/modules/compute/gke-node-pool/main.tf b/community/modules/compute/gke-node-pool/main.tf index e8bb2f7145..551ba1f5a5 100644 --- a/community/modules/compute/gke-node-pool/main.tf +++ b/community/modules/compute/gke-node-pool/main.tf @@ -87,11 +87,11 @@ resource "google_container_node_pool" "node_pool" { dynamic "guest_accelerator" { for_each = local.guest_accelerator content { - type = guest_accelerator.value.type - count = guest_accelerator.value.count - gpu_driver_installation_config = try(guest_accelerator.value.gpu_driver_installation_config, [{ gpu_driver_version = "DEFAULT" }]) - gpu_partition_size = try(guest_accelerator.value.gpu_partition_size, null) - gpu_sharing_config = try(guest_accelerator.value.gpu_sharing_config, null) + type = coalesce(guest_accelerator.value.type, try(local.generated_guest_accelerator[0].type, "")) + count = coalesce(try(guest_accelerator.value.count, 0) > 0 ? guest_accelerator.value.count : try(local.generated_guest_accelerator[0].count, "0")) + gpu_driver_installation_config = coalescelist(try(guest_accelerator.value.gpu_driver_installation_config, []), [{ gpu_driver_version = "DEFAULT" }]) + gpu_partition_size = try(guest_accelerator.value.gpu_partition_size, "") + gpu_sharing_config = try(guest_accelerator.value.gpu_sharing_config, []) } } diff --git a/community/modules/compute/gke-node-pool/variables.tf b/community/modules/compute/gke-node-pool/variables.tf index 0940d09c38..1d71ba8fa2 100644 --- a/community/modules/compute/gke-node-pool/variables.tf +++ b/community/modules/compute/gke-node-pool/variables.tf @@ -69,16 +69,16 @@ variable "enable_secure_boot" { variable "guest_accelerator" { description = "List of the type and count of accelerator cards attached to the instance." type = list(object({ - type = string - count = number - gpu_driver_installation_config = list(object({ + type = optional(string) + count = optional(number, 0) + gpu_driver_installation_config = optional(list(object({ gpu_driver_version = string - })) - gpu_partition_size = string - gpu_sharing_config = list(object({ - gpu_sharing_strategy = string - max_shared_clients_per_gpu = number - })) + }))) + gpu_partition_size = optional(string) + gpu_sharing_config = optional(list(object({ + gpu_sharing_strategy = optional(string) + max_shared_clients_per_gpu = optional(number) + }))) })) default = null } diff --git a/examples/README.md b/examples/README.md index 6a946e5061..dc03422af8 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1399,32 +1399,26 @@ Toolkit. It includes: Users only need to provide machine type for standard ["a2", "a3" and "g2"] machine families, while the other settings like `type`, `count` , `gpu_driver_installation_config` will default to - machine family specific values. - However, for other standard or custom machine families users will need to provide - the entire configuration as follows: + machine family specific values. 
More on this [gke-node-pool](../community/modules/compute/gke-node-pool/README.md#gpus-examples) ```yaml machine_type: n1-standard-1 guest_accelerator: - type: nvidia-tesla-t4 count: 1 - gpu_partition_size: null - gpu_sharing_config: null - gpu_driver_installation_config: - - gpu_driver_version: "DEFAULT" ``` -Custom g2 pool +Custom g2 pool with custom `guest_accelerator` config ```yaml machine_type: g2-custom-16-55296 +disk_type: pd-balanced guest_accelerator: - type: nvidia-l4 count: 1 - gpu_partition_size: null gpu_sharing_config: - - max_shared_clients_per_gpu: 2 - gpu_sharing_strategy: "TIME_SHARING" + - max_shared_clients_per_gpu: 2 + gpu_sharing_strategy: "TIME_SHARING" gpu_driver_installation_config: - gpu_driver_version: "LATEST" ``` @@ -1434,8 +1428,7 @@ guest_accelerator: GPU node pool. > **Note**: The Kubernetes API server will only allow requests from authorized -> networks. Nvidia drivers are installed on GPU nodes by a DaemonSet created by -> the [`kubernetes-operations`] Terraform module. **You must use the +> networks. **You must use the > `authorized_cidr` variable to supply an authorized network which contains the > IP address of the machine deploying the blueprint, for example > `--vars authorized_cidr=/32`.** This will allow Terraform to From a2634c7e4a716d62ec2397dcbc99ff6a4bef09ee Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 16 Jul 2024 22:01:07 +0000 Subject: [PATCH 025/118] Minor clean up of `slurm_files.bucket_dir` Make it non-optional. --- .../modules/slurm_files/README.md | 2 +- .../modules/slurm_files/main.tf | 20 +++++++++---------- .../modules/slurm_files/variables.tf | 1 - 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index 7599f28af9..8620cf0cb5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -59,7 +59,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. | `string` | `null` | no | +| [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. | `string` | n/a | yes | | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket to use. | `string` | n/a | yes | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | | [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
tree_width = optional(number)
})
| `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index 8963baab17..c3ba3cc4ab 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -16,8 +16,6 @@ locals { scripts_dir = abspath("${path.module}/scripts") - - bucket_dir = coalesce(var.bucket_dir, format("%s-files", var.slurm_cluster_name)) } ######## @@ -93,7 +91,7 @@ locals { } config_yaml = "config.yaml" - config_yaml_bucket = format("%s/%s", local.bucket_dir, local.config_yaml) + config_yaml_bucket = format("%s/%s", var.bucket_dir, local.config_yaml) partitions = { for p in var.partitions[*].partition : p.partition_name => p } @@ -108,7 +106,7 @@ locals { etc_dir = abspath("${path.module}/etc") - bucket_path = format("%s/%s", data.google_storage_bucket.this.url, local.bucket_dir) + bucket_path = format("%s/%s", data.google_storage_bucket.this.url, var.bucket_dir) slurm_control_host_port = coalesce(var.slurm_control_host_port, "6818") @@ -141,7 +139,7 @@ locals { build_dir = abspath("${path.module}/build") slurm_gcp_devel_zip = "slurm-gcp-devel.zip" - slurm_gcp_devel_zip_bucket = format("%s/%s", local.bucket_dir, local.slurm_gcp_devel_zip) + slurm_gcp_devel_zip_bucket = format("%s/%s", var.bucket_dir, local.slurm_gcp_devel_zip) } data "archive_file" "slurm_gcp_devel_zip" { @@ -182,7 +180,7 @@ resource "google_storage_bucket_object" "controller_startup_scripts" { } bucket = var.bucket_name - name = format("%s/slurm-controller-script-%s", local.bucket_dir, each.key) + name = format("%s/slurm-controller-script-%s", var.bucket_dir, each.key) content = each.value.content } @@ -193,7 +191,7 @@ resource "google_storage_bucket_object" "compute_startup_scripts" { } bucket = var.bucket_name - name = format("%s/slurm-compute-script-%s", local.bucket_dir, each.key) + name = format("%s/slurm-compute-script-%s", var.bucket_dir, each.key) content = each.value.content } @@ -207,7 +205,7 @@ resource "google_storage_bucket_object" "nodeset_startup_scripts" { ]]) : x.name => x.content } bucket = var.bucket_name - name = format("%s/%s", local.bucket_dir, each.key) + name = format("%s/%s", var.bucket_dir, each.key) content = each.value } @@ -218,7 +216,7 @@ resource "google_storage_bucket_object" "login_startup_scripts" { } bucket = var.bucket_name - name = format("%s/slurm-login-script-%s", local.bucket_dir, each.key) + name = format("%s/slurm-login-script-%s", var.bucket_dir, each.key) content = each.value.content } @@ -229,7 +227,7 @@ resource "google_storage_bucket_object" "prolog_scripts" { } bucket = var.bucket_name - name = format("%s/slurm-prolog-script-%s", local.bucket_dir, each.key) + name = format("%s/slurm-prolog-script-%s", var.bucket_dir, each.key) content = each.value.content source = each.value.source } @@ -241,7 +239,7 @@ resource "google_storage_bucket_object" "epilog_scripts" { } bucket = var.bucket_name - name = format("%s/slurm-epilog-script-%s", local.bucket_dir, each.key) + name = format("%s/slurm-epilog-script-%s", var.bucket_dir, each.key) content = each.value.content source = each.value.source } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index 9872f8f5d6..1983e07a0b 100644 --- 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -24,7 +24,6 @@ variable "bucket_name" { variable "bucket_dir" { description = "Bucket directory for cluster files to be put into." type = string - default = null } variable "enable_devel" { From ce20a65ec6cad7c4a598f7dca5fed992073feee3 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 17 Jul 2024 00:19:24 +0000 Subject: [PATCH 026/118] Fix slurm script tests * Remove global `compute`; * Remove unused optional args; * Fix "NPE" in `create_client_options`; * Refactor `test_create_client_options`. --- .../modules/slurm_files/scripts/resume.py | 5 +- .../modules/slurm_files/scripts/slurmsync.py | 13 ++- .../modules/slurm_files/scripts/suspend.py | 11 +-- .../slurm_files/scripts/tests/test_util.py | 39 ++------ .../modules/slurm_files/scripts/util.py | 96 +++++-------------- 5 files changed, 43 insertions(+), 121 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index f515d52c8a..d03217a29f 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -44,7 +44,6 @@ ) from util import cfg, lkp, NSDict, TPU -# from util import cfg, lkp, NSDict import slurm_gcp_plugins @@ -214,7 +213,7 @@ def create_instances_request(nodes, partition_name, placement_group, job_id=None request_body=body, ) - request = util.compute.regionInstances().bulkInsert( + request = lkp.compute.regionInstances().bulkInsert( project=cfg.project, region=region, body=body.to_dict() ) @@ -529,7 +528,7 @@ def create_placement_request(pg_name, region): slurm_gcp_plugins.pre_placement_group_insert( lkp=lkp, pg_name=pg_name, region=region, request_body=config ) - request = util.compute.resourcePolicies().insert( + request = lkp.compute.resourcePolicies().insert( project=cfg.project, region=region, body=config ) log_api_request(request) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 53af894c32..e06e2c2edb 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -45,7 +45,7 @@ TPU, chunked, ) -from util import lkp, cfg, compute, CONFIG_FILE +from util import lkp, cfg, CONFIG_FILE from suspend import delete_instances from resume import start_tpu from conf import ( @@ -83,10 +83,9 @@ ) -def start_instance_op(inst, project=None): - project = project or lkp.project - return compute.instances().start( - project=project, +def start_instance_op(inst): + return lkp.compute.instances().start( + project=lkp.project, zone=lkp.instance(inst).zone, instance=inst, ) @@ -334,7 +333,7 @@ def nodes_unknown(): def delete_placement_groups(placement_groups): def delete_placement_request(pg_name, region): - return compute.resourcePolicies().delete( + return lkp.compute.resourcePolicies().delete( project=lkp.project, region=region, resourcePolicy=pg_name ) @@ -384,7 +383,7 @@ def 
sync_placement_groups(): fields = "items.regions.resourcePolicies,nextPageToken" flt = f"name={lkp.cfg.slurm_cluster_name}-*" - act = compute.resourcePolicies() + act = lkp.compute.resourcePolicies() op = act.aggregatedList(project=lkp.project, fields=fields, filter=flt) placement_groups = {} pg_regex = re.compile( diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py index af70d97679..0acae42e38 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py @@ -31,7 +31,7 @@ separate, execute_with_futures, ) -from util import lkp, cfg, compute, TPU +from util import lkp, cfg, TPU import slurm_gcp_plugins @@ -52,11 +52,10 @@ def truncate_iter(iterable, max_count): yield el -def delete_instance_request(instance, project=None, zone=None): - project = project or lkp.project - request = compute.instances().delete( - project=project, - zone=(zone or lkp.instance(instance).zone), +def delete_instance_request(instance): + request = lkp.compute.instances().delete( + project=lkp.project, + zone=lkp.instance(instance).zone, instance=instance, ) log_api_request(request) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index c1c9c7182d..5e11c3e65f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -102,43 +102,29 @@ def test_to_hostlist_fast(names, expected): ( util.ApiEndpoint.BQ, "v1", - ClientOptions( - api_endpoint="https://bq.googleapis.com/v1/", - universe_domain="googleapis.com", - ), + ClientOptions(api_endpoint="https://bq.googleapis.com/v1/"), ), ( util.ApiEndpoint.COMPUTE, "staging_v1", - ClientOptions( - api_endpoint="https://compute.googleapis.com/staging_v1/", - universe_domain="googleapis.com", - ), + ClientOptions(api_endpoint="https://compute.googleapis.com/staging_v1/"), ), ( util.ApiEndpoint.SECRET, "v1", - ClientOptions( - api_endpoint="https://secret_manager.googleapis.com/v1/", - universe_domain="googleapis.com", - ), + ClientOptions(api_endpoint="https://secret_manager.googleapis.com/v1/"), ), ( util.ApiEndpoint.STORAGE, "beta", - ClientOptions( - api_endpoint="https://storage.googleapis.com/beta/", - universe_domain="googleapis.com", - ), + ClientOptions(api_endpoint="https://storage.googleapis.com/beta/"), ), ( util.ApiEndpoint.TPU, "alpha", - ClientOptions( - api_endpoint="https://tpu.googleapis.com/alpha/", - universe_domain="googleapis.com", - ), + ClientOptions(api_endpoint="https://tpu.googleapis.com/alpha/"), ), + (None, None, ClientOptions()), ], ) def test_create_client_options( @@ -148,15 +134,4 @@ def test_create_client_options( ep_mock = mocker.patch("util.endpoint_version") ud_mock.return_value = "googleapis.com" ep_mock.return_value = ep_ver - co = util.create_client_options(api) - assert ( - co.api_endpoint == expected.api_endpoint - and co.universe_domain == expected.universe_domain - ) - ud_mock.return_value = None - ep_mock.return_value = None - co = util.create_client_options(api) - assert ( - 
co.api_endpoint != expected.api_endpoint - and co.universe_domain != expected.universe_domain - ) + assert util.create_client_options(api).__repr__() == expected.__repr__() diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index e2d9c7103c..8a65532fc9 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -112,8 +112,6 @@ def mkdirp(path: Path) -> None: p for p in (Path(__file__).parent, Path("/slurm/scripts")) if p.is_dir() ) -# readily available compute api handle -compute = None # slurm-gcp config object, could be empty if not available cfg = NSDict() # caching Lookup object @@ -205,17 +203,17 @@ def get_credentials() -> Optional[service_account.Credentials]: return credentials -def create_client_options(api: ApiEndpoint = None) -> ClientOptions: +def create_client_options(api: Optional[ApiEndpoint] = None) -> ClientOptions: """Create client options for cloud endpoints""" ver = endpoint_version(api) ud = universe_domain() options = {} if ud and ud != DEFAULT_UNIVERSE_DOMAIN: options["universe_domain"] = ud - if ver: + if api and ver: options["api_endpoint"] = f"https://{api.value}.{ud}/{ver}/" co = ClientOptions(**options) - log.debug(f"Using ClientOptions = {co} for API: {api.value}") + log.debug(f"Using ClientOptions = {co} for API: {api}") return co @@ -1066,8 +1064,6 @@ def batch_execute(requests, retry_cb=None, log_err=log.error): """execute list or dict as batch requests retry if retry_cb returns true """ - - compute = globals()["compute"] BATCH_LIMIT = 1000 if not isinstance(requests, dict): requests = {str(k): v for k, v in enumerate(requests)} # rid generated here @@ -1092,7 +1088,7 @@ def batch_callback(rid, resp, exc): done[rid] = resp def batch_request(reqs): - batch = compute.new_batch_http_request(callback=batch_callback) + batch = lkp.compute.new_batch_http_request(callback=batch_callback) for rid, req in reqs: batch.add(req, request_id=rid) return batch @@ -1124,38 +1120,31 @@ def batch_request(reqs): return done, failed -def wait_request(operation, project=None, compute=None): +def wait_request(operation, project: str): """makes the appropriate wait request for a given operation""" - if not compute: - compute = globals()["compute"] - if project is None: - project = lkp.project if "zone" in operation: - req = compute.zoneOperations().wait( + req = lkp.compute.zoneOperations().wait( project=project, zone=trim_self_link(operation["zone"]), operation=operation["name"], ) elif "region" in operation: - req = compute.regionOperations().wait( + req = lkp.compute.regionOperations().wait( project=project, region=trim_self_link(operation["region"]), operation=operation["name"], ) else: - req = compute.globalOperations().wait( + req = lkp.compute.globalOperations().wait( project=project, operation=operation["name"] ) return req -def wait_for_operation(operation, project=None, compute=None): +def wait_for_operation(operation): """wait for given operation""" - if not compute: - compute = globals()["compute"] - if project is None: - project = parse_self_link(operation["selfLink"]).project - wait_req = wait_request(operation, project=project, compute=compute) + project = parse_self_link(operation["selfLink"]).project + wait_req = wait_request(operation, project=project) while 
True: result = ensure_execute(wait_req) @@ -1167,28 +1156,15 @@ def wait_for_operation(operation, project=None, compute=None): return result -def wait_for_operations(operations, project=None, compute=None): - if not compute: - compute = globals()["compute"] +def wait_for_operations(operations): return [ - wait_for_operation(op, project=project, compute=compute) for op in operations + wait_for_operation(op) for op in operations ] -def get_filtered_operations( - op_filter, - zone=None, - region=None, - only_global=False, - project=None, - compute=None, -): +def get_filtered_operations(op_filter): """get list of operations associated with group id""" - - if not compute: - compute = globals()["compute"] - if project is None: - project = lkp.project + project = lkp.project operations = [] def get_aggregated_operations(items): @@ -1199,47 +1175,24 @@ def get_aggregated_operations(items): ) ) - def get_list_operations(items): - operations.extend(items) - - handle_items = get_list_operations - if only_global: - act = compute.globalOperations() - op = act.list(project=project, filter=op_filter) - nxt = act.list_next - elif zone is not None: - act = compute.zoneOperations() - op = act.list(project=project, zone=zone, filter=op_filter) - nxt = act.list_next - elif region is not None: - act = compute.regionOperations() - op = act.list(project=project, region=region, filter=op_filter) - nxt = act.list_next - else: - act = compute.globalOperations() - op = act.aggregatedList( - project=project, filter=op_filter, fields="items.*.operations,nextPageToken" - ) - nxt = act.aggregatedList_next - handle_items = get_aggregated_operations + act = lkp.compute.globalOperations() + op = act.aggregatedList( + project=project, filter=op_filter, fields="items.*.operations,nextPageToken" + ) + while op is not None: result = ensure_execute(op) - handle_items(result["items"]) - op = nxt(op, result) + get_aggregated_operations(result["items"]) + op = act.aggregatedList_next(op, result) return operations -def get_insert_operations(group_ids, flt=None, project=None, compute=None): +def get_insert_operations(group_ids): """get all insert operations from a list of operationGroupId""" - if not compute: - compute = globals()["compute"] - if project is None: - project = lkp.project if isinstance(group_ids, str): group_ids = group_ids.split(",") filters = [ "operationType=insert", - flt, " OR ".join(f"(operationGroupId={id})" for id in group_ids), ] return get_filtered_operations(" AND ".join(f"({f})" for f in filters if f)) @@ -2036,9 +1989,6 @@ def nodeset_map(self, hostnames: list): lkp = Lookup(cfg) -# Needs to be run after the lookup is complete to get endpoint versions -compute = compute_service() - if __name__ == "__main__": parser = argparse.ArgumentParser( From c1053e5a2b1401016958afeb9b54d864aeb3cbb0 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 16 Jul 2024 22:29:12 -0700 Subject: [PATCH 027/118] Revert "Minor clean up of `slurm_files.bucket_dir`" --- .../modules/slurm_files/README.md | 2 +- .../modules/slurm_files/main.tf | 20 ++++++++++--------- .../modules/slurm_files/variables.tf | 1 + 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index 8620cf0cb5..7599f28af9 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -59,7 +59,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. | `string` | n/a | yes | +| [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. | `string` | `null` | no | | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket to use. | `string` | n/a | yes | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | | [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
tree_width = optional(number)
})
| `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index c3ba3cc4ab..8963baab17 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -16,6 +16,8 @@ locals { scripts_dir = abspath("${path.module}/scripts") + + bucket_dir = coalesce(var.bucket_dir, format("%s-files", var.slurm_cluster_name)) } ######## @@ -91,7 +93,7 @@ locals { } config_yaml = "config.yaml" - config_yaml_bucket = format("%s/%s", var.bucket_dir, local.config_yaml) + config_yaml_bucket = format("%s/%s", local.bucket_dir, local.config_yaml) partitions = { for p in var.partitions[*].partition : p.partition_name => p } @@ -106,7 +108,7 @@ locals { etc_dir = abspath("${path.module}/etc") - bucket_path = format("%s/%s", data.google_storage_bucket.this.url, var.bucket_dir) + bucket_path = format("%s/%s", data.google_storage_bucket.this.url, local.bucket_dir) slurm_control_host_port = coalesce(var.slurm_control_host_port, "6818") @@ -139,7 +141,7 @@ locals { build_dir = abspath("${path.module}/build") slurm_gcp_devel_zip = "slurm-gcp-devel.zip" - slurm_gcp_devel_zip_bucket = format("%s/%s", var.bucket_dir, local.slurm_gcp_devel_zip) + slurm_gcp_devel_zip_bucket = format("%s/%s", local.bucket_dir, local.slurm_gcp_devel_zip) } data "archive_file" "slurm_gcp_devel_zip" { @@ -180,7 +182,7 @@ resource "google_storage_bucket_object" "controller_startup_scripts" { } bucket = var.bucket_name - name = format("%s/slurm-controller-script-%s", var.bucket_dir, each.key) + name = format("%s/slurm-controller-script-%s", local.bucket_dir, each.key) content = each.value.content } @@ -191,7 +193,7 @@ resource "google_storage_bucket_object" "compute_startup_scripts" { } bucket = var.bucket_name - name = format("%s/slurm-compute-script-%s", var.bucket_dir, each.key) + name = format("%s/slurm-compute-script-%s", local.bucket_dir, each.key) content = each.value.content } @@ -205,7 +207,7 @@ resource "google_storage_bucket_object" "nodeset_startup_scripts" { ]]) : x.name => x.content } bucket = var.bucket_name - name = format("%s/%s", var.bucket_dir, each.key) + name = format("%s/%s", local.bucket_dir, each.key) content = each.value } @@ -216,7 +218,7 @@ resource "google_storage_bucket_object" "login_startup_scripts" { } bucket = var.bucket_name - name = format("%s/slurm-login-script-%s", var.bucket_dir, each.key) + name = format("%s/slurm-login-script-%s", local.bucket_dir, each.key) content = each.value.content } @@ -227,7 +229,7 @@ resource "google_storage_bucket_object" "prolog_scripts" { } bucket = var.bucket_name - name = format("%s/slurm-prolog-script-%s", var.bucket_dir, each.key) + name = format("%s/slurm-prolog-script-%s", local.bucket_dir, each.key) content = each.value.content source = each.value.source } @@ -239,7 +241,7 @@ resource "google_storage_bucket_object" "epilog_scripts" { } bucket = var.bucket_name - name = format("%s/slurm-epilog-script-%s", var.bucket_dir, each.key) + name = format("%s/slurm-epilog-script-%s", local.bucket_dir, each.key) content = each.value.content source = each.value.source } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index 1983e07a0b..9872f8f5d6 100644 --- 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -24,6 +24,7 @@ variable "bucket_name" { variable "bucket_dir" { description = "Bucket directory for cluster files to be put into." type = string + default = null } variable "enable_devel" { From d90d61f096a5b8d1e72ea75fcf93fe17e04dbde8 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 17 Jul 2024 18:37:51 +0000 Subject: [PATCH 028/118] Improve `pytest` flow --- .pytest.ini | 16 ++++++ .../slurm_files/scripts/tests/README.md | 6 --- .../slurm_files/scripts/tests/common.py | 53 +++++++++++++++++++ .../scripts/tests/test_topology.py | 40 +------------- .../slurm_files/scripts/tests/test_util.py | 6 +-- 5 files changed, 71 insertions(+), 50 deletions(-) create mode 100644 .pytest.ini delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/README.md create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py diff --git a/.pytest.ini b/.pytest.ini new file mode 100644 index 0000000000..7c21b17567 --- /dev/null +++ b/.pytest.ini @@ -0,0 +1,16 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[pytest] +filterwarnings = ignore::DeprecationWarning diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/README.md deleted file mode 100644 index 8452813f25..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# Unit tests - -```sh -# cwd is scripts/tests -$ pytest -W ignore::DeprecationWarning -``` diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py new file mode 100644 index 0000000000..11cc491cd2 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py @@ -0,0 +1,53 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional +import sys +from dataclasses import dataclass, field + +SCRIPTS_DIR = "community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts" +if SCRIPTS_DIR not in sys.path: + sys.path.append(SCRIPTS_DIR) # TODO: make this more robust + + +# TODO: use "real" classes once they are defined (instead of NSDict) +@dataclass +class TstNodeset: + nodeset_name: str + node_count_static: int = 0 + node_count_dynamic_max: int = 0 + + +@dataclass +class TstCfg: + slurm_cluster_name: str = "m22" + nodeset: dict[str, TstNodeset] = field(default_factory=dict) + nodeset_tpu: dict[str, TstNodeset] = field(default_factory=dict) + output_dir: Optional[str] = None + + +@dataclass +class TstTPU: # to prevent client initialization durint "TPU.__init__" + vmcount: int + + +def make_to_hostnames_mock(tbl: Optional[dict[str, list[str]]]): + tbl = tbl or {} + + def se(k: str) -> list[str]: + if k not in tbl: + raise AssertionError(f"to_hostnames mock: unexpected nodelist: '{k}'") + return tbl[k] + + return se diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py index 0b114e03e7..aedd35745a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py @@ -12,51 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional import mock -import sys +from common import TstCfg, TstNodeset, TstTPU, make_to_hostnames_mock -if ".." not in sys.path: - sys.path.append("..") # TODO: make this more robust import util import conf - -from dataclasses import dataclass, field import tempfile - -# TODO: use "real" classes once they are defined (instead of NSDict) -@dataclass -class TstNodeset: - nodeset_name: str - node_count_static: int = 0 - node_count_dynamic_max: int = 0 - - -@dataclass -class TstCfg: - slurm_cluster_name: str = "m22" - nodeset: dict[str, TstNodeset] = field(default_factory=dict) - nodeset_tpu: dict[str, TstNodeset] = field(default_factory=dict) - output_dir: Optional[str] = None - - -@dataclass -class TstTPU: # to prevent client initialization durint "TPU.__init__" - vmcount: int - - -def make_to_hostnames_mock(tbl: Optional[dict[str, list[str]]]): - tbl = tbl or {} - - def se(k: str) -> list[str]: - if k not in tbl: - raise AssertionError(f"to_hostnames mock: unexpected nodelist: '{k}'") - return tbl[k] - - return se - - def test_gen_topology_conf_empty(): cfg = TstCfg(output_dir=tempfile.mkdtemp()) conf.gen_topology_conf(util.Lookup(cfg)) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index 5e11c3e65f..cf255669aa 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -12,17 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import pytest - -if ".." 
not in sys.path: - sys.path.append("..") # TODO: make this more robust +import common # needed to import util import util from google.api_core.client_options import ClientOptions # noqa: E402 # Note: need to install pytest-mock - @pytest.mark.parametrize( "name,expected", [ From 8ae58764cc79f0a07f2fe02a19fa3268b077b571 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 17 Jul 2024 22:56:39 +0000 Subject: [PATCH 029/118] Add `pytest` to pre-commit --- .github/workflows/pr-precommit.yml | 4 ++++ .pre-commit-config.yaml | 6 ++++++ .../slurm_files/scripts/requirements.txt | 17 +++++++++++++++++ .../slurm_files/scripts/tests/requirements.txt | 3 +++ 4 files changed, 30 insertions(+) create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt diff --git a/.github/workflows/pr-precommit.yml b/.github/workflows/pr-precommit.yml index 37234d2a0e..e1f56d0007 100644 --- a/.github/workflows/pr-precommit.yml +++ b/.github/workflows/pr-precommit.yml @@ -38,6 +38,10 @@ jobs: python-version: '3.10' check-latest: true cache: 'pip' + - run: > + pip install + -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt + -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt - uses: actions/setup-go@v5 with: go-version: '1.22' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 89913de931..7cecfd5809 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -74,6 +74,12 @@ repos: files: 'tools/cloud-build/daily-tests/builds/.*\.yaml' pass_filenames: false require_serial: true + - id: pytest-check + name: pytest-check + entry: pytest + language: system + types: [python] + pass_filenames: false - repo: https://github.com/dnephin/pre-commit-golang rev: v0.5.1 diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt new file mode 100644 index 0000000000..8de9450b3b --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt @@ -0,0 +1,17 @@ +addict==2.4.0 +google-api-core==2.19.0 +google-api-python-client==2.93.0 +google-auth==2.22.0 +google-auth-httplib2==0.1.0 +google-cloud-bigquery==3.11.3 +google-cloud-core==2.3.3 +google-cloud-storage==2.10.0 +google-cloud-tpu==1.10.0 +google-resumable-media==2.5.0 +googleapis-common-protos==1.59.1 +grpcio==1.56.0 +grpcio-status==1.56.0 +httplib2==0.22.0 +more-executors==2.11.4 +PyYAML==6.0 +requests==2.31.0 diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt new file mode 100644 index 0000000000..85c80e7f84 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt @@ -0,0 +1,3 @@ +pytest +pytest-mock +mock From daf5aef399a6de853b6de155ac2e92f8b28c6d89 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 18 Jul 2024 00:46:19 +0000 Subject: [PATCH 030/118] Stop using `DEFAULT` node definition --- .../modules/slurm_files/scripts/conf.py | 20 ++---- 
.../slurm_files/scripts/tests/common.py | 19 +++++- .../slurm_files/scripts/tests/test_conf.py | 62 +++++++++++++++++++ 3 files changed, 84 insertions(+), 17 deletions(-) create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index 0dd81d2923..6f72272bcf 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -129,27 +129,24 @@ def nodeset_lines(nodeset, lkp: util.Lookup) -> str: # follow https://slurm.schedmd.com/slurm.conf.html#OPT_Boards # by setting Boards, SocketsPerBoard, CoresPerSocket, and ThreadsPerCore - node_def = { - "NodeName": "DEFAULT", - "State": "UNKNOWN", + gres = f"gpu:{template_info.gpu_count}" if template_info.gpu_count else None + node_conf = { "RealMemory": machine_conf.memory, "Boards": machine_conf.boards, "SocketsPerBoard": machine_conf.sockets_per_board, "CoresPerSocket": machine_conf.cores_per_socket, "ThreadsPerCore": machine_conf.threads_per_core, "CPUs": machine_conf.cpus, + "Gres": gres, **nodeset.node_conf, } - - gres = f"gpu:{template_info.gpu_count}" if template_info.gpu_count else None nodelist = lkp.nodelist(nodeset) return "\n".join( map( dict_to_conf, [ - node_def, - {"NodeName": nodelist, "State": "CLOUD", "Gres": gres}, + {"NodeName": nodelist, "State": "CLOUD", **node_conf}, {"NodeSet": nodeset.nodeset_name, "Nodes": nodelist}, ], ) @@ -157,19 +154,12 @@ def nodeset_lines(nodeset, lkp: util.Lookup) -> str: def nodeset_tpu_lines(nodeset, lkp: util.Lookup) -> str: - node_def = { - "NodeName": "DEFAULT", - "State": "UNKNOWN", - **nodeset.node_conf, - } nodelist = lkp.nodelist(nodeset) - return "\n".join( map( dict_to_conf, [ - node_def, - {"NodeName": nodelist, "State": "CLOUD"}, + {"NodeName": nodelist, "State": "CLOUD", **nodeset.node_conf}, {"NodeSet": nodeset.nodeset_name, "Nodes": nodelist}, ], ) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py index 11cc491cd2..c96bb97bd8 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional +from typing import Optional, Any import sys from dataclasses import dataclass, field @@ -27,7 +27,8 @@ class TstNodeset: nodeset_name: str node_count_static: int = 0 node_count_dynamic_max: int = 0 - + node_conf: dict[str, Any] = field(default_factory=dict) + instance_template: Optional[str] = None @dataclass class TstCfg: @@ -41,6 +42,20 @@ class TstCfg: class TstTPU: # to prevent client initialization durint "TPU.__init__" vmcount: int +@dataclass +class TstMachineConf: + cpus: int + memory: int + sockets: int + sockets_per_board: int + cores_per_socket: int + boards: int + threads_per_core: int + + +@dataclass +class TstTemplateInfo: + gpu_count: int = 0 def make_to_hostnames_mock(tbl: Optional[dict[str, list[str]]]): tbl = tbl or {} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py new file mode 100644 index 0000000000..66c2175da0 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py @@ -0,0 +1,62 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
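+
+# These tests exercise conf.nodeset_lines and conf.nodeset_tpu_lines using the
+# lightweight Tst* stand-ins from tests/common.py rather than the real
+# configuration classes; template lookups are replaced with Mock objects so
+# that no Google Cloud API calls are made.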
+ + +from mock import Mock +from common import TstNodeset, TstCfg, TstMachineConf, TstTemplateInfo + +import conf +import util + + +def test_nodeset_tpu_lines(): + nodeset = TstNodeset( + "turbo", + node_count_static=2, + node_count_dynamic_max=3, + node_conf={"red": "velvet"}, + ) + assert conf.nodeset_tpu_lines(nodeset, util.Lookup(TstCfg())) == "\n".join( + [ + "NodeName=m22-turbo-[0-4] State=CLOUD red=velvet", + "NodeSet=turbo Nodes=m22-turbo-[0-4]", + ] + ) + + +def test_nodeset_lines(): + nodeset = TstNodeset( + "turbo", + node_count_static=2, + node_count_dynamic_max=3, + node_conf={"red": "velvet", "CPUs": 55}, + ) + lkp = util.Lookup(TstCfg()) + lkp.template_info = Mock(return_value=TstTemplateInfo(gpu_count=33)) + mc = TstMachineConf( + cpus=5, + memory=6, + sockets=7, + sockets_per_board=8, + boards=9, + threads_per_core=10, + cores_per_socket=11, + ) + lkp.template_machine_conf = Mock(return_value=mc) + assert conf.nodeset_lines(nodeset, lkp) == "\n".join( + [ + "NodeName=m22-turbo-[0-4] State=CLOUD RealMemory=6 Boards=9 SocketsPerBoard=8 CoresPerSocket=11 ThreadsPerCore=10 CPUs=55 Gres=gpu:33 red=velvet", + "NodeSet=turbo Nodes=m22-turbo-[0-4]", + ] + ) From e3aa0d7777d5a09917f790affa1482edac83fee3 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 18 Jul 2024 01:07:39 +0000 Subject: [PATCH 031/118] Simplify SlurmGCP logging --- .../modules/slurm_files/scripts/resume.py | 46 +++------------ .../modules/slurm_files/scripts/slurmsync.py | 56 ++++--------------- .../modules/slurm_files/scripts/suspend.py | 32 ++--------- .../modules/slurm_files/scripts/util.py | 26 +++++++++ 4 files changed, 51 insertions(+), 109 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index d03217a29f..f539e4601d 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -635,7 +635,7 @@ def get_resume_file_data(): return NSDict(json.loads(resume_json)) -def main(nodelist, force=False): +def main(nodelist): """main called when run as script""" log.debug(f"ResumeProgram {nodelist}") # Filter out nodes not in config.yaml @@ -663,46 +663,16 @@ def main(nodelist, force=False): nodelist=nodelist, global_resume_data=global_resume_data ) - -parser = argparse.ArgumentParser( - description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter -) -parser.add_argument("nodelist", help="list of nodes to resume") -parser.add_argument( - "--force", - "-f", - "--static", - action="store_true", - help="Force attempted creation of the nodelist, whether nodes are exclusive or not.", -) -parser.add_argument( - "--debug", - "-d", - dest="loglevel", - action="store_const", - const=logging.DEBUG, - default=logging.INFO, - help="Enable debugging output", -) -parser.add_argument( - "--trace-api", - "-t", - action="store_true", - help="Enable detailed api request output", -) - - if __name__ == "__main__": - args = parser.parse_args() - - if cfg.enable_debug_logging: - args.loglevel = logging.DEBUG - if args.trace_api: - cfg.extra_logging_flags = list(cfg.extra_logging_flags) - cfg.extra_logging_flags.append("trace_api") + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("nodelist", help="list of nodes to 
resume") + + args = util.add_log_args_and_parse(parser) util.chown_slurm(LOGFILE, mode=0o600) util.config_root_logger(filename, level=args.loglevel, logfile=LOGFILE) sys.excepthook = util.handle_exception global_resume_data = get_resume_file_data() - main(args.nodelist, args.force) + main(args.nodelist) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index e06e2c2edb..15252ff86b 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -523,52 +523,20 @@ def main(): log.exception("failed to sync custom scripts") -parser = argparse.ArgumentParser( - description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter -) -parser.add_argument( - "--debug", - "-d", - dest="loglevel", - action="store_const", - const=logging.DEBUG, - default=logging.INFO, - help="Enable debugging output", -) -parser.add_argument( - "--trace-api", - "-t", - action="store_true", - help="Enable detailed api request output", -) -parser.add_argument( - "--force", - "-f", - action="store_true", - help="Force tasks to run, regardless of lock.", -) - if __name__ == "__main__": - args = parser.parse_args() - util.chown_slurm(LOGFILE, mode=0o600) + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) - if cfg.enable_debug_logging: - args.loglevel = logging.DEBUG - if args.trace_api: - cfg.extra_logging_flags = list(cfg.extra_logging_flags) - cfg.extra_logging_flags.append("trace_api") + args = util.add_log_args_and_parse(parser) + util.chown_slurm(LOGFILE, mode=0o600) util.config_root_logger(filename, level=args.loglevel, logfile=LOGFILE) - sys.excepthook = util.handle_exception - # only run one instance at a time unless --force - if args.force: - main() - else: - pid_file = (Path("/tmp") / Path(__file__).name).with_suffix(".pid") - with pid_file.open("w") as fp: - try: - fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB) - main() - except BlockingIOError: - sys.exit(0) + pid_file = (Path("/tmp") / Path(__file__).name).with_suffix(".pid") + with pid_file.open("w") as fp: + try: + fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB) + main() + except BlockingIOError: + sys.exit(0) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py index 0acae42e38..84cb2274d8 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py @@ -146,35 +146,13 @@ def main(nodelist): suspend_nodes(pm_nodes) -parser = argparse.ArgumentParser( - description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter -) -parser.add_argument("nodelist", help="list of nodes to suspend") -parser.add_argument( - "--debug", - "-d", - dest="loglevel", - action="store_const", - const=logging.DEBUG, - default=logging.INFO, - help="Enable debugging output", -) -parser.add_argument( - "--trace-api", - "-t", - action="store_true", - help="Enable detailed api request output", -) - - if __name__ == "__main__": - args = parser.parse_args() + parser = 
argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("nodelist", help="list of nodes to suspend") - if cfg.enable_debug_logging: - args.loglevel = logging.DEBUG - if args.trace_api: - cfg.extra_logging_flags = list(cfg.extra_logging_flags) - cfg.extra_logging_flags.append("trace_api") + args = util.add_log_args_and_parse(parser) util.chown_slurm(LOGFILE, mode=0o600) util.config_root_logger(filename, level=args.loglevel, logfile=LOGFILE) log = logging.getLogger(Path(__file__).name) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 8a65532fc9..597ed920a4 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -582,6 +582,32 @@ def owned_file_handler(filename): chown_slurm(filename) return logging.handlers.WatchedFileHandler(filename, delay=True) +def add_log_args_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: + parser.add_argument( + "--debug", + "-d", + dest="loglevel", + action="store_const", + const=logging.DEBUG, + default=logging.INFO, + help="Enable debugging output", + ) + parser.add_argument( + "--trace-api", + "-t", + action="store_true", + help="Enable detailed api request output", + ) + args = parser.parse_args() + + if cfg.enable_debug_logging: + args.loglevel = logging.DEBUG + if args.trace_api: + cfg.extra_logging_flags = list(cfg.extra_logging_flags) + cfg.extra_logging_flags.append("trace_api") + + return args + def config_root_logger(caller_logger, level="DEBUG", stdout=True, logfile=None): """configure the root logger, disabling all existing loggers""" From d6e54664e3c233633ea61c0b07ec23c00ebf5cb9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 19 Jul 2024 16:37:13 +0000 Subject: [PATCH 032/118] Bump django from 4.2.11 to 4.2.14 in /community/front-end/ofe Bumps [django](https://github.com/django/django) from 4.2.11 to 4.2.14. - [Commits](https://github.com/django/django/compare/4.2.11...4.2.14) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 6f57f7f7fa..ae4cea0904 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -19,7 +19,7 @@ dill==0.3.6 distlib==0.3.6 # django-revproxy==0.11.0 released but not yet in pypi git+https://github.com/jazzband/django-revproxy.git@d2234005135dc0771b7c4e0bb0465664ccfa5787 -Django==4.2.11 +Django==4.2.14 django-allauth==0.54.0 django-extensions==3.2.3 djangorestframework==3.15.2 From 963a3367f629926270ebffb0aaa1da7303e6a6c9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 19 Jul 2024 16:37:41 +0000 Subject: [PATCH 033/118] Bump google.golang.org/grpc from 1.64.0 to 1.64.1 Bumps [google.golang.org/grpc](https://github.com/grpc/grpc-go) from 1.64.0 to 1.64.1. 
- [Release notes](https://github.com/grpc/grpc-go/releases) - [Commits](https://github.com/grpc/grpc-go/compare/v1.64.0...v1.64.1) --- updated-dependencies: - dependency-name: google.golang.org/grpc dependency-type: indirect ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 7a7435fb98..2d9fd88617 100644 --- a/go.mod +++ b/go.mod @@ -100,7 +100,7 @@ require ( golang.org/x/oauth2 v0.21.0 // indirect golang.org/x/sys v0.21.0 golang.org/x/text v0.16.0 // indirect - google.golang.org/grpc v1.64.0 // indirect + google.golang.org/grpc v1.64.1 // indirect google.golang.org/protobuf v1.34.2 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect ) diff --git a/go.sum b/go.sum index 0c9ffbf1dc..462c9840ba 100644 --- a/go.sum +++ b/go.sum @@ -1019,8 +1019,8 @@ google.golang.org/grpc v1.48.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACu google.golang.org/grpc v1.49.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.1/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= -google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY= -google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg= +google.golang.org/grpc v1.64.1 h1:LKtvyfbX3UGVPFcGqJ9ItpVWW6oN/2XqTxfAnwRRXiA= +google.golang.org/grpc v1.64.1/go.mod h1:hiQF4LFZelK2WKaP6W0L92zGHtiQdZxk8CrSdvyjeP0= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= From 14011216a62b945148bc59aee794016b49ae1e2b Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 2 Jul 2024 04:05:04 +0000 Subject: [PATCH 034/118] Add parallelstore slurm blueprint and documentation --- examples/README.md | 67 ++++++++++++- examples/pfs-parallelstore.yaml | 66 +++++++++++++ examples/ps-slurm.yaml | 75 +++++++++++++++ modules/file-system/parallelstore/README.md | 94 +++++++++++++++++++ .../pre-existing-network-storage/README.md | 15 +++ .../daily-tests/builds/ps-slurm.yaml | 44 +++++++++ .../daily-tests/tests/ps-slurm.yml | 37 ++++++++ 7 files changed, 396 insertions(+), 2 deletions(-) create mode 100644 examples/pfs-parallelstore.yaml create mode 100644 examples/ps-slurm.yaml create mode 100644 tools/cloud-build/daily-tests/builds/ps-slurm.yaml create mode 100644 tools/cloud-build/daily-tests/tests/ps-slurm.yml diff --git a/examples/README.md b/examples/README.md index 8519cdcaa9..c3e1500169 100644 --- a/examples/README.md +++ b/examples/README.md @@ -32,6 +32,8 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [serverless-batch.yaml](#serverless-batchyaml-) ![core-badge] * [serverless-batch-mpi.yaml](#serverless-batch-mpiyaml-) ![core-badge] * [pfs-lustre.yaml](#pfs-lustreyaml-) ![core-badge] + * [ps-slurm.yaml](#ps-slurmyaml--) ![core-badge] ![experimental-badge] + * [pfs-parallelstore.yaml](#pfs-parallelstoreyaml--) ![core-badge] ![experimental-badge] * [cae-slurm-v5-legacy.yaml](#cae-slurm-v5-legacyyaml-) ![core-badge] * [cae-slurm.yaml](#cae-slurmyaml-) ![core-badge] * [hpc-build-slurm-image.yaml](#hpc-build-slurm-imageyaml--) ![community-badge] ![experimental-badge] @@ -994,6 +996,67 @@ For this example the 
following is needed in the selected region: [pfs-lustre.yaml]: ./pfs-lustre.yaml +### [ps-slurm.yaml] ![core-badge] ![experimental-badge] + +Creates a Slurm cluster with [Parallelstore] instance mounted. + +After cluster is deployed, parallelstore drivers and DAOS client will be installed +and mount-point will be configured on the VMs. You can SSH to login/ controller +and verify by running: + +```sh +df -H +``` + +This would show `dfuse` file system being attached at `/parallelstore` mount-point. + +#### Quota Requirements for ps-slurm.yaml + +To get access to a private preview of Parallelstore APIs, your project needs to +be allowlisted. To set this up, please work with your account representative. + +For this example the following is needed in the selected region: + +* Cloud Parallelstore API: capacity (GB) per region: 12000 GB +* Compute Engine API: Persistent Disk SSD (GB): ~100 GB for controller and login node. +* Compute Engine API: Persistent Disk Standard (GB): 50 GB/node up to 200 GB. +* Compute Engine API: N2 CPUs: 2 for the login node and 2/node active in the `debug` partition. +* Compute Engine API: C2 CPUs: 4 for the controller node. +* Compute Engine API: C2 CPUs: 60/node active in the `debug` partition up to 240. + +[ps-slurm.yaml]: ./ps-slurm.yaml +[Parallelstore]: ../modules/file-system/parallelstore/README.md + +### [pfs-parallelstore.yaml] ![core-badge] ![experimental-badge] + +This creates 1 compute VM running debian 12 and 1 compute VM running ubuntu 20.04 +and connect with [Parallelstore] instance mounted. + +After cluster is deployed, parallelstore drivers and DAOS client will be installed +and mount-point will be configured on the VMs. You can SSH to compute VM +and verify by running: + +```sh +df -H +``` + +This would show `dfuse` file system being attached at `/parallelstore` mount-point. + +#### Quota Requirements for pfs-parallelstore.yaml + +To get access to a private preview of Parallelstore APIs, your project needs to +be allowlisted. To set this up, please work with your account representative. + +For this example the following is needed in the selected region: + +* Cloud Parallelstore API: capacity (GB) per region: 12000 GB +* Compute Engine API: Persistent Disk Standard (GB): ~100 GB static. +* Compute Engine API: N2 CPUs: 112 for the compute VM running debian 12. +* Compute Engine API: N2 CPUs: 112 for the compute VM running ubuntu 22.04. + +[pfs-parallelstore.yaml]: ./pfs-parallelstore.yaml +[Parallelstore]: ../modules/file-system/parallelstore/README.md + ### [cae-slurm-v5-legacy.yaml] ![core-badge] The Computer Aided Engineering (CAE) blueprint captures a reference architecture @@ -1386,9 +1449,9 @@ Toolkit. It includes: Note: This blueprint has also been tested with `a2` machines, but as capacity is hard to find the example uses `g2` machines which have better obtainability. If using with `a2` machines it is recommended to first obtain an automatic reservation. - + Example settings for a2 look like: - + ```yaml source: modules/compute/gke-node-pool use: [gke_cluster] diff --git a/examples/pfs-parallelstore.yaml b/examples/pfs-parallelstore.yaml new file mode 100644 index 0000000000..eac758f660 --- /dev/null +++ b/examples/pfs-parallelstore.yaml @@ -0,0 +1,66 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +# To get access to a private preview of Parallelstore APIs, your project needs to +# be allowlisted. To set this up, please work with your account representative. + +blueprint_name: parallelstore-vm + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: parallelstore-vm + region: us-central1 + zone: us-central1-a + +deployment_groups: +- group: primary + modules: + - id: network + source: modules/network/vpc + + - id: private_service_access + source: community/modules/network/private-service-access + use: [network] + settings: + prefix_length: 24 # recommended to use <=24 + + - id: parallelstore + source: modules/file-system/parallelstore + use: [network, private_service_access] + + # Connect parallelstore instance with Compute VM running debian 12. + - id: debian_instance + source: modules/compute/vm-instance + use: [network, parallelstore] + settings: + name_prefix: debian + instance_count: 1 + instance_image: + family: debian-12 + project: debian-cloud + machine_type: c2d-standard-112 + + # Connect parallelstore instance with Compute VM running ubuntu 22.04. + - id: ubuntu_instance + source: modules/compute/vm-instance + use: [network, parallelstore] + settings: + name_prefix: ubuntu + instance_count: 1 + instance_image: + family: ubuntu-2204-lts + project: ubuntu-os-cloud + machine_type: c2d-standard-112 diff --git a/examples/ps-slurm.yaml b/examples/ps-slurm.yaml new file mode 100644 index 0000000000..4a28802924 --- /dev/null +++ b/examples/ps-slurm.yaml @@ -0,0 +1,75 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +# To get access to a private preview of Parallelstore APIs, your project needs to +# be allowlisted. To set this up, please work with your account representative. 
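+#
+# This blueprint provisions a new VPC with private service access, a
+# Parallelstore instance, and a Slurm cluster (login node, controller, and an
+# autoscaling "debug" partition of c2-standard-60 nodes) with the Parallelstore
+# instance mounted on the cluster nodes at /parallelstore.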
+ +blueprint_name: parallelstore-slurm + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: parallelstore-slurm + region: us-east4 + zone: us-east4-b + +deployment_groups: +- group: primary + modules: + - id: network + source: modules/network/vpc + + - id: private_service_access + source: community/modules/network/private-service-access + use: [network] + settings: + prefix_length: 24 # recommended to use <=24 + + - id: parallelstore + source: modules/file-system/parallelstore + use: [network, private_service_access] + + - id: debug_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + node_count_dynamic_max: 4 + machine_type: c2-standard-60 + enable_placement: false # the default is: true + + - id: debug_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [debug_nodeset] + settings: + partition_name: debug + exclusive: false # allows nodes to stay up after jobs are done. + is_default: true + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network] + settings: + machine_type: n2-standard-4 + enable_login_public_ips: true + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + use: + - network + - debug_partition + - slurm_login + - parallelstore # Connect parallelstore instance with slurm cluster. + settings: + enable_controller_public_ips: true diff --git a/modules/file-system/parallelstore/README.md b/modules/file-system/parallelstore/README.md index 38bc60114f..37b4788b53 100644 --- a/modules/file-system/parallelstore/README.md +++ b/modules/file-system/parallelstore/README.md @@ -1,3 +1,97 @@ +## Description + +This module creates [parallelstore](https://cloud.google.com/parallelstore) +instance. Parallelstore is Google Cloud's first party parallel file system +service based on [Intel DAOS](https://docs.daos.io/v2.2/) + +### Parallelstore Quota + +To get access to a private preview of Parallelstore APIs, your project needs to +be allowlisted. To set this up, please work with your account representative. + +### Parallelstore mount options + +After parallelstore instance is created, you can specify mount options depending +upon your workload. DAOS is configured to deliver the best user experience for +interactive workloads with aggressive caching. If you are running parallel +workloads concurrently accessing the sane files from multiple client nodes, it +is recommended to disable the writeback cache to avoid cross-client consistency +issues. You can specify different mount options as follows, + +```yaml + - id: parallelstore + source: modules/file-system/parallelstore + use: [network, ps_connect] + settings: + mount_options: "disable-wb-cache,thread-count=20,eq-count=8" +``` + +### Example - New VPC + +For parallelstore instance, Below snippet creates new VPC and configures private-service-access +for this newly created network. + +The parallelstore instance created here can be used with Slurm cluster or compute +VM running Ubuntu 22.04, debian 12 or HPC Rocky Linux 8. 
+ +```yaml + - id: network + source: modules/network/vpc + + - id: private_service_access + source: community/modules/network/private-service-access + use: [network] + settings: + prefix_length: 24 + + - id: parallelstore + source: modules/file-system/parallelstore + use: [network, private_service_access] +``` + +### Example - Existing VPC + +If you want to use existing network with private-service-access configured, you need +to manually provide `private_vpc_connection_peering` to the parallelstore module. +You can get this details from the Google Cloud Console UI in `VPC network peering` +section. Below is the example of using existing network and creating parallelstore. +If existing network is not configured with private-service-access, you can follow +[Configure private service access](https://cloud.google.com/vpc/docs/configure-private-services-access) +to set it up. + +```yaml + - id: network + source: modules/network/pre-existing-vpc + settings: + network_name: // Add network name + subnetwork_name: // Add subnetwork name + + - id: parallelstore + source: modules/file-system/parallelstore + use: [network] + settings: + private_vpc_connection_peering: # will look like "servicenetworking.googleapis.com" +``` + +### Import data from GCS bucket + +You can import data from your GCS bucket to parallelstore instance. Important to +note that data may not be available to the instance immediately. This depends on +latency and size of data. Below is the example of importing data from bucket. + +```yaml + - id: parallelstore + source: modules/file-system/parallelstore + use: [network] + settings: + import_gcs_bucket_uri: gs://gcs-bucket/folder-path + import_destination_path: /gcs/import/ +``` + +Here you can replace `import_gcs_bucket_uri` with the uri of sub folder within GCS +bucket and `import_destination_path` with local directory within parallelstore +instance. + Copyright 2024 Google LLC diff --git a/modules/file-system/pre-existing-network-storage/README.md b/modules/file-system/pre-existing-network-storage/README.md index 2580169785..7a05ddc122 100644 --- a/modules/file-system/pre-existing-network-storage/README.md +++ b/modules/file-system/pre-existing-network-storage/README.md @@ -60,6 +60,20 @@ filesystem: Note the use of the MGS NID (Network ID) in the `server_ip` field - in particular, note the `@tcp` suffix. +The following is an example of using `pre-existing-network-storage` with the `daos` +filesystem. In order to use existing `parallelstore` instance, `fs_type` needs to be +explicitly mentioned in blueprint. The `remote_mount` option refers to `access_points` +for `parallelstore` instance. + +```yaml +- id: parallelstorefs + source: modules/file-system/pre-existing-network-storage + settings: + fs_type: daos + remote_mount: "[10.246.99.2,10.246.99.3,10.246.99.4]" + mount_options: disable-wb-cache,thread-count=16,eq-count=8 +``` + ### Mounting For the `fs_type` listed below, this module will provide `client_install_runner` @@ -71,6 +85,7 @@ Supported `fs_type`: - nfs - lustre - gcsfuse +- daos [scripts/mount.sh](./scripts/mount.sh) is used as the contents of `mount_runner`. 
This script will update `/etc/fstab` and mount the network diff --git a/tools/cloud-build/daily-tests/builds/ps-slurm.yaml b/tools/cloud-build/daily-tests/builds/ps-slurm.yaml new file mode 100644 index 0000000000..c2e2a014b7 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/ps-slurm.yaml @@ -0,0 +1,44 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +tags: +- m.parallelstore +- m.vpc +- m.private-service-access +- m.schedmd-slurm-gcp-v6-controller +- m.schedmd-slurm-gcp-v6-login +- m.schedmd-slurm-gcp-v6-nodeset +- m.schedmd-slurm-gcp-v6-partition +- slurm6 + +timeout: 14400s # 4hr +steps: +- id: ps-slurm + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/ps-slurm.yml" diff --git a/tools/cloud-build/daily-tests/tests/ps-slurm.yml b/tools/cloud-build/daily-tests/tests/ps-slurm.yml new file mode 100644 index 0000000000..f487f4bde5 --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/ps-slurm.yml @@ -0,0 +1,37 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +test_name: ps-slurm +deployment_name: "ps-slurm-{{ build }}" +region: us-central1 +zone: us-central1-a +workspace: /workspace +blueprint_yaml: "{{ workspace }}/examples/ps-slurm.yaml" +network: "{{ deployment_name }}-net" +slurm_cluster_name: "psslurm{{ build[0:3] }}" +cli_deployment_vars: + region: "{{ region }}" + zone: "{{ zone }}" +# Note: Pattern matching in gcloud only supports 1 wildcard. 
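+# These node name patterns are used by the integration test to locate the
+# login and controller VMs created by the ps-slurm.yaml blueprint.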
+login_node: "{{ slurm_cluster_name }}-slurm-login-*" +controller_node: "{{ slurm_cluster_name }}-controller" +post_deploy_tests: +- test-validation/test-partitions.yml +- test-validation/test-mounts.yml +custom_vars: + mounts: + - /parallelstore + partitions: + - debug From b273336153a3a4145b96e3312c575dedff06dbca Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Fri, 19 Jul 2024 18:56:01 +0000 Subject: [PATCH 035/118] Update internal usage of toolkit modules on develop to v1.36.0 --- .../modules/compute/htcondor-execute-point/README.md | 2 +- community/modules/compute/htcondor-execute-point/main.tf | 2 +- community/modules/compute/pbspro-execution/README.md | 6 +++--- community/modules/compute/pbspro-execution/main.tf | 6 +++--- .../remote-desktop/chrome-remote-desktop/README.md | 4 ++-- .../modules/remote-desktop/chrome-remote-desktop/main.tf | 4 ++-- .../modules/scheduler/htcondor-access-point/README.md | 2 +- community/modules/scheduler/htcondor-access-point/main.tf | 2 +- .../modules/scheduler/htcondor-central-manager/README.md | 2 +- .../modules/scheduler/htcondor-central-manager/main.tf | 2 +- .../modules/scheduler/htcondor-service-accounts/README.md | 6 +++--- .../modules/scheduler/htcondor-service-accounts/main.tf | 6 +++--- community/modules/scheduler/pbspro-client/README.md | 6 +++--- community/modules/scheduler/pbspro-client/main.tf | 6 +++--- community/modules/scheduler/pbspro-server/README.md | 8 ++++---- community/modules/scheduler/pbspro-server/main.tf | 8 ++++---- .../scheduler/schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../schedmd-slurm-gcp-v6-controller/slurm_files.tf | 2 +- community/modules/scripts/ramble-execute/README.md | 2 +- community/modules/scripts/ramble-execute/main.tf | 2 +- community/modules/scripts/ramble-setup/README.md | 2 +- community/modules/scripts/ramble-setup/main.tf | 2 +- community/modules/scripts/spack-execute/README.md | 2 +- community/modules/scripts/spack-execute/main.tf | 2 +- community/modules/scripts/spack-setup/README.md | 2 +- community/modules/scripts/spack-setup/main.tf | 2 +- modules/compute/vm-instance/README.md | 2 +- .../compute/vm-instance/startup_from_network_storage.tf | 2 +- modules/network/multivpc/README.md | 2 +- modules/network/multivpc/main.tf | 2 +- modules/scheduler/batch-job-template/README.md | 2 +- .../batch-job-template/startup_from_network_storage.tf | 2 +- modules/scheduler/batch-login-node/README.md | 2 +- modules/scheduler/batch-login-node/main.tf | 2 +- 34 files changed, 54 insertions(+), 54 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 88d0909b98..65ac13a3de 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -212,7 +212,7 @@ limitations under the License. 
|------|--------|---------| | [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | 10.1.1 | | [mig](#module\_mig) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 9f015bbb57..697e446e35 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -125,7 +125,7 @@ resource "google_storage_bucket_object" "execute_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0&depth=1" project_id = var.project_id region = var.region diff --git a/community/modules/compute/pbspro-execution/README.md b/community/modules/compute/pbspro-execution/README.md index e8334fdab2..96936e8841 100644 --- a/community/modules/compute/pbspro-execution/README.md +++ b/community/modules/compute/pbspro-execution/README.md @@ -74,9 +74,9 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | -| [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.35.0&depth=1 | -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.35.0&depth=1 | +| [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | +| [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.36.0&depth=1 | +| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/compute/pbspro-execution/main.tf b/community/modules/compute/pbspro-execution/main.tf index 172712e20d..b3e46cc2a8 100644 --- a/community/modules/compute/pbspro-execution/main.tf +++ b/community/modules/compute/pbspro-execution/main.tf @@ -42,7 +42,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.36.0&depth=1" pbs_exec = var.pbs_exec pbs_home = var.pbs_home @@ -53,7 +53,7 @@ module "pbs_install" { } module "execution_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id @@ -68,7 +68,7 @@ module "execution_startup_script" { } module 
"pbs_execution" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.36.0&depth=1" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/remote-desktop/chrome-remote-desktop/README.md b/community/modules/remote-desktop/chrome-remote-desktop/README.md index 64894f4b09..1db208c5ad 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/README.md +++ b/community/modules/remote-desktop/chrome-remote-desktop/README.md @@ -63,8 +63,8 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | -| [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.35.0&depth=1 | +| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | +| [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/remote-desktop/chrome-remote-desktop/main.tf b/community/modules/remote-desktop/chrome-remote-desktop/main.tf index 8369dc9b12..936cc75d3c 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/main.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/main.tf @@ -55,7 +55,7 @@ locals { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id @@ -71,7 +71,7 @@ module "client_startup_script" { } module "instances" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.36.0&depth=1" instance_count = var.instance_count name_prefix = var.name_prefix diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index 15e760c653..8ef8700f93 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -122,7 +122,7 @@ limitations under the License. 
|------|--------|---------| | [access\_point\_instance\_template](#module\_access\_point\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 73dc845 | | [htcondor\_ap](#module\_htcondor\_ap) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index a8d3e9ba12..68bf8ace00 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -171,7 +171,7 @@ resource "google_storage_bucket_object" "ap_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0&depth=1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index 1f298cf64a..14c2ec7829 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -106,7 +106,7 @@ limitations under the License. |------|--------|---------| | [central\_manager\_instance\_template](#module\_central\_manager\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | 10.1.1 | | [htcondor\_cm](#module\_htcondor\_cm) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index 293ebce7d3..21003d3cae 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -110,7 +110,7 @@ resource "google_storage_bucket_object" "cm_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0&depth=1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/htcondor-service-accounts/README.md b/community/modules/scheduler/htcondor-service-accounts/README.md index 409c707d3e..a8a212747d 100644 --- a/community/modules/scheduler/htcondor-service-accounts/README.md +++ b/community/modules/scheduler/htcondor-service-accounts/README.md @@ -100,9 +100,9 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [access\_point\_service\_account](#module\_access\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.35.0&depth=1 | -| [central\_manager\_service\_account](#module\_central\_manager\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.35.0&depth=1 | -| [execute\_point\_service\_account](#module\_execute\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.35.0&depth=1 | +| [access\_point\_service\_account](#module\_access\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.36.0&depth=1 | +| [central\_manager\_service\_account](#module\_central\_manager\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.36.0&depth=1 | +| [execute\_point\_service\_account](#module\_execute\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/htcondor-service-accounts/main.tf b/community/modules/scheduler/htcondor-service-accounts/main.tf index 028fce2145..38c4d7b7c7 100644 --- a/community/modules/scheduler/htcondor-service-accounts/main.tf +++ b/community/modules/scheduler/htcondor-service-accounts/main.tf @@ -21,7 +21,7 @@ # require them module "access_point_service_account" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.36.0&depth=1" project_id = var.project_id display_name = "HTCondor Access Point" @@ -31,7 +31,7 @@ module "access_point_service_account" { } module "execute_point_service_account" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.36.0&depth=1" project_id = var.project_id display_name = "HTCondor Execute Point" @@ -41,7 +41,7 @@ module "execute_point_service_account" { } module "central_manager_service_account" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.36.0&depth=1" project_id = var.project_id display_name = "HTCondor Central Manager" diff --git a/community/modules/scheduler/pbspro-client/README.md b/community/modules/scheduler/pbspro-client/README.md index 4cabd2e375..590e308b43 100644 --- a/community/modules/scheduler/pbspro-client/README.md +++ b/community/modules/scheduler/pbspro-client/README.md @@ -74,9 +74,9 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | -| [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.35.0&depth=1 | -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.35.0&depth=1 | +| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | +| [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.36.0&depth=1 | +| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/pbspro-client/main.tf b/community/modules/scheduler/pbspro-client/main.tf index 0d8c8e1247..e427c3945e 100644 --- a/community/modules/scheduler/pbspro-client/main.tf +++ b/community/modules/scheduler/pbspro-client/main.tf @@ -32,7 +32,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.36.0&depth=1" pbs_exec = var.pbs_exec pbs_home = var.pbs_home @@ -43,7 +43,7 @@ module "pbs_install" { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id @@ -57,7 +57,7 @@ module "client_startup_script" { } module "pbs_client" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.36.0&depth=1" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/scheduler/pbspro-server/README.md b/community/modules/scheduler/pbspro-server/README.md index 85ad925986..8973b62b71 100644 --- a/community/modules/scheduler/pbspro-server/README.md +++ b/community/modules/scheduler/pbspro-server/README.md @@ -69,10 +69,10 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.35.0&depth=1 | -| [pbs\_qmgr](#module\_pbs\_qmgr) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr | v1.35.0&depth=1 | -| [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.35.0&depth=1 | -| [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | +| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.36.0&depth=1 | +| [pbs\_qmgr](#module\_pbs\_qmgr) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr | v1.36.0&depth=1 | +| [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.36.0&depth=1 | +| [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/pbspro-server/main.tf b/community/modules/scheduler/pbspro-server/main.tf index 8341e68564..bfbf635aa4 100644 --- a/community/modules/scheduler/pbspro-server/main.tf +++ b/community/modules/scheduler/pbspro-server/main.tf @@ -32,7 +32,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.36.0&depth=1" pbs_data_service_user = var.pbs_data_service_user pbs_exec = var.pbs_exec @@ -45,7 +45,7 @@ module "pbs_install" { } module "pbs_qmgr" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr?ref=v1.36.0&depth=1" client_host_count = var.client_host_count client_hostname_prefix = var.client_hostname_prefix @@ -55,7 +55,7 @@ module "pbs_qmgr" { } module "server_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0&depth=1" deployment_name = var.deployment_name project_id = var.project_id @@ -70,7 +70,7 @@ module "server_startup_script" { } module "pbs_server" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.36.0&depth=1" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 53d1ad8d41..dcd557644b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -195,7 +195,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | -| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.34.0&depth=1 | +| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | | [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.13 | | [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.13 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index 0ed1e16c20..b13525abea 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -122,7 +122,7 @@ locals { module "daos_network_storage_scripts" { count = length(local.daos_ns) > 0 ? 1 : 0 - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.34.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0&depth=1" labels = local.labels project_id = var.project_id deployment_name = var.deployment_name diff --git a/community/modules/scripts/ramble-execute/README.md b/community/modules/scripts/ramble-execute/README.md index aa752fe0e2..7479ef5355 100644 --- a/community/modules/scripts/ramble-execute/README.md +++ b/community/modules/scripts/ramble-execute/README.md @@ -77,7 +77,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/scripts/ramble-execute/main.tf b/community/modules/scripts/ramble-execute/main.tf index a44b9838a3..96b5e07488 100644 --- a/community/modules/scripts/ramble-execute/main.tf +++ b/community/modules/scripts/ramble-execute/main.tf @@ -55,7 +55,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/ramble-setup/README.md b/community/modules/scripts/ramble-setup/README.md index d62bb3affe..b1413bf341 100644 --- a/community/modules/scripts/ramble-setup/README.md +++ b/community/modules/scripts/ramble-setup/README.md @@ -86,7 +86,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index 6f2462b279..6e08568505 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -94,7 +94,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/spack-execute/README.md b/community/modules/scripts/spack-execute/README.md index 58595c2861..e91f2a0b7e 100644 --- a/community/modules/scripts/spack-execute/README.md +++ b/community/modules/scripts/spack-execute/README.md @@ -104,7 +104,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/scripts/spack-execute/main.tf b/community/modules/scripts/spack-execute/main.tf index 93a121c2de..7cd0067204 100644 --- a/community/modules/scripts/spack-execute/main.tf +++ b/community/modules/scripts/spack-execute/main.tf @@ -54,7 +54,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md index 3649c58f36..885a67e4a2 100644 --- a/community/modules/scripts/spack-setup/README.md +++ b/community/modules/scripts/spack-setup/README.md @@ -340,7 +340,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf index 356e745715..679f6184ee 100644 --- a/community/modules/scripts/spack-setup/main.tf +++ b/community/modules/scripts/spack-setup/main.tf @@ -100,7 +100,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0&depth=1" labels = local.labels project_id = var.project_id diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 01ec6f6bd1..f7a1d975b0 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -185,7 +185,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | b83107e0 | +| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0 | ## Resources diff --git a/modules/compute/vm-instance/startup_from_network_storage.tf b/modules/compute/vm-instance/startup_from_network_storage.tf index 993952ad20..75080b2216 100644 --- a/modules/compute/vm-instance/startup_from_network_storage.tf +++ b/modules/compute/vm-instance/startup_from_network_storage.tf @@ -55,7 +55,7 @@ locals { } module "netstorage_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=b83107e0" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0" labels = local.labels project_id = var.project_id diff --git a/modules/network/multivpc/README.md b/modules/network/multivpc/README.md index 83b5d1a524..f0f55175c9 100644 --- a/modules/network/multivpc/README.md +++ b/modules/network/multivpc/README.md @@ -88,7 +88,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [vpcs](#module\_vpcs) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc | v1.35.0&depth=1 | +| [vpcs](#module\_vpcs) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc | v1.36.0&depth=1 | ## Resources diff --git a/modules/network/multivpc/main.tf b/modules/network/multivpc/main.tf index 782a3f45a4..f52b826808 100644 --- a/modules/network/multivpc/main.tf +++ b/modules/network/multivpc/main.tf @@ -44,7 +44,7 @@ resource "terraform_data" "global_ip_cidr_suffix" { } module "vpcs" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc?ref=v1.36.0&depth=1" count = var.network_count diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index 2ac9dafc81..7050cf9a57 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -139,7 +139,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| | [instance\_template](#module\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 10.1.1 | -| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | b83107e0 | +| [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0 | ## Resources diff --git a/modules/scheduler/batch-job-template/startup_from_network_storage.tf b/modules/scheduler/batch-job-template/startup_from_network_storage.tf index 993952ad20..75080b2216 100644 --- a/modules/scheduler/batch-job-template/startup_from_network_storage.tf +++ b/modules/scheduler/batch-job-template/startup_from_network_storage.tf @@ -55,7 +55,7 @@ locals { } module "netstorage_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=b83107e0" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0" labels = local.labels project_id = var.project_id diff --git a/modules/scheduler/batch-login-node/README.md b/modules/scheduler/batch-login-node/README.md index 4eb7da8a3a..761c9fdb4b 100644 --- a/modules/scheduler/batch-login-node/README.md +++ b/modules/scheduler/batch-login-node/README.md @@ -89,7 +89,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [login\_startup\_script](#module\_login\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.35.0&depth=1 | +| [login\_startup\_script](#module\_login\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | ## Resources diff --git a/modules/scheduler/batch-login-node/main.tf b/modules/scheduler/batch-login-node/main.tf index 50960758cd..3ef185b728 100644 --- a/modules/scheduler/batch-login-node/main.tf +++ b/modules/scheduler/batch-login-node/main.tf @@ -94,7 +94,7 @@ locals { } module "login_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.35.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.36.0&depth=1" labels = local.labels project_id = var.project_id deployment_name = var.deployment_name From 045355b3e89edbb1a54f3005c0f926512cff3213 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 19 Jul 2024 21:47:06 +0000 Subject: [PATCH 036/118] Clean up SlurmGCP python importing --- .../slurm_files/scripts/requirements.txt | 3 +- .../scripts/setup_network_storage.py | 3 +- .../modules/slurm_files/scripts/util.py | 52 ++++--------------- 3 files changed, 12 insertions(+), 46 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt index 8de9450b3b..3783cd6977 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt @@ -5,6 +5,7 @@ google-auth==2.22.0 google-auth-httplib2==0.1.0 google-cloud-bigquery==3.11.3 google-cloud-core==2.3.3 +google-cloud-secret-manager~=2.0 google-cloud-storage==2.10.0 google-cloud-tpu==1.10.0 google-resumable-media==2.5.0 @@ -13,5 +14,5 @@ grpcio==1.56.0 grpcio-status==1.56.0 httplib2==0.22.0 more-executors==2.11.4 -PyYAML==6.0 +pyyaml==6.0 requests==2.31.0 diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py index b3283dd341..cb7d93e9d4 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py @@ -27,6 +27,7 @@ import util from util import lkp, run, cfg, dirs, separate +from more_executors import Executors, ExceptionRetryPolicy def mounts_by_local(mounts): @@ -158,8 +159,6 @@ def setup_network_storage(log): def mount_fstab(mounts, log): """Wait on each mount, then make sure all fstab is mounted""" - from more_executors import Executors, ExceptionRetryPolicy - def mount_path(path): log.info(f"Waiting for '{path}' to be mounted...") try: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 8a65532fc9..474d368b60 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py 
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -19,7 +19,6 @@ import base64 import collections import hashlib -import importlib.util import inspect import json import logging @@ -45,29 +44,8 @@ import slurm_gcp_plugins -required_modules = [ - ("googleapiclient", "google-api-python-client"), - ("requests", "requests"), - ("yaml", "yaml"), - ("addict", "addict"), - ("httplib2", "httplib2"), - ("google.cloud.tpu_v2", "google-cloud-tpu"), -] -missing_imports = False -can_tpu = True -for module, name in required_modules: - if importlib.util.find_spec(module) is None: - if module == "google.cloud.tpu_v2": - can_tpu = False - print( - f"WARNING: Missing Python module '{module} (pip:{name})', TPU support will not work." - ) - else: - missing_imports = True - print(f"ERROR: Missing Python module '{module} (pip:{name})'") -if missing_imports: - print("Aborting due to missing Python modules") - exit(1) +from google.cloud import secretmanager +from google.cloud import storage import google.auth # noqa: E402 from google.oauth2 import service_account # noqa: E402 @@ -77,8 +55,13 @@ from google.api_core.client_options import ClientOptions # noqa: E402 import httplib2 # noqa: E402 -if can_tpu: +try: from google.cloud import tpu_v2 as tpu # noqa: E402 + can_tpu = True +except ImportError: # TODO: remove once CentOS 7 is deprecated or dependency is added + f"WARNING: Missing Python module 'google.cloud.tpu_v2 (pip:google-cloud-tpu)', TPU support will not work." + can_tpu = False + import google.api_core.exceptions as gExceptions # noqa: E402 from requests import get as get_url # noqa: E402 @@ -87,12 +70,6 @@ import yaml # noqa: E402 from addict import Dict as NSDict # noqa: E402 -optional_modules = [ - ("google.cloud.secretmanager", "google-cloud-secret-manager"), -] -for module, name in optional_modules: - if importlib.util.find_spec(module) is None: - print(f"WARNING: Missing Python module '{module}' (pip:{name}) ") USER_AGENT = "Slurm_GCP_Scripts/1.5 (GPN:SchedMD)" ENV_CONFIG_YAML = os.getenv("SLURM_CONFIG_YAML") @@ -273,9 +250,6 @@ def access_secret_version(project_id, secret_id, version_id="latest"): Access the payload for the given secret version if one exists. The version can be a version number as a string (e.g. "5") or an alias (e.g. "latest"). 
""" - from google.cloud import secretmanager - from google.api_core import exceptions - co = create_client_options(ApiEndpoint.SECRET) client = secretmanager.SecretManagerServiceClient(client_options=co) name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}" @@ -283,7 +257,7 @@ def access_secret_version(project_id, secret_id, version_id="latest"): response = client.access_secret_version(request={"name": name}) log.debug(f"Secret '{name}' was found.") payload = response.payload.data.decode("UTF-8") - except exceptions.NotFound: + except gExceptions.NotFound: log.debug(f"Secret '{name}' was not found!") payload = None @@ -350,8 +324,6 @@ def map_with_futures(func, seq): def blob_get(file, project=None): - from google.cloud import storage - if project is None: project = lkp.project uri = instance_metadata("attributes/slurm_bucket_path") @@ -363,8 +335,6 @@ def blob_get(file, project=None): def blob_list(prefix="", delimiter=None, project=None): - from google.cloud import storage - if project is None: project = lkp.project uri = instance_metadata("attributes/slurm_bucket_path") @@ -541,8 +511,6 @@ def fetch_config_yaml(): def fetch_config_yaml_md5(): """Fetch config.yaml blob md5 from bucket""" - import hashlib - blob = blob_get("config.yaml") blob.reload() # Populate blob with metadata hash_str = str(blob.md5_hash).encode(encoding="utf-8") @@ -903,8 +871,6 @@ def project_metadata(key): def bucket_blob_download(bucket_name, blob_name): - from google.cloud import storage - co = create_client_options("storage") storage_client = storage.Client(client_options=co) bucket = storage_client.bucket(bucket_name) From 5ef9f47446efe68074c6d136f19d00061df7f4ba Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 19 Jul 2024 23:29:08 +0000 Subject: [PATCH 037/118] Simplify SlurmGCP logging --- .../modules/slurm_files/scripts/util.py | 101 +++--------------- 1 file changed, 12 insertions(+), 89 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 597ed920a4..99400ca535 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -216,56 +216,8 @@ def create_client_options(api: Optional[ApiEndpoint] = None) -> ClientOptions: log.debug(f"Using ClientOptions = {co} for API: {api}") return co - -class LogFormatter(logging.Formatter): - """adds logging flags to the levelname in log records""" - - def format(self, record): - new_fmt = self._fmt - flag = getattr(record, "flag", None) - if flag is not None: - start, level, end = new_fmt.partition("%(levelname)s") - if level: - new_fmt = f"{start}{level}(%(flag)s){end}" - # insert function name if record level is DEBUG - if record.levelno < logging.INFO: - prefix, msg, suffix = new_fmt.partition("%(message)s") - new_fmt = f"{prefix}%(funcName)s: {msg}{suffix}" - self._style._fmt = new_fmt - return super().format(record) - - -class FlagLogAdapter(logging.LoggerAdapter): - """creates log adapters that add a flag to the log record, - allowing it to be filtered""" - - def __init__(self, logger, flag, extra=None): - if extra is None: - extra = {} - self.flag = flag - super().__init__(logger, extra) - - @property - def enabled(self): - return cfg.extra_logging_flags.get(self.flag, False) - - def process(self, msg, kwargs): - extra = 
kwargs.setdefault("extra", {}) - extra.update(self.extra) - extra["flag"] = self.flag - return msg, kwargs - - logging.basicConfig(level=logging.INFO, stream=sys.stdout) log = logging.getLogger(__name__) -logging_flags = [ - "trace_api", - "subproc", - "hostlists", -] -log_trace_api = FlagLogAdapter(log, "trace_api") -log_subproc = FlagLogAdapter(log, "subproc") -log_hostlists = FlagLogAdapter(log, "hostlists") def access_secret_version(project_id, secret_id, version_id="latest"): @@ -502,9 +454,6 @@ def load_config_data(config): if not cfg.enable_debug_logging and isinstance(cfg.enable_debug_logging, NSDict): cfg.enable_debug_logging = False - cfg.extra_logging_flags = NSDict( - {flag: cfg.extra_logging_flags.get(flag, False) for flag in logging_flags} - ) return cfg @@ -564,17 +513,6 @@ def save_config(cfg, path): """save given config to file at path""" Path(path).write_text(yaml.dump(cfg, Dumper=Dumper)) - -def filter_logging_flags(record): - """logging filter for flags - if there are no flags, always pass. If there are flags, only pass if a flag - matches an enabled flag in cfg.extra_logging_flags""" - flag = getattr(record, "flag", None) - if flag is None: - return True - return cfg.extra_logging_flags.get(flag, False) - - def owned_file_handler(filename): """create file handler""" if filename is None: @@ -618,30 +556,23 @@ def config_root_logger(caller_logger, level="DEBUG", stdout=True, logfile=None): "disable_existing_loggers": True, "formatters": { "standard": { - "()": LogFormatter, "fmt": "%(levelname)s: %(message)s", }, "stamp": { - "()": LogFormatter, "fmt": "%(asctime)s %(levelname)s: %(message)s", }, }, - "filters": { - "logging_flags": {"()": lambda: filter_logging_flags}, - }, "handlers": { "stdout_handler": { "level": logging.DEBUG, "formatter": "standard", "class": "logging.StreamHandler", "stream": sys.stdout, - "filters": ["logging_flags"], }, "file_handler": { "()": owned_file_handler, "level": logging.DEBUG, "formatter": "stamp", - "filters": ["logging_flags"], "filename": logfile, }, }, @@ -667,15 +598,17 @@ def config_root_logger(caller_logger, level="DEBUG", stdout=True, logfile=None): def log_api_request(request): """log.trace info about a compute API request""" - if log_trace_api.enabled: - # output the whole request object as pretty yaml - # the body is nested json, so load it as well - rep = json.loads(request.to_json()) - if rep.get("body", None) is not None: - rep["body"] = json.loads(rep["body"]) - pretty_req = yaml.safe_dump(rep).rstrip() - # label log message with the calling function - log_trace_api.debug(f"{inspect.stack()[1].function}:\n{pretty_req}") + if not cfg.extra_logging_flags.get("trace_api", False): + return + + # output the whole request object as pretty yaml + # the body is nested json, so load it as well + rep = json.loads(request.to_json()) + if rep.get("body", None) is not None: + rep["body"] = json.loads(rep["body"]) + pretty_req = yaml.safe_dump(rep).rstrip() + # label log message with the calling function + log.debug(f"{inspect.stack()[1].function}:\n{pretty_req}") def handle_exception(exc_type, exc_value, exc_trace): @@ -702,7 +635,7 @@ def run( args = " ".join(args) if not shell and isinstance(args, str): args = shlex.split(args) - log_subproc.debug(f"run: {args}") + log.debug(f"run: {args}") result = subprocess.run( args, stdout=stdout, @@ -716,14 +649,6 @@ def run( return result -def spawn(cmd, quiet=False, shell=False, **kwargs): - """nonblocking spawn of subprocess""" - if not quiet: - log_subproc.debug(f"spawn: {cmd}") - args = 
cmd if shell else shlex.split(cmd) - return subprocess.Popen(args, shell=shell, **kwargs) - - def chown_slurm(path: Path, mode=None) -> None: if path.exists(): if mode: @@ -959,7 +884,6 @@ def to_hostlist(nodenames) -> str: tmp_file.close() hostlist = run(f"{lkp.scontrol} show hostlist {tmp_file.name}").stdout.rstrip() - log_hostlists.debug(f"hostlist({len(nodenames)}): {hostlist}".format(hostlist)) os.remove(tmp_file.name) return hostlist @@ -1049,7 +973,6 @@ def to_hostnames(nodelist: str) -> List[str]: else: hostlist = ",".join(nodelist) hostnames = run(f"{lkp.scontrol} show hostnames {hostlist}").stdout.splitlines() - log_hostlists.debug(f"hostnames({len(hostnames)}) from {hostlist}") return hostnames From a3016a543a548abf368d463271239e78d125f5f5 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 20 Jul 2024 00:40:33 +0000 Subject: [PATCH 038/118] Limit `pytest` search space --- .pytest.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/.pytest.ini b/.pytest.ini index 7c21b17567..fe08a0c2b6 100644 --- a/.pytest.ini +++ b/.pytest.ini @@ -14,3 +14,4 @@ [pytest] filterwarnings = ignore::DeprecationWarning +testpaths = tests community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests From b571cf767b162208e3948aa14beddec4d0184044 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 19 Jul 2024 21:08:52 +0000 Subject: [PATCH 039/118] Compress topology.conf --- .../modules/slurm_files/scripts/conf.py | 20 ++++- .../scripts/tests/test_topology.py | 85 +++++++++---------- 2 files changed, 57 insertions(+), 48 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index 6f72272bcf..b6b48110f8 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -417,14 +417,14 @@ def render_conf_lines(self) -> Iterable[str]: class TopologyBuilder: def __init__(self) -> None: - self._r = Switch("root") + self._r = Switch("") # fake root, not part of the tree def add(self, path: List[str], nodes: Iterable[str]) -> None: n = self._r assert path for p in path: n = n.switches.setdefault(p, Switch(p)) - n.nodes = chain(n.nodes, nodes) + n.nodes = [*n.nodes, *nodes] def render_conf_lines(self) -> Iterable[str]: if not self._r.switches: @@ -432,6 +432,20 @@ def render_conf_lines(self) -> Iterable[str]: for s in sorted(self._r.switches.values(), key=lambda s: s.name): yield from s.render_conf_lines() + def compress(self) -> "TopologyBuilder": + compressed = TopologyBuilder() + def _walk( + u: Switch, c: Switch + ): # u: uncompressed node, c: its counterpart in compressed tree + pref = f"{c.name}_" if c != compressed._r else "s" + for i, us in enumerate(sorted(u.switches.values(), key=lambda s: s.name)): + cs = Switch(f"{pref}{i}", nodes=us.nodes) + c.switches[cs.name] = cs + _walk(us, cs) + + _walk(self._r, compressed._r) + return compressed + def add_tpu_nodeset_topology(nodeset: object, bldr: TopologyBuilder, lkp: util.Lookup): tpuobj = util.TPU(nodeset) @@ -470,7 +484,7 @@ def gen_topology(lkp: util.Lookup) -> TopologyBuilder: def gen_topology_conf(lkp: util.Lookup) -> None: """generate slurm topology.conf from config.yaml""" - bldr = gen_topology(lkp) + bldr = gen_topology(lkp).compress() conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / 
"cloud_topology.conf" with open(conf_file, "w") as f: f.writelines(FILE_PREAMBLE + "\n") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py index aedd35745a..9bc7752f35 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py @@ -19,38 +19,20 @@ import conf import tempfile -def test_gen_topology_conf_empty(): - cfg = TstCfg(output_dir=tempfile.mkdtemp()) - conf.gen_topology_conf(util.Lookup(cfg)) - assert ( - open(cfg.output_dir + "/cloud_topology.conf").read() - == """ +PRELUDE = """ # Warning: # This file is managed by a script. Manual modifications will be overwritten. - """ - ) + +def test_gen_topology_conf_empty(): + cfg = TstCfg(output_dir=tempfile.mkdtemp()) + conf.gen_topology_conf(util.Lookup(cfg)) + assert open(cfg.output_dir + "/cloud_topology.conf").read() == PRELUDE + "\n" @mock.patch("util.TPU") -@mock.patch( - "util.to_hostnames", - side_effect=make_to_hostnames_mock( - { - "m22-bold-[0-3]": ["m22-bold-0", "m22-bold-1", "m22-bold-2", "m22-bold-3"], - "m22-bold-[4-8]": [ - "m22-bold-4", - "m22-bold-5", - "m22-bold-6", - "m22-bold-7", - "m22-bold-8", - ], - "m22-slim-[0-2]": ["m22-slim-0", "m22-slim-1", "m22-slim-2"], - } - ), -) -def test_gen_topology_conf(to_hostnames_mock, tpu_mock): +def test_gen_topology_conf(tpu_mock): cfg = TstCfg( nodeset_tpu={ "a": TstNodeset("bold", node_count_static=4, node_count_dynamic_max=5), @@ -73,24 +55,37 @@ def tpu_se(ns: TstNodeset) -> TstTPU: tpu_mock.side_effect = tpu_se - conf.gen_topology_conf(util.Lookup(cfg)) - assert ( - open(cfg.output_dir + "/cloud_topology.conf").read() - == """ -# Warning: -# This file is managed by a script. Manual modifications will be overwritten. 
- -SwitchName=nodeset-root Switches=blue,green,pink -SwitchName=blue Nodes=m22-blue-[0-6] -SwitchName=green Nodes=m22-green-[0-4] -SwitchName=pink Nodes=m22-pink-[0-3] -SwitchName=nodeset_tpu-root Switches=bold,slim -SwitchName=bold Switches=bold-[0-3] -SwitchName=bold-0 Nodes=m22-bold-[0-2] -SwitchName=bold-1 Nodes=m22-bold-3 -SwitchName=bold-2 Nodes=m22-bold-[4-6] -SwitchName=bold-3 Nodes=m22-bold-[7-8] -SwitchName=slim Nodes=m22-slim-[0-2] + lkp = util.Lookup(cfg) + uncompressed = conf.gen_topology(lkp) + want_uncompressed = [ + "SwitchName=nodeset-root Switches=blue,green,pink", + "SwitchName=blue Nodes=m22-blue-[0-6]", + "SwitchName=green Nodes=m22-green-[0-4]", + "SwitchName=pink Nodes=m22-pink-[0-3]", + "SwitchName=nodeset_tpu-root Switches=bold,slim", + "SwitchName=bold Switches=bold-[0-3]", + "SwitchName=bold-0 Nodes=m22-bold-[0-2]", + "SwitchName=bold-1 Nodes=m22-bold-3", + "SwitchName=bold-2 Nodes=m22-bold-[4-6]", + "SwitchName=bold-3 Nodes=m22-bold-[7-8]", + "SwitchName=slim Nodes=m22-slim-[0-2]"] + assert list(uncompressed.render_conf_lines()) == want_uncompressed + + compressed = uncompressed.compress() + want_compressed = [ + "SwitchName=s0 Switches=s0_[0-2]", + "SwitchName=s0_0 Nodes=m22-blue-[0-6]", + "SwitchName=s0_1 Nodes=m22-green-[0-4]", + "SwitchName=s0_2 Nodes=m22-pink-[0-3]", + "SwitchName=s1 Switches=s1_[0-1]", + "SwitchName=s1_0 Switches=s1_0_[0-3]", + "SwitchName=s1_0_0 Nodes=m22-bold-[0-2]", + "SwitchName=s1_0_1 Nodes=m22-bold-3", + "SwitchName=s1_0_2 Nodes=m22-bold-[4-6]", + "SwitchName=s1_0_3 Nodes=m22-bold-[7-8]", + "SwitchName=s1_1 Nodes=m22-slim-[0-2]"] + assert list(compressed.render_conf_lines()) == want_compressed -""" - ) + conf.gen_topology_conf(util.Lookup(cfg)) + want_written = PRELUDE + "\n".join(want_compressed) + "\n\n" + assert open(cfg.output_dir + "/cloud_topology.conf").read() == want_written From 042269902be4dbd43f8dc3f80697ad59afb0b0af Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 20 Jul 2024 06:49:07 +0000 Subject: [PATCH 040/118] Add "pretty" mode to babysitter --- tools/cloud-build/babysit/cli_ui.py | 46 +++++++++++++++++++++++------ tools/cloud-build/babysit/runner.py | 8 +++-- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/tools/cloud-build/babysit/cli_ui.py b/tools/cloud-build/babysit/cli_ui.py index f86c5f1c87..06c09129a3 100644 --- a/tools/cloud-build/babysit/cli_ui.py +++ b/tools/cloud-build/babysit/cli_ui.py @@ -14,14 +14,23 @@ from typing import Sequence, Dict, Optional import time +from enum import Enum from .core import Status, Build, latest_by_trigger, trig_name +class Color(Enum): + GREEN = "\033[92m" + YELLOW = "\033[93m" + RED = "\033[91m" + BLUE = "\033[94m" + END = "\033[0m" + class CliUI: # implements UIProto - def __init__(self) -> None: + def __init__(self, pretty=False) -> None: self._status: Dict[str, Status] = {} self._change = False + self._pretty = pretty def on_init(self, builds: Sequence[Build]) -> None: for b in builds: @@ -53,15 +62,34 @@ def sleep(self, sec: int) -> None: time.sleep(sec) def _render_summary(self, builds: Sequence[Build]) -> None: - for _, bc in sorted(latest_by_trigger(builds).items()): + order_fn = lambda bc: (bc.build.status, trig_name(bc.build)) + + ordered = sorted(latest_by_trigger(builds).values(), key=order_fn) + for bc in ordered: print(self._render_build(bc.build, bc.count)) - def _render_build(self, build: Build, count=1) -> str: - if count > 1: - return f"{self._render_status(build.status)}[{count}]\t{trig_name(build)}\t{build.log_url}" - return 
f"{self._render_status(build.status)}\t{trig_name(build)}\t{build.log_url}" + def _render_build(self, build: Build, count:int=1) -> str: + status = self._render_status(build.status) + cnt = f"[{count}]" if count > 1 else " " + link = self._render_link(build) + return f"{status}{cnt} {link}" def _render_status(self, status: Optional[Status]) -> str: - if status is None: - return "NONE" - return status.name + sn = "NONE" if status is None else status.name + if not self._pretty: return sn + CM = { + Status.SUCCESS: Color.GREEN, + Status.FAILURE: Color.RED, + Status.TIMEOUT: Color.RED, + Status.PENDING: Color.END, # default + Status.QUEUED: Color.BLUE, + Status.WORKING: Color.BLUE, + } + def_color = Color.YELLOW # render "unusual" states with something bright + clr = CM.get(status, def_color).value + return f"{clr}{sn}{Color.END.value}" + + def _render_link(self, build: Build) -> str: + name, url = trig_name(build), build.log_url + if not self._pretty: return f"{name}\t{url}" + return f"\033]8;;{url}\033\\{name}\033]8;;\033\\" diff --git a/tools/cloud-build/babysit/runner.py b/tools/cloud-build/babysit/runner.py index 62189ebe7c..4d8fca373c 100644 --- a/tools/cloud-build/babysit/runner.py +++ b/tools/cloud-build/babysit/runner.py @@ -128,6 +128,10 @@ def run_from_cli(): help="Number of tests to run concurrently, default is 1") parser.add_argument("-r", "--retries", type=int, default=1, help="Number of retries, to disable retries set to 0, default is 1") + # Non-runner args + parser.add_argument("--pretty", action="store_true", help="Render pretty output") - args = RunnerArgs(**vars(parser.parse_args())) - run(args, CliUI()) + cli_args = vars(parser.parse_args()) + ui = CliUI(pretty=cli_args.pop("pretty")) + + run(RunnerArgs(**cli_args), ui) From bfd27c478ccff6b0b76ac657ff8e71a78387afb3 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Sun, 21 Jul 2024 16:54:01 +0000 Subject: [PATCH 041/118] Updating cleanup compute to use a variable so it doesn't break when using a different version of gcloud --- .../schedmd-slurm-gcp-v6-controller/README.md | 1 + .../cleanup.tf | 9 +++++--- .../scripts/cleanup_compute.sh | 21 +++++++++++++------ .../variables.tf | 7 +++++++ 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index de0621b9b1..7089d8420d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -259,6 +259,7 @@ limitations under the License. | [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
|
{
"compute": "beta"
}
| no | | [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | | [extra\_logging\_flags](#input\_extra\_logging\_flags) | The list of extra flags for the logging system to use. See the logging\_flags variable in scripts/util.py to get the list of supported log flags. | `map(bool)` | `{}` | no | +| [gcloud\_path\_override](#input\_gcloud\_path\_override) | Directory of the gcloud executable to be used during cleanup | `string` | `""` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-5-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/cleanup.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/cleanup.tf index 06d9803594..e1fdf74611 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/cleanup.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/cleanup.tf @@ -32,12 +32,15 @@ resource "null_resource" "cleanup_compute" { count = var.enable_cleanup_compute ? 1 : 0 triggers = { - project_id = var.project_id - cluster_name = local.slurm_cluster_name + project_id = var.project_id + cluster_name = local.slurm_cluster_name + universe_domain = var.universe_domain + compute_endpoint_version = var.endpoint_versions.compute + gcloud_path_override = var.gcloud_path_override } provisioner "local-exec" { - command = "/bin/bash ${path.module}/scripts/cleanup_compute.sh ${self.triggers.project_id} ${self.triggers.cluster_name}" + command = "/bin/bash ${path.module}/scripts/cleanup_compute.sh ${self.triggers.project_id} ${self.triggers.cluster_name} ${self.triggers.universe_domain} ${self.triggers.compute_endpoint_version} ${self.triggers.gcloud_path_override}" when = destroy } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/scripts/cleanup_compute.sh b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/scripts/cleanup_compute.sh index d507785e78..a165479ad4 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/scripts/cleanup_compute.sh +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/scripts/cleanup_compute.sh @@ -17,12 +17,21 @@ set -e -o pipefail project="$1" cluster_name="$2" +universe_domain="$3" +compute_endpoint_version="$4" +gcloud_dir="$5" -if [[ -z "${project}" || -z "${cluster_name}" ]]; then - echo "Usage: $0 " +if [[ -z "${project}" || -z "${cluster_name}" || -z "${universe_domain}" || -z "${compute_endpoint_version}" ]]; then + echo "Usage: $0 " exit 1 fi +if [[ -n "${gcloud_dir}" ]]; then + export PATH="$gcloud_dir:$PATH" +fi + +API_ENDPOINT="CLOUDSDK_API_ENDPOINT_OVERRIDES_COMPUTE=https://www.${universe_domain}/compute/${compute_endpoint_version}/" + if ! type -P gcloud 1>/dev/null; then echo "gcloud is not available and your compute resources are not being cleaned up" echo "https://console.cloud.google.com/compute/instances?project=${project}" @@ -32,25 +41,25 @@ fi echo "Deleting compute nodes" node_filter="labels.slurm_cluster_name=${cluster_name} AND labels.slurm_instance_role=compute" while true; do - nodes=$(gcloud compute instances list --project "${project}" --format="value(selfLink)" --filter="${node_filter}" --limit=10 | paste -sd " " -) + nodes=$(bash -c "$API_ENDPOINT gcloud compute instances list --project \"${project}\" --format=\"value(selfLink)\" --filter=\"${node_filter}\" --limit=10 | paste -sd \" \" -") if [[ -z "${nodes}" ]]; then break fi # The lack of quotes is intentional and causes each new space-separated "word" to # be treated as independent arguments. 
See PR#2523 # shellcheck disable=SC2086 - gcloud compute instances delete --quiet ${nodes} + bash -c "$API_ENDPOINT gcloud compute instances delete --quiet ${nodes}" done echo "Deleting resource policies" policies_filter="name:${cluster_name}-*" while true; do - policies=$(gcloud compute resource-policies list --project "${project}" --format="value(selfLink)" --filter="${policies_filter}" --limit=10 | paste -sd " " -) + policies=$(bash -c "$API_ENDPOINT gcloud compute resource-policies list --project \"${project}\" --format=\"value(selfLink)\" --filter=\"${policies_filter}\" --limit=10 | paste -sd \" \" -") if [[ -z "${policies}" ]]; then break fi # The lack of quotes is intentional and causes each new space-separated "word" to # be treated as independent arguments. See PR#2523 # shellcheck disable=SC2086 - gcloud compute resource-policies delete --quiet ${policies} + bash -c "$API_ENDPOINT gcloud compute resource-policies delete --quiet ${policies}" done diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 784ccac5c3..7996061da5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -636,3 +636,10 @@ variable "endpoint_versions" { } nullable = false } + +variable "gcloud_path_override" { + description = "Directory of the gcloud executable to be used during cleanup" + type = string + default = "" + nullable = false +} From 09ff77a741e534e8c67feb8658900ac5b547ccac Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Mon, 22 Jul 2024 16:35:26 +0000 Subject: [PATCH 042/118] add more integration test for gke ml nodepool --- .../test-ml-gke-e2e-validation.yml | 54 ++++++++ .../daily-tests/blueprints/ml-gke-e2e.yaml | 115 ++++++++++++++++++ .../daily-tests/builds/ml-gke-e2e.yaml | 62 ++++++++++ .../daily-tests/tests/ml-gke-e2e.yml | 28 +++++ 4 files changed, 259 insertions(+) create mode 100644 tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ml-gke-e2e-validation.yml create mode 100644 tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml create mode 100644 tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml create mode 100644 tools/cloud-build/daily-tests/tests/ml-gke-e2e.yml diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ml-gke-e2e-validation.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ml-gke-e2e-validation.yml new file mode 100644 index 0000000000..9f6b9b34d8 --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ml-gke-e2e-validation.yml @@ -0,0 +1,54 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +- name: Assert variables are defined + ansible.builtin.assert: + that: + - cli_deployment_vars.region is defined + - custom_vars.project is defined + +- name: Get cluster credentials for kubectl + delegate_to: localhost + ansible.builtin.command: gcloud container clusters get-credentials {{ deployment_name }} --region {{ cli_deployment_vars.region }} --project {{ custom_vars.project }} + +- name: Execute job on g2 latest driver pool + delegate_to: localhost + ansible.builtin.shell: | + array=({{ workspace }}/{{ deployment_name }}/primary/job-g2-latest-driver*) + kubectl create -f ${array[0]} + args: + executable: /bin/bash + changed_when: False + +- name: Execute job on n1 full spec pool + delegate_to: localhost + ansible.builtin.shell: | + array=({{ workspace }}/{{ deployment_name }}/primary/job-n1-pool-full-spec*) + kubectl create -f ${array[0]} + args: + executable: /bin/bash + changed_when: False + +- name: Wait for jobs to complete + delegate_to: localhost + ansible.builtin.command: | + kubectl get job --field-selector status.successful=1 + register: job_completion + until: job_completion.stdout_lines | length > 2 # 2 jobs total + retries: 40 + delay: 15 + +- name: Print job_completion debug output + ansible.builtin.debug: + var: job_completion.stdout_lines diff --git a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml new file mode 100644 index 0000000000..607b977333 --- /dev/null +++ b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml @@ -0,0 +1,115 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +blueprint_name: ml-gke-e2e + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: ml-gke-e2e + region: asia-southeast1 + zones: + - asia-southeast1-b # g2 machine has better availability in this zone + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. 
+ authorized_cidr: /32 + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet1 + secondary_ranges: + gke-subnet1: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network1] + settings: + enable_private_endpoint: false # Allows for access from authorized public IPs + master_authorized_networks: + - display_name: deployment-machine + cidr_block: $(vars.authorized_cidr) + outputs: [instructions] + + - id: g2_latest_driver + source: modules/compute/gke-node-pool + use: [gke_cluster] + settings: + name: g2-latest-driver + disk_type: pd-balanced + machine_type: g2-standard-4 + guest_accelerator: + - gpu_driver_installation_config: + - gpu_driver_version: "LATEST" + gpu_sharing_config: + - max_shared_clients_per_gpu: 2 + gpu_sharing_strategy: "MPS" + + - id: job_template_g2_latest_driver + source: modules/compute/gke-job-template + use: [g2_latest_driver] + settings: + name: job-g2-latest-driver + image: nvidia/cuda:11.0.3-runtime-ubuntu20.04 + command: + - nvidia-smi + node_count: 1 + node_selectors: [ + { + "key": "cloud.google.com/gke-nodepool", + "value": "g2-latest-driver" + } + ] + outputs: [instructions] + + - id: n1_pool_full_spec + source: modules/compute/gke-node-pool + use: [gke_cluster] + settings: + name: n1-pool-full-spec + disk_type: pd-balanced + machine_type: n1-standard-4 + guest_accelerator: + - type: nvidia-tesla-t4 + count: 2 + gpu_driver_installation_config: + - gpu_driver_version: "LATEST" + gpu_sharing_config: + - max_shared_clients_per_gpu: 2 + gpu_sharing_strategy: "TIME_SHARING" + + - id: job_template_n1_pool_full_spec + source: modules/compute/gke-job-template + use: [n1_pool_full_spec] + settings: + name: job-n1-pool-full-spec + image: nvidia/cuda:11.0.3-runtime-ubuntu20.04 + command: + - nvidia-smi + node_count: 1 + node_selectors: [ + { + "key": "cloud.google.com/gke-nodepool", + "value": "n1-pool-full-spec" + } + ] + outputs: [instructions] diff --git a/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml new file mode 100644 index 0000000000..4b04ceb7d0 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml @@ -0,0 +1,62 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +tags: +- m.gke-cluster +- m.gke-job-template +- m.gke-node-pool +- m.vpc +- gke + +timeout: 14400s # 4hr + +steps: +- id: gke + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + SG_EXAMPLE=tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml + + # adding vm to act as remote node + echo ' - id: remote-node' >> $${SG_EXAMPLE} + echo ' source: modules/compute/vm-instance' >> $${SG_EXAMPLE} + echo ' use: [network1]' >> $${SG_EXAMPLE} + echo ' settings:' >> $${SG_EXAMPLE} + echo ' machine_type: e2-standard-2' >> $${SG_EXAMPLE} + echo ' zone: asia-southeast1-b' >> $${SG_EXAMPLE} + + echo ' - id: ubuntu_pool' >> $${SG_EXAMPLE} + echo ' source: modules/compute/gke-node-pool' >> $${SG_EXAMPLE} + echo ' use: [gke_cluster]' >> $${SG_EXAMPLE} + echo ' settings: {name: ubuntu, image_type: UBUNTU_CONTAINERD}' >> $${SG_EXAMPLE} + + # avoids conflict with other tests + sed -i "s/gke-subnet/gke-subnet-$${BUILD_ID_SHORT}/" $${SG_EXAMPLE} + + IP=$(curl ifconfig.me) + sed -i "s//$${IP}/" $${SG_EXAMPLE} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/ml-gke-e2e.yml" diff --git a/tools/cloud-build/daily-tests/tests/ml-gke-e2e.yml b/tools/cloud-build/daily-tests/tests/ml-gke-e2e.yml new file mode 100644 index 0000000000..6c4a4e0b37 --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/ml-gke-e2e.yml @@ -0,0 +1,28 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +test_name: ml-gke-e2e +deployment_name: ml-gke-e2e-{{ build }} +region: asia-southeast1 +zone: asia-southeast1-b # for remote node +workspace: /workspace +blueprint_yaml: "{{ workspace }}/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml" +network: "{{ deployment_name }}-net" +remote_node: "{{ deployment_name }}-0" +cli_deployment_vars: + region: "{{ region }}" +custom_vars: + project: "{{ project }}" +post_deploy_tests: +- test-validation/test-ml-gke-e2e-validation.yml From 88f149cd13334844858600078d2ed5a9d902df0f Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Mon, 22 Jul 2024 20:12:27 +0000 Subject: [PATCH 043/118] Update provider versions --- pkg/config/expand.go | 4 ++-- pkg/config/expand_test.go | 4 ++-- pkg/config/staging_test.go | 2 +- .../igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../golden_copies/expectations/igc_pkr/zero/versions.tf | 4 ++-- .../igc_tf/.ghpc/artifacts/expanded_blueprint.yaml | 8 ++++---- .../golden_copies/expectations/igc_tf/one/versions.tf | 4 ++-- .../golden_copies/expectations/igc_tf/zero/versions.tf | 4 ++-- .../merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/merge_flatten/zero/versions.tf | 4 ++-- 10 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 7a22523945..c6fb9be18d 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -182,11 +182,11 @@ func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { return map[string]TerraformProvider{ "google": { Source: "hashicorp/google", - Version: ">= 4.84.0, < 5.32.0", + Version: ">= 4.84.0, < 5.39.0", Configuration: gglConf}, "google-beta": { Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 5.32.0", + Version: ">= 4.84.0, < 5.39.0", Configuration: gglConf}} } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 7b0dae6c95..34025ece06 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -93,10 +93,10 @@ func (s *zeroSuite) TestExpandProviders(c *C) { c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ "google": TerraformProvider{ Source: "hashicorp/google", - Version: ">= 4.84.0, < 5.32.0"}, + Version: ">= 4.84.0, < 5.39.0"}, "google-beta": TerraformProvider{ Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 5.32.0"}}) + Version: ">= 4.84.0, < 5.39.0"}}) } { // no def PR, group PR diff --git a/pkg/config/staging_test.go b/pkg/config/staging_test.go index 071ae7f6d2..01ada0e9e9 100644 --- a/pkg/config/staging_test.go +++ b/pkg/config/staging_test.go @@ -41,7 +41,7 @@ func (s *zeroSuite) TestGhpcStageImpl(c *C) { h("zero", "../.ghpc/staged/zero_d02c4c4cde") h("zero/one.txt", "../.ghpc/staged/one.txt_f8669c6c22") h("./../../two.gif", "../.ghpc/staged/two.gif_711b257c4f") - h(".", "../.ghpc/staged/file_5058f1af83") + h(".", "../.ghpc/staged/file_5.39.0af83") h("..", "../.ghpc/staged/file_58b9e70b65") { diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index 6e19532122..efa8f25bfb 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -38,14 +38,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 
5.32.0' + version: '>= 4.84.0, < 5.39.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.32.0' + version: '>= 4.84.0, < 5.39.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index c54de68b69..792917c317 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.32.0" + version = ">= 4.84.0, < 5.39.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.32.0" + version = ">= 4.84.0, < 5.39.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index d01535a834..b25ddd135b 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -44,14 +44,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.32.0' + version: '>= 4.84.0, < 5.39.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.32.0' + version: '>= 4.84.0, < 5.39.0' configuration: project: ((var.project_id)) region: ((var.region)) @@ -79,14 +79,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.32.0' + version: '>= 4.84.0, < 5.39.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.32.0' + version: '>= 4.84.0, < 5.39.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index c54de68b69..792917c317 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.32.0" + version = ">= 4.84.0, < 5.39.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.32.0" + version = ">= 4.84.0, < 5.39.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index c54de68b69..792917c317 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.32.0" + version = ">= 4.84.0, < 5.39.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.32.0" + version = ">= 4.84.0, < 5.39.0" } } } diff --git 
a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index 6d562428fc..71103dd046 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,14 +39,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 5.32.0' + version: '>= 4.84.0, < 5.39.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 5.32.0' + version: '>= 4.84.0, < 5.39.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index c54de68b69..792917c317 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.32.0" + version = ">= 4.84.0, < 5.39.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.32.0" + version = ">= 4.84.0, < 5.39.0" } } } From f7ed047a956c859cfc0966b260a8a0ad673041bc Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Mon, 22 Jul 2024 20:23:02 +0000 Subject: [PATCH 044/118] Fix a file updated by mistake --- pkg/config/staging_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/config/staging_test.go b/pkg/config/staging_test.go index 01ada0e9e9..071ae7f6d2 100644 --- a/pkg/config/staging_test.go +++ b/pkg/config/staging_test.go @@ -41,7 +41,7 @@ func (s *zeroSuite) TestGhpcStageImpl(c *C) { h("zero", "../.ghpc/staged/zero_d02c4c4cde") h("zero/one.txt", "../.ghpc/staged/one.txt_f8669c6c22") h("./../../two.gif", "../.ghpc/staged/two.gif_711b257c4f") - h(".", "../.ghpc/staged/file_5.39.0af83") + h(".", "../.ghpc/staged/file_5058f1af83") h("..", "../.ghpc/staged/file_58b9e70b65") { From 6cf7229be9801981a5f33050e7668ab33d0849f9 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 23 Jul 2024 01:27:10 +0000 Subject: [PATCH 045/118] Minor. 
Add test for `dict_to_conf` --- .../modules/slurm_files/scripts/conf.py | 2 +- .../slurm_files/scripts/tests/test_conf.py | 20 ++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index 6f72272bcf..5f76e171b2 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -36,7 +36,7 @@ def dict_to_conf(conf, delim=" ") -> str: def filter_conf(pair): k, v = pair if isinstance(v, list): - v = ",".join(el for el in v if el is not None) + v = ",".join(str(el) for el in v if el is not None) return k, (v if bool(v) or v == 0 else None) return delim.join( diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py index 66c2175da0..2159732d74 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import pytest from mock import Mock from common import TstNodeset, TstCfg, TstMachineConf, TstTemplateInfo @@ -60,3 +60,21 @@ def test_nodeset_lines(): "NodeSet=turbo Nodes=m22-turbo-[0-4]", ] ) + + +@pytest.mark.parametrize( + "value,want", + [ + ({"a": 1}, "a=1"), + ({"a": "two"}, "a=two"), + ({"a": [3, 4]}, "a=3,4"), + ({"a": ["five", "six"]}, "a=five,six"), + ({"a": None}, ""), + ({"a": ["seven", None, 8]}, "a=seven,8"), + ({"a": 1, "b": "two"}, "a=1 b=two"), + ({"a": 1, "b": None, "c": "three"}, "a=1 c=three"), + ({"a": 0, "b": None, "c": 0.0, "e": ""}, "a=0 c=0.0"), + ({"a": [0, 0.0, None, "X", "", "Y"]}, "a=0,0.0,X,,Y"), + ]) +def test_dict_to_conf(value: dict, want: str): + assert conf.dict_to_conf(value) == want From 1f373b5cb45ad926b2b42af8091b14fde2de294a Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 23 Jul 2024 04:59:44 +0000 Subject: [PATCH 046/118] Add prefix "ns_" to nodeset switches to avoid collisions --- .../modules/slurm_files/scripts/conf.py | 4 ++-- .../slurm_files/scripts/tests/test_topology.py | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index b6b48110f8..85993dc5e1 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -451,7 +451,7 @@ def add_tpu_nodeset_topology(nodeset: object, bldr: TopologyBuilder, lkp: util.L tpuobj = util.TPU(nodeset) static, dynamic = lkp.nodenames(nodeset) - pref = ["nodeset_tpu-root", nodeset.nodeset_name] + pref = ["tpu-root", f"ns_{nodeset.nodeset_name}"] if tpuobj.vmcount == 1: # Put all nodes in one switch bldr.add(pref, list(chain(static, dynamic))) return @@ -468,7 +468,7 @@ def add_tpu_nodeset_topology(nodeset: object, bldr: 
TopologyBuilder, lkp: util.L def add_nodeset_topology( nodeset: object, bldr: TopologyBuilder, lkp: util.Lookup ) -> None: - path = ["nodeset-root", nodeset.nodeset_name] + path = ["slurm-root", f"ns_{nodeset.nodeset_name}"] nodes = list(chain(*lkp.nodenames(nodeset))) bldr.add(path, nodes) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py index 9bc7752f35..3dc86dcd21 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py @@ -58,17 +58,17 @@ def tpu_se(ns: TstNodeset) -> TstTPU: lkp = util.Lookup(cfg) uncompressed = conf.gen_topology(lkp) want_uncompressed = [ - "SwitchName=nodeset-root Switches=blue,green,pink", - "SwitchName=blue Nodes=m22-blue-[0-6]", - "SwitchName=green Nodes=m22-green-[0-4]", - "SwitchName=pink Nodes=m22-pink-[0-3]", - "SwitchName=nodeset_tpu-root Switches=bold,slim", - "SwitchName=bold Switches=bold-[0-3]", + "SwitchName=slurm-root Switches=ns_blue,ns_green,ns_pink", + "SwitchName=ns_blue Nodes=m22-blue-[0-6]", + "SwitchName=ns_green Nodes=m22-green-[0-4]", + "SwitchName=ns_pink Nodes=m22-pink-[0-3]", + "SwitchName=tpu-root Switches=ns_bold,ns_slim", + "SwitchName=ns_bold Switches=bold-[0-3]", "SwitchName=bold-0 Nodes=m22-bold-[0-2]", "SwitchName=bold-1 Nodes=m22-bold-3", "SwitchName=bold-2 Nodes=m22-bold-[4-6]", "SwitchName=bold-3 Nodes=m22-bold-[7-8]", - "SwitchName=slim Nodes=m22-slim-[0-2]"] + "SwitchName=ns_slim Nodes=m22-slim-[0-2]"] assert list(uncompressed.render_conf_lines()) == want_uncompressed compressed = uncompressed.compress() From 97ea756c4dbbe9ad6b52f4b3edf40f3a74073210 Mon Sep 17 00:00:00 2001 From: wkharold Date: Tue, 23 Jul 2024 17:34:32 -0500 Subject: [PATCH 047/118] Update hpc-slurm6-apptainer.yaml update the source_image_family --- community/examples/hpc-slurm6-apptainer.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/examples/hpc-slurm6-apptainer.yaml b/community/examples/hpc-slurm6-apptainer.yaml index f5660e0167..49c0388ed2 100644 --- a/community/examples/hpc-slurm6-apptainer.yaml +++ b/community/examples/hpc-slurm6-apptainer.yaml @@ -60,7 +60,7 @@ deployment_groups: settings: source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-4-hpc-rocky-linux-8 + source_image_family: slurm-gcp-6-5-hpc-rocky-linux-8 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size) From b9d806b1c284beae6a7f1dee8a80679db2578251 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 24 Jul 2024 00:13:04 +0000 Subject: [PATCH 048/118] Tidy up Slurm config generation --- .../modules/slurm_files/scripts/conf.py | 36 +++++++++++++------ .../modules/slurm_files/scripts/setup.py | 28 ++------------- .../modules/slurm_files/scripts/slurmsync.py | 20 ++---------- .../modules/slurm_files/scripts/util.py | 4 +++ 4 files changed, 34 insertions(+), 54 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index b6b48110f8..79def5d58d 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -256,7 +256,7 @@ def make_cloud_conf(lkp: util.Lookup) -> str: def gen_cloud_conf(lkp: util.Lookup) -> None: content = make_cloud_conf(lkp) - conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud.conf" + conf_file = lkp.etc_dir / "cloud.conf" conf_file.write_text(content) util.chown_slurm(conf_file, mode=0o644) @@ -281,7 +281,7 @@ def install_slurm_conf(lkp: util.Lookup) -> None: conf = lkp.cfg.slurm_conf_tpl.format(**conf_options) - conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "slurm.conf" + conf_file = lkp.etc_dir / "slurm.conf" conf_file.write_text(conf) util.chown_slurm(conf_file, mode=0o644) @@ -319,14 +319,14 @@ def install_slurmdbd_conf(lkp: util.Lookup) -> None: conf = lkp.cfg.slurmdbd_conf_tpl.format(**conf_options) - conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "slurmdbd.conf" + conf_file = lkp.etc_dir / "slurmdbd.conf" conf_file.write_text(conf) util.chown_slurm(conf_file, 0o600) def install_cgroup_conf(lkp: util.Lookup) -> None: """install cgroup.conf""" - conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cgroup.conf" + conf_file = lkp.etc_dir / "cgroup.conf" conf_file.write_text(lkp.cfg.cgroup_conf_tpl) util.chown_slurm(conf_file, mode=0o600) @@ -343,7 +343,7 @@ def install_jobsubmit_lua(lkp: util.Lookup) -> None: } conf = lkp.cfg.jobsubmit_lua_tpl.format(**conf_options) - conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "job_submit.lua" + conf_file = lkp.etc_dir / "job_submit.lua" conf_file.write_text(conf) util.chown_slurm(conf_file, 0o600) @@ -372,14 +372,14 @@ def gen_cloud_gres_conf(lkp: util.Lookup) -> None: lines.append("\n") content = FILE_PREAMBLE + "\n".join(lines) - conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud_gres.conf" + conf_file = lkp.etc_dir / "cloud_gres.conf" conf_file.write_text(content) util.chown_slurm(conf_file, mode=0o600) def install_gres_conf(lkp: util.Lookup) -> None: - conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud_gres.conf" - gres_conf = Path(lkp.cfg.output_dir or slurmdirs.etc) / "gres.conf" + conf_file = lkp.etc_dir / "cloud_gres.conf" + gres_conf = lkp.etc_dir / "gres.conf" if not gres_conf.exists(): gres_conf.symlink_to(conf_file) util.chown_slurm(gres_conf, mode=0o600) @@ -485,7 +485,8 @@ def gen_topology(lkp: util.Lookup) -> TopologyBuilder: def gen_topology_conf(lkp: util.Lookup) -> None: """generate slurm topology.conf from config.yaml""" bldr = gen_topology(lkp).compress() - conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud_topology.conf" + conf_file = lkp.etc_dir / "cloud_topology.conf" + with open(conf_file, "w") as f: f.writelines(FILE_PREAMBLE + "\n") for line in bldr.render_conf_lines(): @@ -496,8 +497,21 @@ def gen_topology_conf(lkp: util.Lookup) -> None: def install_topology_conf(lkp: util.Lookup) -> None: - conf_file = Path(lkp.cfg.output_dir or slurmdirs.etc) / "cloud_topology.conf" - topo_conf = Path(lkp.cfg.output_dir or slurmdirs.etc) / "topology.conf" + conf_file = lkp.etc_dir / "cloud_topology.conf" + topo_conf = lkp.etc_dir / "topology.conf" if not topo_conf.exists(): topo_conf.symlink_to(conf_file) util.chown_slurm(conf_file, mode=0o600) + + +def gen_controller_configs(lkp: util.Lookup) -> None: + 
install_slurm_conf(lkp) + install_slurmdbd_conf(lkp) + gen_cloud_conf(lkp) + gen_cloud_gres_conf(lkp) + gen_topology_conf(lkp) + + install_gres_conf(lkp) + install_cgroup_conf(lkp) + install_topology_conf(lkp) + install_jobsubmit_lua(lkp) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index 92d14bc002..6014da1bdb 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -34,19 +34,8 @@ run, install_custom_scripts, ) +import conf -from conf import ( - install_slurm_conf, - install_slurmdbd_conf, - gen_cloud_conf, - gen_cloud_gres_conf, - gen_topology_conf, - install_gres_conf, - install_cgroup_conf, - install_topology_conf, - install_jobsubmit_lua, - login_nodeset, -) from slurmsync import sync_slurm from setup_network_storage import ( @@ -336,18 +325,7 @@ def setup_controller(args): log.info("Setting up controller") util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600) install_custom_scripts() - - install_slurm_conf(lkp) - install_slurmdbd_conf(lkp) - - gen_cloud_conf(lkp) - gen_cloud_gres_conf(lkp) - gen_topology_conf(lkp) - install_gres_conf(lkp) - install_cgroup_conf(lkp) - install_topology_conf(lkp) - install_jobsubmit_lua(lkp) - + conf.gen_controller_configs(lkp) setup_jwt_key() setup_munge_key() setup_sudoers() @@ -412,7 +390,7 @@ def setup_login(args): slurmctld_host = f"{lkp.control_host}({lkp.control_addr})" slurmd_options = [ f'--conf-server="{slurmctld_host}:{lkp.control_host_port}"', - f'--conf="Feature={login_nodeset}"', + f'--conf="Feature={conf.login_nodeset}"', "-Z", ] sysconf = f"""SLURMD_OPTIONS='{" ".join(slurmd_options)}'""" diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 15252ff86b..8358c8af24 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -48,16 +48,7 @@ from util import lkp, cfg, CONFIG_FILE from suspend import delete_instances from resume import start_tpu -from conf import ( - gen_cloud_conf, - gen_cloud_gres_conf, - gen_topology_conf, - install_slurm_conf, - install_slurmdbd_conf, - install_gres_conf, - install_cgroup_conf, - install_topology_conf, -) +import conf filename = Path(__file__).name LOGFILE = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log") @@ -478,14 +469,7 @@ def reconfigure_slurm(): lkp = Lookup(cfg_new) util.lkp = lkp if lkp.instance_role_safe == "controller": - install_slurm_conf(lkp) - install_slurmdbd_conf(lkp) - gen_cloud_conf(lkp) - gen_cloud_gres_conf(lkp) - gen_topology_conf(lkp) - install_gres_conf(lkp) - install_cgroup_conf(lkp) - install_topology_conf(lkp) + conf.gen_controller_configs(lkp) log.info("Restarting slurmctld to make changes take effect.") try: run("sudo systemctl restart slurmctld.service", check=False) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index d1d4753944..d9877a5e6d 
100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -1890,6 +1890,10 @@ def nodeset_map(self, hostnames: list): nodeset_map[self.node_nodeset_name(node)].append(node) return nodeset_map + @property + def etc_dir(self) -> Path: + return Path(self.cfg.output_dir or slurmdirs.etc) + # Define late globals lkp = Lookup() From d75c24a130e5d1a2d5c6890a231d051c4d0e9b0d Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 26 Jul 2024 00:13:16 +0000 Subject: [PATCH 049/118] Remove `hyperdisk-extreme` from list of allowed boot disk_type --- .../compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf | 2 +- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md | 2 +- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../variables_controller_instance.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v6-login/README.md | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 4b57cd5dfa..d948cbefdf 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -95,7 +95,7 @@ modules. For support with the underlying modules, see the instructions in the | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no | -| [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, hyperdisk-extreme, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-standard"` | no | +| [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-standard"` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf index 5021d57d14..1e23f2c815 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf @@ -106,7 +106,7 @@ variable "tags" { } variable "disk_type" { - description = "Boot disk type, can be either hyperdisk-balanced, hyperdisk-extreme, pd-ssd, pd-standard, pd-balanced, or pd-extreme." + description = "Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme." type = string default = "pd-standard" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index cdfc2f4e88..a80ed5343e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -164,7 +164,7 @@ No modules. | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no | -| [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, hyperdisk-extreme, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-standard"` | no | +| [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-standard"` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | | [enable\_placement](#input\_enable\_placement) | Enable placement groups. | `bool` | `true` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 15b601727c..dcf1a4d9fa 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -118,7 +118,7 @@ variable "tags" { } variable "disk_type" { - description = "Boot disk type, can be either hyperdisk-balanced, hyperdisk-extreme, pd-ssd, pd-standard, pd-balanced, or pd-extreme." + description = "Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme." type = string default = "pd-standard" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index f22e4ea0d9..7975903f81 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -243,7 +243,7 @@ limitations under the License. | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | -| [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, hyperdisk-extreme, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-ssd"` | no | +| [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-ssd"` | no | | [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into big query.

NOTE: Requires Google Bigquery API. | `bool` | `false` | no | | [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed compute nodes and controller will be destroyed. | `bool` | `true` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf index a0a477113d..72135de09f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf @@ -14,7 +14,7 @@ variable "disk_type" { type = string - description = "Boot disk type, can be either hyperdisk-balanced, hyperdisk-extreme, pd-ssd, pd-standard, pd-balanced, or pd-extreme." + description = "Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme." default = "pd-ssd" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index 1ffeadcb70..e1f1cfa233 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -93,7 +93,7 @@ No modules. | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | -| [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, hyperdisk-extreme, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-ssd"` | no | +| [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-ssd"` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_login\_public\_ips](#input\_enable\_login\_public\_ips) | If set to true. The login node will have a random public IP assigned to it. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf index 2bb051d5a8..41a130b955 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf @@ -49,7 +49,7 @@ variable "num_instances" { variable "disk_type" { type = string - description = "Boot disk type, can be either hyperdisk-balanced, hyperdisk-extreme, pd-ssd, pd-standard, pd-balanced, or pd-extreme." + description = "Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme." default = "pd-ssd" } From 8147a0ea76b0955f406f71d67dc1dabe6969bc8e Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 26 Jul 2024 05:46:47 +0000 Subject: [PATCH 050/118] Fix construction of `cloud.conf` --- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/scripts/conf.py | 37 +++++++--- .../slurm_files/scripts/tests/common.py | 13 ++++ .../slurm_files/scripts/tests/test_conf.py | 71 +++++++++++++++++++ .../variables.tf | 2 +- 5 files changed, 112 insertions(+), 13 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 7975903f81..d32022a0eb 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -229,7 +229,7 @@ limitations under the License. | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket.
Ignored when 'create\_bucket' is true. | `string` | `null` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
tree_width = optional(number, 128)
})
| `{}` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
tree_width = optional(number)
})
| `{}` | no | | [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access. |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
})
| `null` | no | | [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `"# no-op"` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index 04c2fa7c2c..1b66c4ee08 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -44,9 +44,21 @@ def filter_conf(pair): ) -def conflines(cloud_parameters, lkp: util.Lookup) -> str: - scripts_dir = lkp.cfg.install_dir or dirs.scripts - no_comma_params = cloud_parameters.no_comma_params or False +def conflines(lkp: util.Lookup) -> str: + params = lkp.cfg.cloud_parameters + def get(key, default): + """ + Returns the value of the key in params if it exists and is not None, + otherwise returns supplied default. + We can't rely on the `dict.get` method because the value could be `None` as + well as empty NSDict, depending on type of the `cfg.cloud_parameters`. + TODO: Simplify once NSDict is removed from the codebase. + """ + if key not in params or params[key] is None: + return default + return params[key] + + no_comma_params = get("no_comma_params", False) any_gpus = any( lkp.template_info(nodeset.instance_template).gpu_count > 0 @@ -82,9 +94,12 @@ def conflines(cloud_parameters, lkp: util.Lookup) -> str: "gpu" if any_gpus else None, ], } + + scripts_dir = lkp.cfg.install_dir or dirs.scripts prolog_path = Path(dirs.custom_scripts / "prolog.d") epilog_path = Path(dirs.custom_scripts / "epilog.d") - default_tree_width = 65533 if any_dynamic else None + default_tree_width = 65533 if any_dynamic else 128 + conf_options = { **(comma_params if not no_comma_params else {}), "Prolog": f"{prolog_path}/*" if lkp.cfg.prolog_scripts else None, @@ -92,13 +107,13 @@ def conflines(cloud_parameters, lkp: util.Lookup) -> str: "SuspendProgram": f"{scripts_dir}/suspend.py", "ResumeProgram": f"{scripts_dir}/resume.py", "ResumeFailProgram": f"{scripts_dir}/suspend.py", - "ResumeRate": cloud_parameters.get("resume_rate", 0), - "ResumeTimeout": cloud_parameters.get("resume_timeout", 300), - "SuspendRate": cloud_parameters.get("suspend_rate", 0), - "SuspendTimeout": cloud_parameters.get("suspend_timeout", 300), - "TreeWidth": cloud_parameters.get("tree_width", default_tree_width), + "ResumeRate": get("resume_rate", 0), + "ResumeTimeout": get("resume_timeout", 300), + "SuspendRate": get("suspend_rate", 0), + "SuspendTimeout": get("suspend_timeout", 300), + "TreeWidth": get("tree_width", default_tree_width), "JobSubmitPlugins": "lua" if any_tpu else None, - "TopologyPlugin": cloud_parameters.get("topology_plugin", "topology/tree"), + "TopologyPlugin": get("topology_plugin", "topology/tree"), } return dict_to_conf(conf_options, delim="\n") @@ -242,7 +257,7 @@ def make_cloud_conf(lkp: util.Lookup) -> str: """generate cloud.conf snippet""" lines = [ FILE_PREAMBLE, - conflines(lkp.cfg.cloud_parameters, lkp), + conflines(lkp), loginlines(), *(nodeset_lines(n, lkp) for n in lkp.cfg.nodeset.values()), *(nodeset_dyn_lines(n) for n in lkp.cfg.nodeset_dyn.values()), diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py index c96bb97bd8..e95e436b0c 100644 --- 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py @@ -22,6 +22,11 @@ # TODO: use "real" classes once they are defined (instead of NSDict) + +@dataclass +class Placeholder: + pass + @dataclass class TstNodeset: nodeset_name: str @@ -33,10 +38,18 @@ class TstNodeset: @dataclass class TstCfg: slurm_cluster_name: str = "m22" + cloud_parameters: dict[str, Any] = field(default_factory=dict) + + partitions: dict[str, Placeholder] = field(default_factory=dict) nodeset: dict[str, TstNodeset] = field(default_factory=dict) nodeset_tpu: dict[str, TstNodeset] = field(default_factory=dict) + + install_dir: Optional[str] = None output_dir: Optional[str] = None + prolog_scripts: Optional[list[Placeholder]] = field(default_factory=list) + epilog_scripts: Optional[list[Placeholder]] = field(default_factory=list) + @dataclass class TstTPU: # to prevent client initialization durint "TPU.__init__" diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py index 2159732d74..56a94ba187 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py @@ -16,6 +16,7 @@ from mock import Mock from common import TstNodeset, TstCfg, TstMachineConf, TstTemplateInfo +import addict import conf import util @@ -78,3 +79,73 @@ def test_nodeset_lines(): ]) def test_dict_to_conf(value: dict, want: str): assert conf.dict_to_conf(value) == want + + + +@pytest.mark.parametrize( + "cfg,want", + [ + (TstCfg( + install_dir="ukulele", + ), + """PrivateData=cloud +LaunchParameters=enable_nss_slurm,use_interactive_step +SlurmctldParameters=cloud_dns,enable_configless,idle_on_node_suspend +SchedulerParameters=bf_continue,salloc_wait_nodes,ignore_prefer_validation +SuspendProgram=ukulele/suspend.py +ResumeProgram=ukulele/resume.py +ResumeFailProgram=ukulele/suspend.py +ResumeRate=0 +ResumeTimeout=300 +SuspendRate=0 +SuspendTimeout=300 +TreeWidth=128 +TopologyPlugin=topology/tree"""), + (TstCfg( + install_dir="ukulele", + cloud_parameters={ + "no_comma_params": True, + "resume_rate": None, + "resume_timeout": None, + "suspend_rate": None, + "suspend_timeout": None, + "topology_plugin": None, + "tree_width": None, + }, + ), + """SuspendProgram=ukulele/suspend.py +ResumeProgram=ukulele/resume.py +ResumeFailProgram=ukulele/suspend.py +ResumeRate=0 +ResumeTimeout=300 +SuspendRate=0 +SuspendTimeout=300 +TreeWidth=128 +TopologyPlugin=topology/tree"""), + (TstCfg( + install_dir="ukulele", + cloud_parameters={ + "no_comma_params": True, + "resume_rate": 1, + "resume_timeout": 2, + "suspend_rate": 3, + "suspend_timeout": 4, + "topology_plugin": "guess", + "tree_width": 5, + }, + ), + """SuspendProgram=ukulele/suspend.py +ResumeProgram=ukulele/resume.py +ResumeFailProgram=ukulele/suspend.py +ResumeRate=1 +ResumeTimeout=2 +SuspendRate=3 +SuspendTimeout=4 +TreeWidth=5 +TopologyPlugin=guess"""), + ]) +def test_conflines(cfg, want): + assert conf.conflines(util.Lookup(cfg)) == want + + cfg.cloud_parameters = addict.Dict(cfg.cloud_parameters) + assert conf.conflines(util.Lookup(cfg)) == want diff --git 
a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 7996061da5..e8e3f6fe5e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -409,7 +409,7 @@ variable "cloud_parameters" { suspend_rate = optional(number) suspend_timeout = optional(number) topology_plugin = optional(string) - tree_width = optional(number, 128) + tree_width = optional(number) }) default = {} } From fa828eaea1095b82e21253995910e0e453248b40 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 16 Jul 2024 21:23:05 +0000 Subject: [PATCH 051/118] Deprecate `enable_devel` --- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/README.md | 1 - .../modules/slurm_files/main.tf | 6 +-- .../modules/slurm_files/variables.tf | 6 --- .../slurm_files.tf | 1 - .../variables.tf | 38 +++++++++++-------- 6 files changed, 24 insertions(+), 30 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index d32022a0eb..08715bfc9e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -250,7 +250,7 @@ limitations under the License. | [enable\_controller\_public\_ips](#input\_enable\_controller\_public\_ips) | If set to true. The controller will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. | `bool` | `false` | no | | [enable\_default\_mounts](#input\_enable\_default\_mounts) | Enable default global network storage from the controller
- /usr/local/etc/slurm
- /etc/munge
- /home
- /apps
Warning: If these are disabled, the slurm etc and munge dirs must be added
manually, or some other mechanism must be used to synchronize the slurm conf
files and the munge key across the cluster. | `bool` | `true` | no | -| [enable\_devel](#input\_enable\_devel) | Enables development mode. | `bool` | `true` | no | +| [enable\_devel](#input\_enable\_devel) | DEPRECATED: `enable_devel` is always on. | `bool` | `null` | no | | [enable\_external\_prolog\_epilog](#input\_enable\_external\_prolog\_epilog) | Automatically enable a script that will execute prolog and epilog scripts
shared by NFS from the controller to compute nodes. Find more details at:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/tools/prologs-epilogs/README.md | `bool` | `null` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index 7599f28af9..b1ebc774ec 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -71,7 +71,6 @@ No modules. | [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
- /usr/local/etc/slurm
- /etc/munge
- /home
- /apps
If these are disabled, the slurm etc and munge dirs must be added manually,
or some other mechanism must be used to synchronize the slurm conf files
and the munge key across the cluster. | `bool` | `false` | no | | [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into big query.

NOTE: Requires Google Bigquery API. | `bool` | `false` | no | | [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. Not for production use. | `bool` | `false` | no | -| [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no | | [enable\_external\_prolog\_epilog](#input\_enable\_external\_prolog\_epilog) | Automatically enable a script that will execute prolog and epilog scripts
shared by NFS from the controller to compute nodes. Find more details at:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/v5/tools/prologs-epilogs/README.md | `bool` | `false` | no | | [enable\_hybrid](#input\_enable\_hybrid) | Enables use of hybrid controller mode. When true, controller\_hybrid\_config will
be used instead of controller\_instance\_config and will disable login instances. | `bool` | `false` | no | | [enable\_slurm\_gcp\_plugins](#input\_enable\_slurm\_gcp\_plugins) | Enables calling hooks in scripts/slurm\_gcp\_plugins during cluster resume and suspend. | `any` | `false` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index 8963baab17..5ed1c9b5d0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -145,8 +145,6 @@ locals { } data "archive_file" "slurm_gcp_devel_zip" { - count = var.enable_devel ? 1 : 0 - output_path = "${local.build_dir}/${local.slurm_gcp_devel_zip}" type = "zip" source_dir = local.scripts_dir @@ -163,11 +161,9 @@ data "archive_file" "slurm_gcp_devel_zip" { } resource "google_storage_bucket_object" "devel" { - count = var.enable_devel ? 1 : 0 - bucket = var.bucket_name name = local.slurm_gcp_devel_zip_bucket - source = data.archive_file.slurm_gcp_devel_zip[0].output_path + source = data.archive_file.slurm_gcp_devel_zip.output_path } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index 9872f8f5d6..f0a99f078d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -27,12 +27,6 @@ variable "bucket_dir" { default = null } -variable "enable_devel" { - type = bool - description = "Enables development mode. Not for production use." - default = false -} - variable "enable_debug_logging" { type = bool description = "Enables debug logging mode. Not for production use." diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index b13525abea..b3a8e27987 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -154,7 +154,6 @@ module "slurm_files" { login_startup_scripts = local.ghpc_startup_script_login login_startup_scripts_timeout = var.login_startup_scripts_timeout - enable_devel = var.enable_devel enable_debug_logging = var.enable_debug_logging extra_logging_flags = var.extra_logging_flags diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index e8e3f6fe5e..546972e30c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -360,12 +360,6 @@ EOD # SLURM # ######### -variable "enable_devel" { - type = bool - description = "Enables development mode." - default = true -} - variable "enable_debug_logging" { type = bool description = "Enables debug logging mode." @@ -429,16 +423,6 @@ variable "enable_default_mounts" { default = true } -variable "disable_default_mounts" { # tflint-ignore: terraform_unused_declarations - description = "DEPRECATED: Use `enable_default_mounts` instead." 
- type = bool - default = null - validation { - condition = var.disable_default_mounts == null - error_message = "DEPRECATED: Use `enable_default_mounts` instead." - } -} - variable "network_storage" { description = "An array of network attached storage mounts to be configured on all instances." type = list(object({ @@ -643,3 +627,25 @@ variable "gcloud_path_override" { default = "" nullable = false } + +# DEPRECATED VARIABLES + +variable "enable_devel" { # tflint-ignore: terraform_unused_declarations + description = "DEPRECATED: `enable_devel` is always on." + type = bool + default = null + validation { + condition = var.enable_devel == null + error_message = "DEPRECATED: It is always on, remove `enable_devel` variable." + } +} + +variable "disable_default_mounts" { # tflint-ignore: terraform_unused_declarations + description = "DEPRECATED: Use `enable_default_mounts` instead." + type = bool + default = null + validation { + condition = var.disable_default_mounts == null + error_message = "DEPRECATED: Use `enable_default_mounts` instead." + } +} From c27418e281e49de97b8ad4595b3b57489d24106e Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Fri, 26 Jul 2024 18:05:31 +0000 Subject: [PATCH 052/118] add test for gke n2 pool with default driver --- .../test-ml-gke-e2e-validation.yml | 14 +++++++++- .../daily-tests/blueprints/ml-gke-e2e.yaml | 28 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ml-gke-e2e-validation.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ml-gke-e2e-validation.yml index 9f6b9b34d8..99001a9358 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ml-gke-e2e-validation.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ml-gke-e2e-validation.yml @@ -22,6 +22,7 @@ delegate_to: localhost ansible.builtin.command: gcloud container clusters get-credentials {{ deployment_name }} --region {{ cli_deployment_vars.region }} --project {{ custom_vars.project }} +# JOB 1 - name: Execute job on g2 latest driver pool delegate_to: localhost ansible.builtin.shell: | @@ -31,6 +32,17 @@ executable: /bin/bash changed_when: False +# JOB 2 +- name: Execute job on n1 default pool + delegate_to: localhost + ansible.builtin.shell: | + array=({{ workspace }}/{{ deployment_name }}/primary/job-n1-pool-default*) + kubectl create -f ${array[0]} + args: + executable: /bin/bash + changed_when: False + +# JOB 3 - name: Execute job on n1 full spec pool delegate_to: localhost ansible.builtin.shell: | @@ -45,7 +57,7 @@ ansible.builtin.command: | kubectl get job --field-selector status.successful=1 register: job_completion - until: job_completion.stdout_lines | length > 2 # 2 jobs total + until: job_completion.stdout_lines | length > 3 # 3 jobs total retries: 40 delay: 15 diff --git a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml index 607b977333..d8889b1f1b 100644 --- a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml @@ -81,6 +81,34 @@ deployment_groups: ] outputs: [instructions] + - id: n1_pool_default + source: modules/compute/gke-node-pool + use: [gke_cluster] + settings: + name: n1-pool-default + disk_type: pd-balanced + machine_type: n1-standard-4 + guest_accelerator: + - type: nvidia-tesla-t4 + count: 2 + + - id: job_template_n1_pool_default + source: 
modules/compute/gke-job-template + use: [n1_pool_default] + settings: + name: job-n1-pool-default + image: nvidia/cuda:11.0.3-runtime-ubuntu20.04 + command: + - nvidia-smi + node_count: 1 + node_selectors: [ + { + "key": "cloud.google.com/gke-nodepool", + "value": "n1-pool-default" + } + ] + outputs: [instructions] + - id: n1_pool_full_spec source: modules/compute/gke-node-pool use: [gke_cluster] From 15babfc7c97bf5577ecabf53028ed7ce6f8b788b Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 24 Jul 2024 01:34:29 +0000 Subject: [PATCH 053/118] Update `topology.conf` during `slurmsync` This is naive implementation that always run `reconfigure` if any changes detected --- .../modules/slurm_files/scripts/conf.py | 33 +++++++++---- .../modules/slurm_files/scripts/setup.py | 2 +- .../scripts/setup_network_storage.py | 4 +- .../modules/slurm_files/scripts/slurmsync.py | 46 +++++++++++-------- .../modules/slurm_files/scripts/util.py | 14 ++++-- 5 files changed, 66 insertions(+), 33 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index 85041e8859..eaa16abc23 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -44,6 +44,17 @@ def filter_conf(pair): ) +TOPOLOGY_PLUGIN_TREE = "topology/tree" + +def topology_plugin(lkp: util.Lookup) -> str: + """ + Returns configured topology plugin, defaults to `topology/tree`. + """ + cp, key = lkp.cfg.cloud_parameters, "topology_plugin" + if key not in cp or cp[key] is None: + return TOPOLOGY_PLUGIN_TREE + return cp[key] + def conflines(lkp: util.Lookup) -> str: params = lkp.cfg.cloud_parameters def get(key, default): @@ -113,7 +124,7 @@ def get(key, default): "SuspendTimeout": get("suspend_timeout", 300), "TreeWidth": get("tree_width", default_tree_width), "JobSubmitPlugins": "lua" if any_tpu else None, - "TopologyPlugin": get("topology_plugin", "topology/tree"), + "TopologyPlugin": topology_plugin(lkp), } return dict_to_conf(conf_options, delim="\n") @@ -497,10 +508,14 @@ def gen_topology(lkp: util.Lookup) -> TopologyBuilder: return bldr -def gen_topology_conf(lkp: util.Lookup) -> None: - """generate slurm topology.conf from config.yaml""" +def gen_topology_conf(lkp: util.Lookup) -> bool: + """ + Generates slurm topology.conf. + Returns whether the topology.conf got updated. 
+ """ bldr = gen_topology(lkp).compress() conf_file = lkp.etc_dir / "cloud_topology.conf" + old_hash = util.hash_file(conf_file) if conf_file.exists() else "" with open(conf_file, "w") as f: f.writelines(FILE_PREAMBLE + "\n") @@ -508,8 +523,9 @@ def gen_topology_conf(lkp: util.Lookup) -> None: f.write(line) f.write("\n") f.write("\n") - util.chown_slurm(conf_file, mode=0o600) + new_hash = util.hash_file(conf_file) + return old_hash != new_hash def install_topology_conf(lkp: util.Lookup) -> None: conf_file = lkp.etc_dir / "cloud_topology.conf" @@ -524,9 +540,10 @@ def gen_controller_configs(lkp: util.Lookup) -> None: install_slurmdbd_conf(lkp) gen_cloud_conf(lkp) gen_cloud_gres_conf(lkp) - gen_topology_conf(lkp) - install_gres_conf(lkp) - install_cgroup_conf(lkp) - install_topology_conf(lkp) + install_cgroup_conf(lkp) install_jobsubmit_lua(lkp) + + if topology_plugin(lkp) == TOPOLOGY_PLUGIN_TREE: + gen_topology_conf(lkp) + install_topology_conf(lkp) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index 6014da1bdb..cba9fe928e 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -129,7 +129,7 @@ def failed_motd(): def run_custom_scripts(): """run custom scripts based on instance_role""" custom_dir = dirs.custom_scripts - if lkp.instance_role == "controller": + if lkp.is_controller: # controller has all scripts, but only runs controller.d custom_dirs = [custom_dir / "controller.d"] elif lkp.instance_role == "compute": diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py index cb7d93e9d4..a99203e022 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py @@ -99,7 +99,7 @@ def setup_network_storage(log): all_mounts = resolve_network_storage() ext_mounts, int_mounts = separate_external_internal_mounts(all_mounts) - if lkp.instance_role == "controller": + if lkp.is_controller: mounts = ext_mounts else: mounts = ext_mounts + int_mounts @@ -192,7 +192,7 @@ def mount_path(path): def munge_mount_handler(log): if not cfg.munge_mount: log.error("Missing munge_mount in cfg") - elif lkp.instance_role == "controller": + elif lkp.is_controller: return mount = cfg.munge_mount diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 8358c8af24..993156cd7b 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -362,9 +362,6 @@ def sync_placement_groups(): ] ) - if lkp.instance_role_safe != "controller": - return - keep_jobs = { str(job["job_id"]) for job in json.loads(run(f"{lkp.scontrol} show jobs --json").stdout)["jobs"] @@ -402,9 +399,6 @@ def sync_placement_groups(): def sync_slurm(): - if 
lkp.instance_role_safe != "controller": - return - compute_instances = [ name for name, inst in lkp.instances().items() if inst.role == "compute" ] @@ -468,14 +462,15 @@ def reconfigure_slurm(): cfg_new = load_config_file(CONFIG_FILE) lkp = Lookup(cfg_new) util.lkp = lkp - if lkp.instance_role_safe == "controller": + if lkp.is_controller: conf.gen_controller_configs(lkp) log.info("Restarting slurmctld to make changes take effect.") try: + # TODO: consider removing "restart" since "reconfigure" should restart slurmctld as well run("sudo systemctl restart slurmctld.service", check=False) - run(f"{lkp.scontrol} reconfigure", timeout=30) - except Exception as e: - log.error(e) + util.scontrol_reconfigure(lkp) + except Exception: + log.exception("failed to reconfigure slurmctld") util.run(f"wall '{update_msg}'", timeout=30) log.debug("Done.") elif lkp.instance_role_safe in ["compute", "login"]: @@ -485,21 +480,33 @@ def reconfigure_slurm(): log.debug("Done.") +def update_topology(lkp: util.Lookup) -> None: + if conf.topology_plugin(lkp) != conf.TOPOLOGY_PLUGIN_TREE: + return + updated = conf.gen_topology_conf(lkp) + if updated: + log.debug("Topology configuration updated. Reconfiguring Slurm.") + util.scontrol_reconfigure(lkp) + def main(): try: reconfigure_slurm() except Exception: log.exception("failed to reconfigure slurm") - try: - sync_slurm() - except Exception: - log.exception("failed to sync instances") - - try: - sync_placement_groups() - except Exception: - log.exception("failed to sync placement groups") + if lkp.is_controller: + try: + sync_slurm() + except Exception: + log.exception("failed to sync instances") + try: + sync_placement_groups() + except Exception: + log.exception("failed to sync placement groups") + try: + update_topology(lkp) + except Exception: + log.exception("failed to update topology") try: install_custom_scripts(check_hash=True) @@ -507,6 +514,7 @@ def main(): log.exception("failed to sync custom scripts") + if __name__ == "__main__": parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index d9877a5e6d..31be50a14d 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -301,7 +301,7 @@ def blob_list(prefix="", delimiter=None, project=None): return [blob for blob in blobs] -def _hash_file(fullpath): +def hash_file(fullpath: Path) -> str: with open(fullpath, "rb") as f: file_hash = hashlib.md5() chunk = f.read(8192) @@ -352,7 +352,9 @@ def install_custom_scripts(check_hash=False): chown_slurm(dirs.custom_scripts / par) need_update = True if check_hash and fullpath.exists(): - need_update = _hash_file(fullpath) != blob.md5_hash + # TODO: MD5 reported by gcloud may differ from the one calculated here (e.g. 
if blob got gzipped), + # consider using gCRC32C + need_update = hash_file(fullpath) != blob.md5_hash if need_update: log.info(f"installing custom script: {path} from {blob.name}") with fullpath.open("wb") as f: @@ -840,7 +842,6 @@ def atoi(text): return [atoi(w) for w in re.split(r"(\d+)", text)] - # TODO: replace with to_hostlist_fast def to_hostlist(nodenames) -> str: """make hostlist from list of node names""" @@ -1490,6 +1491,10 @@ def instance_role_safe(self): role = None return role + @property + def is_controller(self): + return self.instance_role_safe == "controller" + @cached_property def compute(self): # TODO evaluate when we need to use google_app_cred_path @@ -1894,6 +1899,9 @@ def nodeset_map(self, hostnames: list): def etc_dir(self) -> Path: return Path(self.cfg.output_dir or slurmdirs.etc) +def scontrol_reconfigure(lkp: Lookup) -> None: + log.info("Running scontrol reconfigure") + run(f"{lkp.scontrol} reconfigure", timeout=30) # Define late globals lkp = Lookup() From 25182bd98a73306d7b35083852a1560995b98fe1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 26 Jul 2024 22:37:15 +0000 Subject: [PATCH 054/118] Bump github.com/zclconf/go-cty from 1.14.4 to 1.15.0 Bumps [github.com/zclconf/go-cty](https://github.com/zclconf/go-cty) from 1.14.4 to 1.15.0. - [Release notes](https://github.com/zclconf/go-cty/releases) - [Changelog](https://github.com/zclconf/go-cty/blob/main/CHANGELOG.md) - [Commits](https://github.com/zclconf/go-cty/compare/v1.14.4...v1.15.0) --- updated-dependencies: - dependency-name: github.com/zclconf/go-cty dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 735db691a7..914b605f15 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( github.com/pkg/errors v0.9.1 github.com/spf13/afero v1.11.0 github.com/spf13/cobra v1.8.1 - github.com/zclconf/go-cty v1.14.4 + github.com/zclconf/go-cty v1.15.0 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa google.golang.org/genproto v0.0.0-20240708141625-4ad9e859172b // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c diff --git a/go.sum b/go.sum index 78bd6dffb1..872b1b9bec 100644 --- a/go.sum +++ b/go.sum @@ -496,8 +496,8 @@ github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -github.com/zclconf/go-cty v1.14.4 h1:uXXczd9QDGsgu0i/QFR/hzI5NYCHLf6NQw/atrbnhq8= -github.com/zclconf/go-cty v1.14.4/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= +github.com/zclconf/go-cty v1.15.0 h1:tTCRWxsexYUmtt/wVxgDClUe+uQusuI443uL6e+5sXQ= +github.com/zclconf/go-cty v1.15.0/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940 h1:4r45xpDWB6ZMSMNJFMOjqrGHynW3DIBuR2H9j0ug+Mo= github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940/go.mod h1:CmBdvvj3nqzfzJ6nTCIwDTPZ56aVGvDrmztiO5g3qrM= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= From 2ef0208cce5f2ae8b10ccb51148058546c8054e0 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 26 Jul 2024 22:59:48 +0000 Subject: [PATCH 055/118] Fix 
computing md5 for devel tarball --- .../schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index 5ed1c9b5d0..ff4d6899d1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -261,7 +261,7 @@ data "local_file" "setup_external" { locals { checksum = md5(join("", flatten([ google_storage_bucket_object.config.md5hash, - [for f in google_storage_bucket_object.devel : f.md5hash], + google_storage_bucket_object.devel.md5hash, [for k, f in google_storage_bucket_object.controller_startup_scripts : f.md5hash], [for k, f in google_storage_bucket_object.compute_startup_scripts : f.md5hash], [for k, f in google_storage_bucket_object.nodeset_startup_scripts : f.md5hash], From daf74d0cdbd7faa285b4a472d8502e4101deb1d0 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 27 Jul 2024 00:03:54 +0000 Subject: [PATCH 056/118] SlurmGCP. Fix broken `--trace-api` flag. --- .../scheduler/schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/README.md | 2 +- .../modules/slurm_files/scripts/util.py | 3 +-- .../modules/slurm_files/variables.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/variables.tf | 2 +- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 08715bfc9e..3ef36c189a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -258,7 +258,7 @@ limitations under the License. | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
|
{
"compute": "beta"
}
| no | | [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | -| [extra\_logging\_flags](#input\_extra\_logging\_flags) | The list of extra flags for the logging system to use. See the logging\_flags variable in scripts/util.py to get the list of supported log flags. | `map(bool)` | `{}` | no | +| [extra\_logging\_flags](#input\_extra\_logging\_flags) | The only available flag is `trace_api` | `map(bool)` | `{}` | no | | [gcloud\_path\_override](#input\_gcloud\_path\_override) | Directory of the gcloud executable to be used during cleanup | `string` | `""` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-5-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index b1ebc774ec..5414e24c8b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -76,7 +76,7 @@ No modules. | [enable\_slurm\_gcp\_plugins](#input\_enable\_slurm\_gcp\_plugins) | Enables calling hooks in scripts/slurm\_gcp\_plugins during cluster resume and suspend. | `any` | `false` | no | | [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
|
{
"compute": null
}
| no | | [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | -| [extra\_logging\_flags](#input\_extra\_logging\_flags) | The list of extra flags for the logging system to use. See the logging\_flags variable in scripts/util.py to get the list of supported log flags. | `map(bool)` | `{}` | no | +| [extra\_logging\_flags](#input\_extra\_logging\_flags) | The only available flag is `trace_api` | `map(bool)` | `{}` | no | | [google\_app\_cred\_path](#input\_google\_app\_cred\_path) | Path to Google Application Credentials. | `string` | `null` | no | | [install\_dir](#input\_install\_dir) | Directory where the hybrid configuration directory will be installed on the
on-premise controller (e.g. /etc/slurm/hybrid). This updates the prefix path
for the resume and suspend scripts in the generated `cloud.conf` file.

This variable should be used when the TerraformHost and the SlurmctldHost
are different.

This will default to var.output\_dir if null. | `string` | `null` | no | | [job\_submit\_lua\_tpl](#input\_job\_submit\_lua\_tpl) | Slurm job\_submit.lua template file path. | `string` | `null` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 31be50a14d..d4ebe9b7c6 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -511,8 +511,7 @@ def add_log_args_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespac if cfg.enable_debug_logging: args.loglevel = logging.DEBUG if args.trace_api: - cfg.extra_logging_flags = list(cfg.extra_logging_flags) - cfg.extra_logging_flags.append("trace_api") + cfg.extra_logging_flags["trace_api"] = True return args diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index f0a99f078d..84ab6fa056 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -35,7 +35,7 @@ variable "enable_debug_logging" { variable "extra_logging_flags" { type = map(bool) - description = "The list of extra flags for the logging system to use. See the logging_flags variable in scripts/util.py to get the list of supported log flags." + description = "The only available flag is `trace_api`" default = {} } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 546972e30c..2a7cef0009 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -368,7 +368,7 @@ variable "enable_debug_logging" { variable "extra_logging_flags" { type = map(bool) - description = "The list of extra flags for the logging system to use. See the logging_flags variable in scripts/util.py to get the list of supported log flags." + description = "The only available flag is `trace_api`" default = {} } From 44df51665951fc38628a14b8ef0201d40173dd9d Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Sat, 27 Jul 2024 06:45:16 +0000 Subject: [PATCH 057/118] chanve default disk_type for GKE nodepool to null and let GKE decide default value --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/variables.tf | 2 +- .../test-ml-gke-e2e-validation.yml | 12 +++++++++- .../daily-tests/blueprints/ml-gke-e2e.yaml | 24 +++++++++++++++++++ 4 files changed, 37 insertions(+), 3 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index aebe8c7776..2daf69a794 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -227,7 +227,7 @@ No modules. | [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes | | [compact\_placement](#input\_compact\_placement) | Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes. 
| `bool` | `false` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for each node. | `number` | `100` | no | -| [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `"pd-standard"` | no | +| [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 1d71ba8fa2..4cb4bf0af1 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -51,7 +51,7 @@ variable "disk_size_gb" { variable "disk_type" { description = "Disk type for each node." type = string - default = "pd-standard" + default = null } variable "enable_gcfs" { diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ml-gke-e2e-validation.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ml-gke-e2e-validation.yml index 99001a9358..7c8f1f9dc4 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ml-gke-e2e-validation.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-ml-gke-e2e-validation.yml @@ -52,12 +52,22 @@ executable: /bin/bash changed_when: False +# JOB 4 +- name: Execute job on default settings pool + delegate_to: localhost + ansible.builtin.shell: | + array=({{ workspace }}/{{ deployment_name }}/primary/job-default-settings-pool*) + kubectl create -f ${array[0]} + args: + executable: /bin/bash + changed_when: False + - name: Wait for jobs to complete delegate_to: localhost ansible.builtin.command: | kubectl get job --field-selector status.successful=1 register: job_completion - until: job_completion.stdout_lines | length > 3 # 3 jobs total + until: job_completion.stdout_lines | length > 4 # 4 jobs total retries: 40 delay: 15 diff --git a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml index d8889b1f1b..b5457e1396 100644 --- a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml @@ -141,3 +141,27 @@ deployment_groups: } ] outputs: [instructions] + + - id: default_settings_pool + source: modules/compute/gke-node-pool + use: [gke_cluster] + settings: + name: default-settings-pool + + - id: job_default_settings_pool + source: modules/compute/gke-job-template + use: [default_settings_pool] + settings: + name: job-default-settings-pool + image: busybox + command: + - echo + - Hello World + node_count: 1 + node_selectors: [ + { + "key": "cloud.google.com/gke-nodepool", + "value": "default-settings-pool" + } + ] + outputs: [instructions] From 69baa4e627f4d73f94c37c0e6a284056896b6284 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Jul 2024 10:03:22 +0000 Subject: [PATCH 058/118] Bump google.golang.org/api from 0.188.0 to 0.189.0 Bumps [google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.188.0 to 0.189.0. - [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.188.0...v0.189.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 14 +++++++------- go.sum | 28 ++++++++++++++-------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/go.mod b/go.mod index 914b605f15..4688e10928 100644 --- a/go.mod +++ b/go.mod @@ -15,7 +15,7 @@ require ( github.com/spf13/cobra v1.8.1 github.com/zclconf/go-cty v1.15.0 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa - google.golang.org/genproto v0.0.0-20240708141625-4ad9e859172b // indirect + google.golang.org/genproto v0.0.0-20240722135656-d784300faade // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) @@ -27,18 +27,18 @@ require ( github.com/hashicorp/terraform-exec v0.21.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940 - google.golang.org/api v0.188.0 + google.golang.org/api v0.189.0 ) require ( - cloud.google.com/go/auth v0.7.0 // indirect - cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect + cloud.google.com/go/auth v0.7.2 // indirect + cloud.google.com/go/auth/oauth2adapt v0.2.3 // indirect dario.cat/mergo v1.0.0 // indirect github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect github.com/cyphar/filepath-securejoin v0.2.4 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/felixge/httpsnoop v1.0.4 // indirect - github.com/go-logr/logr v1.4.1 // indirect + github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/googleapis/gax-go/v2 v2.12.5 // indirect github.com/hashicorp/terraform-json v0.22.1 // indirect @@ -55,12 +55,12 @@ require ( golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240701130421-f6361c86f094 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240708141625-4ad9e859172b // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade // indirect ) require ( cloud.google.com/go v0.115.0 // indirect - cloud.google.com/go/compute/metadata v0.4.0 // indirect + cloud.google.com/go/compute/metadata v0.5.0 // indirect cloud.google.com/go/iam v1.1.10 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect github.com/ProtonMail/go-crypto v1.1.0-alpha.2 // indirect diff --git a/go.sum b/go.sum index 872b1b9bec..71a7f3f5dd 100644 --- a/go.sum +++ b/go.sum @@ -46,10 +46,10 @@ cloud.google.com/go/asset v1.8.0/go.mod h1:mUNGKhiqIdbr8X7KNayoYvyc4HbbFO9URsjby cloud.google.com/go/assuredworkloads v1.5.0/go.mod h1:n8HOZ6pff6re5KYfBXcFvSViQjDwxFkAkmUFffJRbbY= cloud.google.com/go/assuredworkloads v1.6.0/go.mod h1:yo2YOk37Yc89Rsd5QMVECvjaMKymF9OP+QXWlKXUkXw= cloud.google.com/go/assuredworkloads v1.7.0/go.mod h1:z/736/oNmtGAyU47reJgGN+KVoYoxeLBoj4XkKYscNI= -cloud.google.com/go/auth v0.7.0 h1:kf/x9B3WTbBUHkC+1VS8wwwli9TzhSt0vSTVBmMR8Ts= -cloud.google.com/go/auth v0.7.0/go.mod h1:D+WqdrpcjmiCgWrXmLLxOVq1GACoE36chW6KXoEvuIw= -cloud.google.com/go/auth/oauth2adapt v0.2.2 h1:+TTV8aXpjeChS9M+aTtN/TjdQnzJvmzKFt//oWu7HX4= -cloud.google.com/go/auth/oauth2adapt v0.2.2/go.mod h1:wcYjgpZI9+Yu7LyYBg4pqSiaRkfEK3GQcpb7C/uyF1Q= +cloud.google.com/go/auth v0.7.2 h1:uiha352VrCDMXg+yoBtaD0tUF4Kv9vrtrWPYXwutnDE= +cloud.google.com/go/auth v0.7.2/go.mod h1:VEc4p5NNxycWQTMQEDQF0bd6aTMb6VgYDXEwiJJQAbs= +cloud.google.com/go/auth/oauth2adapt v0.2.3 h1:MlxF+Pd3OmSudg/b1yZ5lJwoXCEaeedAguodky1PcKI= +cloud.google.com/go/auth/oauth2adapt v0.2.3/go.mod h1:tMQXOfZzFuNuUxOypHlQEXgdfX5cuhwU+ffUuXRJE8I= 
cloud.google.com/go/automl v1.5.0/go.mod h1:34EjfoFGMZ5sgJ9EoLsRtdPSNZLcfflJR39VbVNS2M0= cloud.google.com/go/automl v1.6.0/go.mod h1:ugf8a6Fx+zP0D59WLhqgTDsQI9w07o64uf/Is3Nh5p8= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= @@ -72,8 +72,8 @@ cloud.google.com/go/compute v1.6.0/go.mod h1:T29tfhtVbq1wvAPo0E3+7vhgmkOYeXjhFvz cloud.google.com/go/compute v1.6.1/go.mod h1:g85FgpzFvNULZ+S8AYq87axRKuf2Kh7deLqV/jJ3thU= cloud.google.com/go/compute v1.7.0/go.mod h1:435lt8av5oL9P3fv1OEzSbSUe+ybHXGMPQHHZWZxy9U= cloud.google.com/go/compute v1.10.0/go.mod h1:ER5CLbMxl90o2jtNbGSbtfOpQKR0t15FOtRsugnLrlU= -cloud.google.com/go/compute/metadata v0.4.0 h1:vHzJCWaM4g8XIcm8kopr3XmDA4Gy/lblD3EhhSux05c= -cloud.google.com/go/compute/metadata v0.4.0/go.mod h1:SIQh1Kkb4ZJ8zJ874fqVkslA29PRXuleyj6vOzlbK7M= +cloud.google.com/go/compute/metadata v0.5.0 h1:Zr0eK8JbFv6+Wi4ilXAR8FJ3wyNdpxHKJNPos6LTZOY= +cloud.google.com/go/compute/metadata v0.5.0/go.mod h1:aHnloV2TPI38yx4s9+wAZhHykWvVCfu7hQbF+9CWoiY= cloud.google.com/go/containeranalysis v0.5.1/go.mod h1:1D92jd8gRR/c0fGMlymRgxWD3Qw9C1ff6/T7mLgVL8I= cloud.google.com/go/containeranalysis v0.6.0/go.mod h1:HEJoiEIu+lEXM+k7+qLCci0h33lX3ZqoYFdmPcoO7s4= cloud.google.com/go/datacatalog v1.3.0/go.mod h1:g9svFY6tuR+j+hrTw3J2dNcmI0dzmSiyOzm8kpLq0a0= @@ -271,8 +271,8 @@ github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9 github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= -github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-test/deep v1.0.3 h1:ZrJSEWsXzPOxaZnFteGEfooLba+ju3FYIbOrS+rQd68= @@ -868,8 +868,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.188.0 h1:51y8fJ/b1AaaBRJr4yWm96fPcuxSo0JcegXE3DaHQHw= -google.golang.org/api v0.188.0/go.mod h1:VR0d+2SIiWOYG3r/jdm7adPW9hI2aRv9ETOSCQ9Beag= +google.golang.org/api v0.189.0 h1:equMo30LypAkdkLMBqfeIqtyAnlyig1JSZArl4XPwdI= +google.golang.org/api v0.189.0/go.mod h1:FLWGJKb0hb+pU2j+rJqwbnsF+ym+fQs73rbJ+KAUgy8= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -978,12 +978,12 @@ google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqw google.golang.org/genproto v0.0.0-20221014173430-6e2ab493f96b/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto 
v0.0.0-20221014213838-99cd37c6964a/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71/go.mod h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s= -google.golang.org/genproto v0.0.0-20240708141625-4ad9e859172b h1:dSTjko30weBaMj3eERKc0ZVXW4GudCswM3m+P++ukU0= -google.golang.org/genproto v0.0.0-20240708141625-4ad9e859172b/go.mod h1:FfBgJBJg9GcpPvKIuHSZ/aE1g2ecGL74upMzGZjiGEY= +google.golang.org/genproto v0.0.0-20240722135656-d784300faade h1:lKFsS7wpngDgSCeFn7MoLy+wBDQZ1UQIJD4UNM1Qvkg= +google.golang.org/genproto v0.0.0-20240722135656-d784300faade/go.mod h1:FfBgJBJg9GcpPvKIuHSZ/aE1g2ecGL74upMzGZjiGEY= google.golang.org/genproto/googleapis/api v0.0.0-20240701130421-f6361c86f094 h1:0+ozOGcrp+Y8Aq8TLNN2Aliibms5LEzsq99ZZmAGYm0= google.golang.org/genproto/googleapis/api v0.0.0-20240701130421-f6361c86f094/go.mod h1:fJ/e3If/Q67Mj99hin0hMhiNyCRmt6BQ2aWIJshUSJw= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240708141625-4ad9e859172b h1:04+jVzTs2XBnOZcPsLnmrTGqltqJbZQ1Ey26hjYdQQ0= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240708141625-4ad9e859172b/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade h1:oCRSWfwGXQsqlVdErcyTt4A93Y8fo0/9D4b1gnI++qo= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= From ba184ab327bcb0222094d50fea5af0c0a29a88fd Mon Sep 17 00:00:00 2001 From: dgouju Date: Mon, 29 Jul 2024 17:27:12 +0200 Subject: [PATCH 059/118] Move daos agent log to /var/log/daos_agent/ In daos_agent.yml, the log_file setting is by default /tmp/daos_agent.log. 
This commit: - creates a /var/log/daos_agent directory - sets proper permission - configure log_file in daos_agent.yml to write logs to /var/log/daos_agent/daos_agent.log --- .../file-system/parallelstore/scripts/install-daos-client.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/file-system/parallelstore/scripts/install-daos-client.sh b/modules/file-system/parallelstore/scripts/install-daos-client.sh index 84354710d2..ac930a8e24 100644 --- a/modules/file-system/parallelstore/scripts/install-daos-client.sh +++ b/modules/file-system/parallelstore/scripts/install-daos-client.sh @@ -68,6 +68,11 @@ sed -i "s/#.*transport_config/transport_config/g" $daos_config sed -i "s/#.*allow_insecure:.*false/ allow_insecure: true/g" $daos_config sed -i "s/.*access_points.*/access_points: $access_points/g" $daos_config +# Move agent log destination from /tmp/ (default) to /var/log/daos_agent/ +mkdir /var/log/daos_agent +chown daos_agent:daos_agent /var/log/daos_agent +sed -i "s/#.*log_file:.*/log_file: \/var\/log\/daos_agent\/daos_agent.log/g" $daos_config + # Start service if grep -q "ID=\"rocky\"" /etc/os-release && lsb_release -rs | grep -q "8\.[0-9]"; then systemctl start daos_agent.service From 86159278495e126c2b5c5d8fe79cd05b525fc6bd Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 29 Jul 2024 13:10:00 -0500 Subject: [PATCH 060/118] Update a3-highgpu-8g integration test --- .../cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-image.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-image.yml b/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-image.yml index d9dc8b9587..76cf6e5102 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-image.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-image.yml @@ -27,4 +27,4 @@ cli_deployment_vars: region: us-east4 zone: us-east4-a source_image_project_id: deeplearning-platform - source_image: dlvm-tcpd-cu120-20231203-1800-rc0-ubuntu-2004-py310 + source_image: dlvm-tcpd-cu120-648491853-ubuntu-2004-py310 From ae3279d9082d96390bfdffd1056e9eeff557ea82 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 16 Jul 2024 23:08:22 +0000 Subject: [PATCH 061/118] Move get_tpu_vmcount into separate script --- .../slurm_files/etc/job_submit.lua.tpl | 8 +- .../slurm_files/scripts/get_tpu_vmcount.py | 76 +++++++++++++++++++ .../modules/slurm_files/scripts/util.py | 57 -------------- 3 files changed, 80 insertions(+), 61 deletions(-) create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl index 5cf8ddb7e9..f3c9b0750e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/etc/job_submit.lua.tpl @@ -1,10 +1,10 @@ SCRIPTS_DIR = "{scripts_dir}" NO_VAL = 4294967294 ---util.py exit code +--get_tpu_vmcount.py exit code PART_INVALID = -1 --partition does not exists in config.yaml, thus do not exist in slurm DIFF_VMCOUNTS_SAME_PART = -2 --in the same partition there are nodesets with different vmcounts DIFF_PART_DIFFERENT_VMCOUNTS = -3 --partition is a list of partitions in which at least two of them have different vmcount 
-UNKWOWN_ERROR = -4 --util.py did not return a valid response +UNKWOWN_ERROR = -4 --get_tpu_vmcount.py did not return a valid response function get_part(job_desc,part_list) if job_desc.partition then @@ -26,7 +26,7 @@ function os.capture(cmd, raw) end function get_vmcount(part) - local cmd = SCRIPTS_DIR .. "/util.py -p " .. part + local cmd = SCRIPTS_DIR .. "/get_tpu_vmcount.py -p " .. part local out = os.capture(cmd,true) for line in out:gmatch("(.-)\r?\n") do local tag, val = line:match("([^:]+):([^:]+)") @@ -63,7 +63,7 @@ function slurm_job_submit(job_desc, part_list, submit_uid) return slurm.FAILURE end if vmcount == UNKWOWN_ERROR then - slurm.log_user("Something went wrong while executing util.py to get the vmcount.") + slurm.log_user("Something went wrong while executing get_tpu_vmcount.py.") return slurm.ERROR end --This is surely a TPU node diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py new file mode 100644 index 0000000000..08de35fd8a --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 + +# Copyright 2024 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import util + + +def get_vmcount_of_tpu_part(part): + res = 0 + for ns in util.lkp.cfg.partitions[part].partition_nodeset_tpu: + tpu_obj = util.TPU(util.lkp.cfg.nodeset_tpu[ns]) + if res == 0: + res = tpu_obj.vmcount + else: + if res != tpu_obj.vmcount: + # this should not happen, that in the same partition there are different vmcount nodesets + return -1 + return res + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "--partitions", + "-p", + help="The partition(s) to retrieve the TPU vmcount value for.", + ) + args = parser.parse_args() + if not args.partitions: + exit(0) + + # useful exit code + # partition does not exists in config.yaml, thus do not exist in slurm + PART_INVALID = -1 + # in the same partition there are nodesets with different vmcounts + DIFF_VMCOUNTS_SAME_PART = -2 + # partition is a list of partitions in which at least two of them have different vmcount + DIFF_PART_DIFFERENT_VMCOUNTS = -3 + vmcounts = [] + # valid equals to 0 means that we are ok, otherwise it will be set to one of the previously defined exit codes + valid = 0 + for part in args.partitions.split(","): + if part not in util.lkp.cfg.partitions: + valid = PART_INVALID + break + else: + if util.part_is_tpu(part): + vmcount = get_vmcount_of_tpu_part(part) + if vmcount == -1: + valid = DIFF_VMCOUNTS_SAME_PART + break + vmcounts.append(vmcount) + else: + vmcounts.append(0) + # this means that there are different vmcounts for these partitions + if valid == 0 and len(set(vmcounts)) != 1: + valid = DIFF_PART_DIFFERENT_VMCOUNTS + if valid != 0: + print(f"VMCOUNT:{valid}") + else: + print(f"VMCOUNT:{vmcounts[0]}") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index d4ebe9b7c6..626c052223 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -916,20 +916,6 @@ def part_is_tpu(part): """check if partition with name part contains a nodeset of type tpu""" return len(lkp.cfg.partitions[part].partition_nodeset_tpu) > 0 - -def get_vmcount_of_tpu_part(part): - res = 0 - for ns in lkp.cfg.partitions[part].partition_nodeset_tpu: - tpu_obj = TPU(lkp.cfg.nodeset_tpu[ns]) - if res == 0: - res = tpu_obj.vmcount - else: - if res != tpu_obj.vmcount: - # this should not happen, that in the same partition there are different vmcount nodesets - return -1 - return res - - def to_hostnames(nodelist: str) -> List[str]: """make list of hostnames from hostlist expression""" if not nodelist: @@ -1914,46 +1900,3 @@ def scontrol_reconfigure(lkp: Lookup) -> None: save_config(cfg, CONFIG_FILE) lkp = Lookup(cfg) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument( - "--partitions", - "-p", - help="The partition(s) to retrieve the TPU vmcount value for.", - ) - args = parser.parse_args() - if args.partitions: - # useful exit code - # partition does not exists in config.yaml, thus do not exist in slurm - PART_INVALID = -1 - # in the same partition there are nodesets with different vmcounts - DIFF_VMCOUNTS_SAME_PART = -2 - # partition is a list of partitions in which at least 
two of them have different vmcount - DIFF_PART_DIFFERENT_VMCOUNTS = -3 - vmcounts = [] - # valid equals to 0 means that we are ok, otherwise it will be set to one of the previously defined exit codes - valid = 0 - for part in args.partitions.split(","): - if part not in lkp.cfg.partitions: - valid = PART_INVALID - break - else: - if part_is_tpu(part): - vmcount = get_vmcount_of_tpu_part(part) - if vmcount == -1: - valid = DIFF_VMCOUNTS_SAME_PART - break - vmcounts.append(vmcount) - else: - vmcounts.append(0) - # this means that there are different vmcounts for these partitions - if valid == 0 and len(set(vmcounts)) != 1: - valid = DIFF_PART_DIFFERENT_VMCOUNTS - if valid != 0: - print(f"VMCOUNT:{valid}") - else: - print(f"VMCOUNT:{vmcounts[0]}") From 20c5e4df655f43ac64b73267bf11b73897a6e9c5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:27:43 +0000 Subject: [PATCH 062/118] Bump grpcio from 1.56.0 to 1.56.2 in /community/front-end/ofe Bumps [grpcio](https://github.com/grpc/grpc) from 1.56.0 to 1.56.2. - [Release notes](https://github.com/grpc/grpc/releases) - [Changelog](https://github.com/grpc/grpc/blob/master/doc/grpc_release_schedule.md) - [Commits](https://github.com/grpc/grpc/compare/v1.56.0...v1.56.2) --- updated-dependencies: - dependency-name: grpcio dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 6f57f7f7fa..ddb12d0e87 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -38,7 +38,7 @@ google-resumable-media==2.5.0 googleapis-common-protos==1.59.1 grafana-api==1.0.3 grpc-google-iam-v1==0.12.6 -grpcio==1.56.0 +grpcio==1.56.2 grpcio-status==1.56.0 h11==0.14.0 httplib2==0.22.0 From e49761384263dec663cddb7766aecb2dd4881851 Mon Sep 17 00:00:00 2001 From: dgouju Date: Tue, 30 Jul 2024 18:10:27 +0200 Subject: [PATCH 063/118] Creates the log dir with -p to make it bullet proof. 
--- .../file-system/parallelstore/scripts/install-daos-client.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/file-system/parallelstore/scripts/install-daos-client.sh b/modules/file-system/parallelstore/scripts/install-daos-client.sh index ac930a8e24..6fe3da1d41 100644 --- a/modules/file-system/parallelstore/scripts/install-daos-client.sh +++ b/modules/file-system/parallelstore/scripts/install-daos-client.sh @@ -69,7 +69,7 @@ sed -i "s/#.*allow_insecure:.*false/ allow_insecure: true/g" $daos_config sed -i "s/.*access_points.*/access_points: $access_points/g" $daos_config # Move agent log destination from /tmp/ (default) to /var/log/daos_agent/ -mkdir /var/log/daos_agent +mkdir -p /var/log/daos_agent chown daos_agent:daos_agent /var/log/daos_agent sed -i "s/#.*log_file:.*/log_file: \/var\/log\/daos_agent\/daos_agent.log/g" $daos_config From f702db3fc9b04c59153c38897bbc880fab437c11 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Tue, 30 Jul 2024 17:00:16 +0000 Subject: [PATCH 064/118] update gke test for g2-latest-driver pool to use defualt disk_type --- tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml index b5457e1396..6e64a667a1 100644 --- a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml @@ -55,7 +55,6 @@ deployment_groups: use: [gke_cluster] settings: name: g2-latest-driver - disk_type: pd-balanced machine_type: g2-standard-4 guest_accelerator: - gpu_driver_installation_config: From cc33cfb706352f37c9a6bda3202c56d701832af3 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 31 Jul 2024 01:17:44 +0000 Subject: [PATCH 065/118] Babysit. Colorize output * Conditionally colorize output (if terminal supports it); * Add flag `--nocolor`; * Remove flag `--pretty` as link rendering doesn't work almost always. --- tools/cloud-build/babysit/cli_ui.py | 14 +++++++++----- tools/cloud-build/babysit/runner.py | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/cloud-build/babysit/cli_ui.py b/tools/cloud-build/babysit/cli_ui.py index 06c09129a3..608bf3c114 100644 --- a/tools/cloud-build/babysit/cli_ui.py +++ b/tools/cloud-build/babysit/cli_ui.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import sys from typing import Sequence, Dict, Optional import time from enum import Enum @@ -27,10 +28,10 @@ class Color(Enum): END = "\033[0m" class CliUI: # implements UIProto - def __init__(self, pretty=False) -> None: + def __init__(self, no_color=False) -> None: self._status: Dict[str, Status] = {} self._change = False - self._pretty = pretty + self._no_color = no_color def on_init(self, builds: Sequence[Build]) -> None: for b in builds: @@ -68,6 +69,10 @@ def _render_summary(self, builds: Sequence[Build]) -> None: for bc in ordered: print(self._render_build(bc.build, bc.count)) + def _color(self) -> bool: + if self._no_color: return False + return hasattr(sys.stdout, 'isatty') and sys.stdout.isatty() + def _render_build(self, build: Build, count:int=1) -> str: status = self._render_status(build.status) cnt = f"[{count}]" if count > 1 else " " @@ -76,7 +81,7 @@ def _render_build(self, build: Build, count:int=1) -> str: def _render_status(self, status: Optional[Status]) -> str: sn = "NONE" if status is None else status.name - if not self._pretty: return sn + if not self._color(): return sn CM = { Status.SUCCESS: Color.GREEN, Status.FAILURE: Color.RED, @@ -91,5 +96,4 @@ def _render_status(self, status: Optional[Status]) -> str: def _render_link(self, build: Build) -> str: name, url = trig_name(build), build.log_url - if not self._pretty: return f"{name}\t{url}" - return f"\033]8;;{url}\033\\{name}\033]8;;\033\\" + return f"{name}\t{url}" diff --git a/tools/cloud-build/babysit/runner.py b/tools/cloud-build/babysit/runner.py index 4d8fca373c..3a54a4f26a 100644 --- a/tools/cloud-build/babysit/runner.py +++ b/tools/cloud-build/babysit/runner.py @@ -129,9 +129,9 @@ def run_from_cli(): parser.add_argument("-r", "--retries", type=int, default=1, help="Number of retries, to disable retries set to 0, default is 1") # Non-runner args - parser.add_argument("--pretty", action="store_true", help="Render pretty output") + parser.add_argument("--nocolor", action="store_true", help="Do not use color in output") cli_args = vars(parser.parse_args()) - ui = CliUI(pretty=cli_args.pop("pretty")) + ui = CliUI(no_color=cli_args.pop("nocolor")) run(RunnerArgs(**cli_args), ui) From cb70f708d1c9793f46dfc2f81836855ba7a246c7 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 19 Jul 2024 22:09:29 +0000 Subject: [PATCH 066/118] Trim SlurmGCP `instanceProperties` --- .../modules/slurm_files/scripts/resume.py | 50 ++++--------------- 1 file changed, 11 insertions(+), 39 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index f539e4601d..242eb84a23 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -63,53 +63,26 @@ def instance_properties(nodeset, model, placement_group, labels=None): - template = lkp.node_template(model) - template_info = lkp.template_info(template) - props = NSDict() - slurm_metadata = { - "slurm_cluster_name": cfg.slurm_cluster_name, - "slurm_instance_role": "compute", - "startup-script": ( - Path(cfg.slurm_scripts_dir or util.dirs.scripts) / "startup.sh" - ).read_text(), - "VmDnsSetting": "GlobalOnly", - } - info_metadata = { - item.get("key"): item.get("value") for item in template_info.metadata["items"] - } + if labels: # merge in extra labels on instance and 
disks + template = lkp.node_template(model) + template_info = lkp.template_info(template) - props_metadata = {**info_metadata, **slurm_metadata} - props.metadata = { - "items": [NSDict({"key": k, "value": v}) for k, v in props_metadata.items()] - } - - labels = { - "slurm_cluster_name": cfg.slurm_cluster_name, - "slurm_instance_role": "compute", - **(labels or {}), - } - props.labels = {**template_info.labels, **labels} - - for disk in template_info.disks: - # do not label local ssd - if ( - "diskType" not in disk.initializeParams - or disk.initializeParams.diskType == "local-ssd" - ): - continue - disk.initializeParams.labels.update(labels) - props.disks = template_info.disks + props.labels = {**template_info.labels, **labels} + + for disk in template_info.disks: + if disk.initializeParams.get("diskType", "local-ssd") == "local-ssd": + continue # do not label local ssd + disk.initializeParams.labels.update(labels) + props.disks = template_info.disks if placement_group: props.scheduling = { "onHostMaintenance": "TERMINATE", "automaticRestart": False, } - props.resourcePolicies = [ - placement_group, - ] + props.resourcePolicies = [placement_group] if nodeset.reservation_name: reservation_name = nodeset.reservation_name @@ -154,7 +127,6 @@ def per_instance_properties(node): return props - def create_instances_request(nodes, partition_name, placement_group, job_id=None): """Call regionInstances.bulkInsert to create instances""" assert len(nodes) > 0 From 3c529c8a2e74df535d8df3f8d9b47ed4e1f58465 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 27 Jul 2024 07:13:20 +0000 Subject: [PATCH 067/118] SlurmGCP.setup don't take a `slurmd_feature` as an argument Don't take a `slurmd_feature` as an argument, instead look it up from metadata in python script. 
Motivation: * simplify invocation by hand; * simplify startup script --- .../modules/slurm_files/scripts/setup.py | 45 ++++++++++--------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index cba9fe928e..f3b9b871cb 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -320,7 +320,7 @@ def configure_dirs(): scripts_log.symlink_to(dirs.log) -def setup_controller(args): +def setup_controller(): """Run controller setup""" log.info("Setting up controller") util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600) @@ -382,7 +382,7 @@ def setup_controller(args): pass -def setup_login(args): +def setup_login(): """run login node setup""" log.info("Setting up login") slurmctld_host = f"{lkp.control_host}" @@ -413,7 +413,7 @@ def setup_login(args): log.info("Done setting up login") -def setup_compute(args): +def setup_compute(): """run compute node setup""" log.info("Setting up compute") util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600) @@ -423,9 +423,17 @@ def setup_compute(args): slurmd_options = [ f'--conf-server="{slurmctld_host}:{lkp.control_host_port}"', ] - if args.slurmd_feature is not None: - slurmd_options.append(f'--conf="Feature={args.slurmd_feature}"') + + try: + slurmd_feature = util.instance_metadata("attributes/slurmd_feature") + except Exception: + # TODO: differentiate between unset and error + slurmd_feature = None + + if slurmd_feature is not None: + slurmd_options.append(f'--conf="Feature={slurmd_feature}"') slurmd_options.append("-Z") + sysconf = f"""SLURMD_OPTIONS='{" ".join(slurmd_options)}'""" update_system_config("slurmd", sysconf) install_custom_scripts() @@ -452,21 +460,18 @@ def setup_compute(args): log.info("Done setting up compute") -def main(args): +def main(): start_motd() configure_dirs() # call the setup function for the instance type - setup = dict.get( - { - "controller": setup_controller, - "compute": setup_compute, - "login": setup_login, - }, + { + "controller": setup_controller, + "compute": setup_compute, + "login": setup_login, + }.get( lkp.instance_role, - lambda: log.fatal(f"Unknown node role: {lkp.instance_role}"), - ) - setup(args) + lambda: log.fatal(f"Unknown node role: {lkp.instance_role}"))() end_motd() @@ -477,12 +482,8 @@ def main(args): parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter ) - parser.add_argument( - "--slurmd-feature", - dest="slurmd_feature", - help="Feature for slurmd to register with. 
Controller ignores this option.", - ) - args = parser.parse_args() + parser.add_argument("--slurmd-feature", dest="slurmd_feature", help="Unused, to be removed.") + _ = util.add_log_args_and_parse(parser) util.config_root_logger(filename, logfile=LOGFILE) sys.excepthook = util.handle_exception @@ -490,7 +491,7 @@ def main(args): lkp = util.Lookup(cfg) # noqa F811 try: - main(args) + main() except subprocess.TimeoutExpired as e: log.error( f"""TimeoutExpired: From 58f7530da476c901632c8262f22d29351887d729 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 31 Jul 2024 06:07:37 +0000 Subject: [PATCH 068/118] Remove `tests` from devel archive --- .../modules/slurm_files/main.tf | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index ff4d6899d1..40a38c5409 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -150,12 +150,9 @@ data "archive_file" "slurm_gcp_devel_zip" { source_dir = local.scripts_dir excludes = flatten([ - "config.yaml", - "Pipfile", - fileset(local.scripts_dir, "__pycache__/*"), - fileset(local.scripts_dir, "*.log"), - fileset(local.scripts_dir, "*.cache"), - fileset(local.scripts_dir, "*.lock"), + fileset(local.scripts_dir, "tests/**"), + # TODO: consider removing (including nested) __pycache__ and all .* files + # Though it only affects developers ]) } From 2eebca8a36ffb843c9d31bf6204704300eae631c Mon Sep 17 00:00:00 2001 From: dgouju Date: Wed, 31 Jul 2024 12:12:57 +0200 Subject: [PATCH 069/118] Move daos agent log to /var/log/daos_agent/ --- .../scripts/install-daos-client.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh index 84354710d2..6fe3da1d41 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh +++ b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh @@ -68,6 +68,11 @@ sed -i "s/#.*transport_config/transport_config/g" $daos_config sed -i "s/#.*allow_insecure:.*false/ allow_insecure: true/g" $daos_config sed -i "s/.*access_points.*/access_points: $access_points/g" $daos_config +# Move agent log destination from /tmp/ (default) to /var/log/daos_agent/ +mkdir -p /var/log/daos_agent +chown daos_agent:daos_agent /var/log/daos_agent +sed -i "s/#.*log_file:.*/log_file: \/var\/log\/daos_agent\/daos_agent.log/g" $daos_config + # Start service if grep -q "ID=\"rocky\"" /etc/os-release && lsb_release -rs | grep -q "8\.[0-9]"; then systemctl start daos_agent.service From dd518e496a3bf7516ec2ee3d8ac6efd3de1e63c4 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 30 Jul 2024 23:07:51 +0000 Subject: [PATCH 070/118] Babysit. 
Use short URL if using `hpc-toolkit-dev` --- tools/cloud-build/babysit/cli_ui.py | 10 ++++++++-- tools/cloud-build/babysit/runner.py | 3 ++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/cloud-build/babysit/cli_ui.py b/tools/cloud-build/babysit/cli_ui.py index 608bf3c114..2b16e2c3ca 100644 --- a/tools/cloud-build/babysit/cli_ui.py +++ b/tools/cloud-build/babysit/cli_ui.py @@ -28,10 +28,11 @@ class Color(Enum): END = "\033[0m" class CliUI: # implements UIProto - def __init__(self, no_color=False) -> None: + def __init__(self, no_color=False, short_url=False) -> None: self._status: Dict[str, Status] = {} self._change = False self._no_color = no_color + self._short_url = short_url def on_init(self, builds: Sequence[Build]) -> None: for b in builds: @@ -94,6 +95,11 @@ def _render_status(self, status: Optional[Status]) -> str: clr = CM.get(status, def_color).value return f"{clr}{sn}{Color.END.value}" + def _url(self, build: Build) -> str: + if not self._short_url: + return build.log_url + return f"go/ghpc-cb/{build.id}" + def _render_link(self, build: Build) -> str: - name, url = trig_name(build), build.log_url + name, url = trig_name(build), self._url(build) return f"{name}\t{url}" diff --git a/tools/cloud-build/babysit/runner.py b/tools/cloud-build/babysit/runner.py index 3a54a4f26a..78e0b453b5 100644 --- a/tools/cloud-build/babysit/runner.py +++ b/tools/cloud-build/babysit/runner.py @@ -132,6 +132,7 @@ def run_from_cli(): parser.add_argument("--nocolor", action="store_true", help="Do not use color in output") cli_args = vars(parser.parse_args()) - ui = CliUI(no_color=cli_args.pop("nocolor")) + short_url = cli_args.get("project") == "hpc-toolkit-dev" + ui = CliUI(no_color=cli_args.pop("nocolor"), short_url=short_url) run(RunnerArgs(**cli_args), ui) From cd858169eb35df1ec5828f9d455fcdb0b6fdf73d Mon Sep 17 00:00:00 2001 From: Alyssa Date: Wed, 26 Jun 2024 17:12:33 +0000 Subject: [PATCH 071/118] Add local ssd RAID0 startup script --- modules/scripts/startup-script/README.md | 3 +- .../startup-script/files/setup-raid.yml | 76 +++++++++++++++++++ modules/scripts/startup-script/main.tf | 15 +++- modules/scripts/startup-script/variables.tf | 25 ++++++ modules/scripts/startup-script/versions.tf | 2 +- 5 files changed, 118 insertions(+), 3 deletions(-) create mode 100644 modules/scripts/startup-script/files/setup-raid.yml diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index cb0f6582d0..7c1e4e7ee5 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -253,7 +253,7 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 0.14.0 | +| [terraform](#requirement\_terraform) | >= 1.3 | | [google](#requirement\_google) | >= 3.83 | | [local](#requirement\_local) | >= 2.0.0 | | [random](#requirement\_random) | ~> 3.0 | @@ -298,6 +298,7 @@ No modules. | [install\_docker](#input\_install\_docker) | Install Docker command line tool and daemon. | `bool` | `false` | no | | [install\_stackdriver\_agent](#input\_install\_stackdriver\_agent) | Run Google Stackdriver Agent installation script if set to true. Preferred over ops agent for performance. | `bool` | `false` | no | | [labels](#input\_labels) | Labels for the created GCS bucket. Key-value pairs. 
| `map(string)` | n/a | yes | +| [local\_ssd\_filesystem](#input\_local\_ssd\_filesystem) | Create and mount a filesystem from local SSD disks (data will be lost if VMs are powered down without enabling migration); enable by setting mountpoint field to a valid directory path. |
object({
fs_type = optional(string, "ext4")
mountpoint = optional(string, "")
})
|
{
"fs_type": "ext4",
"mountpoint": ""
}
| no | | [prepend\_ansible\_installer](#input\_prepend\_ansible\_installer) | DEPRECATED. Use `install_ansible=false` to prevent ansible installation. | `bool` | `null` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created | `string` | n/a | yes | | [region](#input\_region) | The region to deploy to | `string` | n/a | yes | diff --git a/modules/scripts/startup-script/files/setup-raid.yml b/modules/scripts/startup-script/files/setup-raid.yml new file mode 100644 index 0000000000..429d2b5594 --- /dev/null +++ b/modules/scripts/startup-script/files/setup-raid.yml @@ -0,0 +1,76 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +- name: Configure local SSDs + become: true + hosts: localhost + vars: + raid_name: localssd + array_dev: /dev/md/{{ raid_name }} + fstype: ext4 + interface: nvme + mode: '0755' + tasks: + - name: Get local SSD devices + ansible.builtin.find: + file_type: link + path: /dev/disk/by-id + patterns: google-local-{{ "nvme-" if interface == "nvme" else "" }}ssd-* + register: local_ssd_devices + + - name: Exit if zero local ssd found + ansible.builtin.meta: end_play + when: local_ssd_devices.files | length == 0 + + - name: Install mdadm + ansible.builtin.package: + name: mdadm + state: present + + - name: Force RAID array if only 1 local SSD + ansible.builtin.shell: mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any --level=0 --raid-devices=1 /dev/disk/by-id/google-local-nvme-ssd-0 --force + args: + creates: "{{ array_dev }}" + when: local_ssd_devices.files | length == 1 + + - name: Create RAID array + ansible.builtin.shell: mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any --level=0 --raid-devices={{ local_ssd_devices.files | length }} /dev/disk/by-id/google-local-nvme-ssd-* + args: + creates: "{{ array_dev }}" + when: local_ssd_devices.files | length >= 2 + + - name: Format filesystem + community.general.filesystem: + fstype: "{{ fstype }}" + device: "{{ array_dev }}" + opts: '{{ "-m 0" if fstype == "ext4" else "" }}' + + - name: Mount RAID array + ansible.posix.mount: + src: "{{ array_dev }}" + path: '{{ mountpoint | default("/mnt/" + raid_name) }}' + fstype: "{{ fstype }}" + # the nofail option is critical as it enables the boot process to complete on machines + # that were powered off and had local SSD contents discarded; without this option + # VMs may fail to join the network + opts: discard,defaults,nofail + state: mounted + + - name: Set mount permissions + ansible.builtin.file: + path: '{{ mountpoint | default("/mnt/" + raid_name) }}' + state: directory + mode: "{{ mode }}" diff --git a/modules/scripts/startup-script/main.tf b/modules/scripts/startup-script/main.tf index 0486d78533..1f29afa72f 100644 --- a/modules/scripts/startup-script/main.tf +++ b/modules/scripts/startup-script/main.tf @@ -98,8 +98,20 @@ locals { }, ] + raid_setup = var.local_ssd_filesystem == {} ? 
[] : [ + { + type = "ansible-local" + destination = "setup-raid.yml" + content = file("${path.module}/files/setup-raid.yml") + args = join(" ", [ + "-e mountpoint=${var.local_ssd_filesystem.mountpoint}", + "-e fs_type=${var.local_ssd_filesystem.fs_type}", + ]) + }, + ] + supplied_ansible_runners = anytrue([for r in var.runners : r.type == "ansible-local"]) - has_ansible_runners = anytrue([local.supplied_ansible_runners, local.configure_ssh, var.install_docker]) + has_ansible_runners = anytrue([local.supplied_ansible_runners, local.configure_ssh, var.install_docker, can(coalesce(var.local_ssd_filesystem.mountpoint))]) install_ansible = coalesce(var.install_ansible, local.has_ansible_runners) ansible_installer = local.install_ansible ? [{ type = "shell" @@ -122,6 +134,7 @@ locals { local.ansible_installer, local.configure_ssh_runners, local.docker_runner, + local.raid_setup, var.runners ) diff --git a/modules/scripts/startup-script/variables.tf b/modules/scripts/startup-script/variables.tf index 56bd9d583d..2b78d96c06 100644 --- a/modules/scripts/startup-script/variables.tf +++ b/modules/scripts/startup-script/variables.tf @@ -126,6 +126,31 @@ variable "install_docker" { nullable = false } +variable "local_ssd_filesystem" { + description = "Create and mount a filesystem from local SSD disks (data will be lost if VMs are powered down without enabling migration); enable by setting mountpoint field to a valid directory path." + type = object({ + fs_type = optional(string, "ext4") + mountpoint = optional(string, "") + }) + + validation { + condition = can(coalesce(var.local_ssd_filesystem.fs_type)) + error_message = "var.local_ssd_filesystem.fs_type must be set to a filesystem supported by the Linux distribution." + } + + validation { + condition = var.local_ssd_filesystem.mountpoint == "" || startswith(var.local_ssd_filesystem.mountpoint, "/") + error_message = "To enable local SSD filesystems, var.local_ssd_filesystem.mountpoint must be set to an absolute path to a mountpoint." + } + + default = { + fs_type = "ext4" + mountpoint = "" + } + + nullable = false +} + variable "install_cloud_ops_agent" { description = "Warning: Consider using `install_stackdriver_agent` for better performance. Run Google Ops Agent installation script if set to true." type = bool diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index 964e9e3567..111547e71f 100644 --- a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -33,5 +33,5 @@ terraform { module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.36.0" } - required_version = ">= 0.14.0" + required_version = ">= 1.3" } From 41f355d2f6101851560c518388503a648226294a Mon Sep 17 00:00:00 2001 From: alyssa-sm <146790241+alyssa-sm@users.noreply.github.com> Date: Wed, 31 Jul 2024 12:11:25 -0700 Subject: [PATCH 072/118] Apply suggestions to avoid inclusion in all instances of startup-script Co-authored-by: Tom Downes --- modules/scripts/startup-script/main.tf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/scripts/startup-script/main.tf b/modules/scripts/startup-script/main.tf index 1f29afa72f..99e26b4d05 100644 --- a/modules/scripts/startup-script/main.tf +++ b/modules/scripts/startup-script/main.tf @@ -98,7 +98,8 @@ locals { }, ] - raid_setup = var.local_ssd_filesystem == {} ? [] : [ + local_ssd_filesystem_enabled = can(coalesce(var.local_ssd_filesystem.mountpoint)) + raid_setup = local.local_ssd_filesystem_enabled ? 
[] : [ { type = "ansible-local" destination = "setup-raid.yml" @@ -111,7 +112,7 @@ locals { ] supplied_ansible_runners = anytrue([for r in var.runners : r.type == "ansible-local"]) - has_ansible_runners = anytrue([local.supplied_ansible_runners, local.configure_ssh, var.install_docker, can(coalesce(var.local_ssd_filesystem.mountpoint))]) + has_ansible_runners = anytrue([local.supplied_ansible_runners, local.configure_ssh, var.install_docker, local.local_ssd_filesystem_enabled]) install_ansible = coalesce(var.install_ansible, local.has_ansible_runners) ansible_installer = local.install_ansible ? [{ type = "shell" From e8057f65706e555059e90a1c8f4823a88194433a Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 31 Jul 2024 14:50:28 -0500 Subject: [PATCH 073/118] Update modules/scripts/startup-script/main.tf --- modules/scripts/startup-script/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/scripts/startup-script/main.tf b/modules/scripts/startup-script/main.tf index 99e26b4d05..25e403dc09 100644 --- a/modules/scripts/startup-script/main.tf +++ b/modules/scripts/startup-script/main.tf @@ -99,7 +99,7 @@ locals { ] local_ssd_filesystem_enabled = can(coalesce(var.local_ssd_filesystem.mountpoint)) - raid_setup = local.local_ssd_filesystem_enabled ? [] : [ + raid_setup = !local.local_ssd_filesystem_enabled ? [] : [ { type = "ansible-local" destination = "setup-raid.yml" From 5e1c5985a92e02397fa50d600b660803bd98f622 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Wed, 31 Jul 2024 21:32:24 +0000 Subject: [PATCH 074/118] Update workflows to reference cluster-toolkit --- .github/workflows/dependency-review.yml | 2 +- .github/workflows/pr-label-validation.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index f0f99a2665..d858ec26aa 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -26,7 +26,7 @@ permissions: jobs: dependency-review: - if: github.repository == 'GoogleCloudPlatform/hpc-toolkit' + if: github.repository == 'GoogleCloudPlatform/cluster-toolkit' runs-on: ubuntu-latest steps: - name: 'Checkout Repository' diff --git a/.github/workflows/pr-label-validation.yml b/.github/workflows/pr-label-validation.yml index dda59b7161..9fe508fadf 100644 --- a/.github/workflows/pr-label-validation.yml +++ b/.github/workflows/pr-label-validation.yml @@ -32,7 +32,7 @@ on: jobs: pr-label-validation: - if: github.repository == 'GoogleCloudPlatform/hpc-toolkit' + if: github.repository == 'GoogleCloudPlatform/cluster-toolkit' runs-on: ubuntu-latest permissions: pull-requests: read From 3a2495bf5f44068034880a9e981a1a03293693d9 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 30 Jul 2024 22:29:57 +0000 Subject: [PATCH 075/118] Update to SlurmGCP 6.6.0 --- community/examples/AMD/hpc-amd-slurm.yaml | 2 +- community/examples/hpc-slurm-ubuntu2004.yaml | 2 +- community/examples/hpc-slurm6-apptainer.yaml | 2 +- .../README.md | 4 ++-- .../schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf | 2 +- .../source_image_logic.tf | 10 +++++----- .../variables.tf | 2 +- .../schedmd-slurm-gcp-v6-nodeset-tpu/README.md | 2 +- .../variables.tf | 2 +- .../schedmd-slurm-gcp-v6-nodeset/README.md | 2 +- .../source_image_logic.tf | 10 +++++----- .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 16 ++++++++-------- .../controller.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- 
.../schedmd-slurm-gcp-v6-controller/partition.tf | 6 +++--- .../source_image_logic.tf | 10 +++++----- .../variables_controller_instance.tf | 2 +- .../schedmd-slurm-gcp-v6-login/README.md | 2 +- .../source_image_logic.tf | 10 +++++----- .../schedmd-slurm-gcp-v6-login/variables.tf | 2 +- examples/cae/cae-slurm.yaml | 2 +- examples/hpc-enterprise-slurm.yaml | 2 +- examples/hpc-slurm-static.yaml | 2 +- examples/image-builder.yaml | 2 +- .../a3-megagpu-8g/slurm-a3mega-image.yaml | 2 +- examples/ml-slurm.yaml | 2 +- .../daily-tests/blueprints/lustre-slurm.yaml | 6 +++--- .../daily-tests/tests/slurm-v6-centos7.yml | 2 +- .../daily-tests/tests/slurm-v6-debian.yml | 2 +- .../test_configs/node-groups.yaml | 6 +++--- 31 files changed, 63 insertions(+), 63 deletions(-) diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index 31f30bd1a4..0eb1f71571 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -168,7 +168,7 @@ deployment_groups: # these images must match the images used by Slurm modules below because # we are building OpenMPI with PMI support in libraries contained in # Slurm installation - family: slurm-gcp-6-5-hpc-rocky-linux-8 + family: slurm-gcp-6-6-hpc-rocky-linux-8 project: schedmd-slurm-public - id: low_cost_nodeset diff --git a/community/examples/hpc-slurm-ubuntu2004.yaml b/community/examples/hpc-slurm-ubuntu2004.yaml index cfc8a1ff02..ed3a587fb9 100644 --- a/community/examples/hpc-slurm-ubuntu2004.yaml +++ b/community/examples/hpc-slurm-ubuntu2004.yaml @@ -24,7 +24,7 @@ vars: slurm_image: # Please refer to the following link for the latest images: # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - family: slurm-gcp-6-5-ubuntu-2004-lts + family: slurm-gcp-6-6-ubuntu-2004-lts project: schedmd-slurm-public instance_image_custom: true diff --git a/community/examples/hpc-slurm6-apptainer.yaml b/community/examples/hpc-slurm6-apptainer.yaml index 49c0388ed2..09a02fa9d4 100644 --- a/community/examples/hpc-slurm6-apptainer.yaml +++ b/community/examples/hpc-slurm6-apptainer.yaml @@ -60,7 +60,7 @@ deployment_groups: settings: source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-5-hpc-rocky-linux-8 + source_image_family: slurm-gcp-6-6-hpc-rocky-linux-8 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index d948cbefdf..d76c627bc3 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,7 +74,7 @@ modules. 
For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.8 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.0 | ## Resources @@ -104,7 +104,7 @@ modules. For support with the underlying modules, see the instructions in the | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [feature](#input\_feature) | The node feature, used to bind nodes to the nodeset. If not set, the nodeset name will be used. | `string` | `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-5-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index ca76477c4d..90e42683a4 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -54,7 +54,7 @@ data "google_compute_default_service_account" "default" { module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.8" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.0" project_id = var.project_id region = var.region diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf index 40b2e53ef8..6198e7539d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf @@ -18,11 +18,11 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-5-debian-11", - "slurm-gcp-6-5-hpc-rocky-linux-8", - "slurm-gcp-6-5-ubuntu-2004-lts", - "slurm-gcp-6-5-ubuntu-2204-lts-arm64", - "slurm-gcp-6-5-hpc-centos-7" + "slurm-gcp-6-6-debian-11", + "slurm-gcp-6-6-hpc-rocky-linux-8", + "slurm-gcp-6-6-ubuntu-2004-lts", + "slurm-gcp-6-6-ubuntu-2204-lts-arm64", + "slurm-gcp-6-6-hpc-centos-7" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf index 1e23f2c815..504fd8bad5 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf @@ -68,7 +68,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-5-hpc-rocky-linux-8" + family = "slurm-gcp-6-6-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md index 3a39ff48c5..825860ac4b 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -64,7 +64,7 @@ No modules. | [accelerator\_config](#input\_accelerator\_config) | Nodeset accelerator config, see https://cloud.google.com/tpu/docs/supported-tpu-configurations for details. |
object({
topology = string
version = string
})
|
{
"topology": "",
"version": ""
}
| no | | [data\_disks](#input\_data\_disks) | The data disks to include in the TPU node | `list(string)` | `[]` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | DEPRECATED: Use `enable_public_ips` instead. | `bool` | `null` | no | -| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-5-tf- | `string` | `null` | no | +| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-6-tf- | `string` | `null` | no | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set.
If setting manually, ensure a unique value across all nodesets. | `string` | n/a | yes | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf index 341de73112..3761707b3e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf @@ -103,7 +103,7 @@ variable "data_disks" { } variable "docker_image" { - description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-5-tf-" + description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-6-tf-" type = string default = null } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index a80ed5343e..a4ef29d0f1 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -173,7 +173,7 @@ No modules. | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-5-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for compute nodes. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf index 40b2e53ef8..6198e7539d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf @@ -18,11 +18,11 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-5-debian-11", - "slurm-gcp-6-5-hpc-rocky-linux-8", - "slurm-gcp-6-5-ubuntu-2004-lts", - "slurm-gcp-6-5-ubuntu-2204-lts-arm64", - "slurm-gcp-6-5-hpc-centos-7" + "slurm-gcp-6-6-debian-11", + "slurm-gcp-6-6-hpc-rocky-linux-8", + "slurm-gcp-6-6-ubuntu-2004-lts", + "slurm-gcp-6-6-ubuntu-2204-lts-arm64", + "slurm-gcp-6-6-hpc-centos-7" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index dcf1a4d9fa..3abfdca56e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -80,7 +80,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-5-hpc-rocky-linux-8" + family = "slurm-gcp-6-6-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 3ef36c189a..35e819ee5a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -196,14 +196,14 @@ limitations under the License. 
|------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.13 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.13 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.0 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.0 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.13 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.13 | -| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.5.13 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.13 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.5.13 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.0 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.0 | +| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.6.0 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.0 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.6.0 | ## Resources @@ -261,7 +261,7 @@ limitations under the License. | [extra\_logging\_flags](#input\_extra\_logging\_flags) | The only available flag is `trace_api` | `map(bool)` | `{}` | no | | [gcloud\_path\_override](#input\_gcloud\_path\_override) | Directory of the gcloud executable to be used during cleanup | `string` | `""` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-5-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for controller. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index e0dae4e02d..2262acf718 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -36,7 +36,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.13" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.0" project_id = var.project_id region = var.region @@ -92,7 +92,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.5.13" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.0" access_config = var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 3e1d980c70..103c224046 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.13" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.0" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -57,7 +57,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.5.13" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.0" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index c51188859b..bd4b9486b9 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local nodeset module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.13" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.0" for_each = local.nodeset_map project_id = var.project_id @@ -67,7 +67,7 @@ module "slurm_nodeset_template" { } module "slurm_nodeset" { - source = 
"github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.5.13" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.6.0" for_each = local.nodeset_map instance_template_self_link = module.slurm_nodeset_template[each.key].self_link @@ -87,7 +87,7 @@ module "slurm_nodeset" { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.5.13" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.6.0" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf index 40b2e53ef8..6198e7539d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf @@ -18,11 +18,11 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-5-debian-11", - "slurm-gcp-6-5-hpc-rocky-linux-8", - "slurm-gcp-6-5-ubuntu-2004-lts", - "slurm-gcp-6-5-ubuntu-2204-lts-arm64", - "slurm-gcp-6-5-hpc-centos-7" + "slurm-gcp-6-6-debian-11", + "slurm-gcp-6-6-hpc-rocky-linux-8", + "slurm-gcp-6-6-ubuntu-2004-lts", + "slurm-gcp-6-6-ubuntu-2204-lts-arm64", + "slurm-gcp-6-6-hpc-centos-7" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf index 72135de09f..d40cceec2a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf @@ -267,7 +267,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-5-hpc-rocky-linux-8" + family = "slurm-gcp-6-6-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index e1f1cfa233..1e5e31e774 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -100,7 +100,7 @@ No modules. | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-5-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for login nodes. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf index 40b2e53ef8..6198e7539d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf @@ -18,11 +18,11 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-5-debian-11", - "slurm-gcp-6-5-hpc-rocky-linux-8", - "slurm-gcp-6-5-ubuntu-2004-lts", - "slurm-gcp-6-5-ubuntu-2204-lts-arm64", - "slurm-gcp-6-5-hpc-centos-7" + "slurm-gcp-6-6-debian-11", + "slurm-gcp-6-6-hpc-rocky-linux-8", + "slurm-gcp-6-6-ubuntu-2004-lts", + "slurm-gcp-6-6-ubuntu-2204-lts-arm64", + "slurm-gcp-6-6-hpc-centos-7" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf index 41a130b955..880f56f440 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf @@ -325,7 +325,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-5-hpc-rocky-linux-8" + family = "slurm-gcp-6-6-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/examples/cae/cae-slurm.yaml b/examples/cae/cae-slurm.yaml index 4367e9741a..920d1967ba 100644 --- a/examples/cae/cae-slurm.yaml +++ b/examples/cae/cae-slurm.yaml @@ -40,7 +40,7 @@ vars: # for a list of valid family options with Slurm; note: the image types for the compute nodes # and the Chrome Remote Desktop (CRD) need to have the same Slurm base. 
instance_image: - family: slurm-gcp-6-5-hpc-rocky-linux-8 + family: slurm-gcp-6-6-hpc-rocky-linux-8 project: schedmd-slurm-public # Documentation for each of the modules used below can be found at diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index 82755a1796..bea609e2cb 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -25,7 +25,7 @@ vars: slurm_image: # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm - family: slurm-gcp-6-5-hpc-rocky-linux-8 + family: slurm-gcp-6-6-hpc-rocky-linux-8 project: schedmd-slurm-public # If image above is changed to use custom image, then setting below must be set to true instance_image_custom: false diff --git a/examples/hpc-slurm-static.yaml b/examples/hpc-slurm-static.yaml index fc1af6f5b7..41f2aac52c 100644 --- a/examples/hpc-slurm-static.yaml +++ b/examples/hpc-slurm-static.yaml @@ -29,7 +29,7 @@ vars: static_node_count: 2 ## Must be <= number of reserved machines ## slurm_instance_image: - family: slurm-gcp-6-5-hpc-rocky-linux-8 + family: slurm-gcp-6-6-hpc-rocky-linux-8 project: schedmd-slurm-public instance_image_custom: false # true if using custom image in lines above bandwidth_tier: gvnic_enabled diff --git a/examples/image-builder.yaml b/examples/image-builder.yaml index e20c982f64..6bc7f6161d 100644 --- a/examples/image-builder.yaml +++ b/examples/image-builder.yaml @@ -59,7 +59,7 @@ deployment_groups: settings: source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-5-hpc-rocky-linux-8 + source_image_family: slurm-gcp-6-6-hpc-rocky-linux-8 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size) diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index 090f87b668..e9572dc52c 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -108,7 +108,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.5.12 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.6.0 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/ml-slurm.yaml b/examples/ml-slurm.yaml index 000677d4a0..81a78b59a1 100644 --- a/examples/ml-slurm.yaml +++ b/examples/ml-slurm.yaml @@ -139,7 +139,7 @@ deployment_groups: omit_external_ip: false source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-5-debian-11 + source_image_family: slurm-gcp-6-6-debian-11 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size_gb) diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml index 5d4324ce0b..9a0a7e41e0 100644 --- 
a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml +++ b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml @@ -27,10 +27,10 @@ vars: # on_host_maintenance: MIGRATE num_nodes: 1 centos_image: - family: slurm-gcp-6-5-hpc-centos-7 + family: slurm-gcp-6-6-hpc-centos-7 project: schedmd-slurm-public rocky_image: - family: slurm-gcp-6-5-hpc-rocky-linux-8 + family: slurm-gcp-6-6-hpc-rocky-linux-8 project: schedmd-slurm-public deployment_groups: @@ -82,7 +82,7 @@ deployment_groups: # settings: # node_count_dynamic_max: $(vars.num_nodes) # instance_image: - # family: slurm-gcp-6-5-ubuntu-2004-lts + # family: slurm-gcp-6-6-ubuntu-2004-lts # project: schedmd-slurm-public # - id: ubuntu_partition diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-centos7.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-centos7.yml index 8440b32c80..dcc3d41bc8 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v6-centos7.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v6-centos7.yml @@ -22,7 +22,7 @@ slurm_cluster_name: "cent7{{ build[0:5] }}" cli_deployment_vars: network_name: "{{ network }}" - slurm_image: "{family: slurm-gcp-6-5-hpc-centos-7, project: schedmd-slurm-public}" + slurm_image: "{family: slurm-gcp-6-6-hpc-centos-7, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml index d3ba9d840b..77bbea5edc 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml @@ -22,7 +22,7 @@ slurm_cluster_name: "debiv6{{ build[0:4] }}" cli_deployment_vars: network_name: "{{ network }}" - slurm_image: "{family: slurm-gcp-6-5-debian-11, project: schedmd-slurm-public}" + slurm_image: "{family: slurm-gcp-6-6-debian-11, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c diff --git a/tools/validate_configs/test_configs/node-groups.yaml b/tools/validate_configs/test_configs/node-groups.yaml index 0ab7ae31c1..52dc14f64c 100644 --- a/tools/validate_configs/test_configs/node-groups.yaml +++ b/tools/validate_configs/test_configs/node-groups.yaml @@ -64,7 +64,7 @@ deployment_groups: name: c30 machine_type: c2-standard-30 instance_image: - family: slurm-gcp-6-5-debian-11 + family: slurm-gcp-6-6-debian-11 project: schedmd-slurm-public instance_image_custom: true @@ -75,7 +75,7 @@ deployment_groups: name: c60 machine_type: c2-standard-60 instance_image: - family: slurm-gcp-6-5-hpc-centos-7 + family: slurm-gcp-6-6-hpc-centos-7 project: schedmd-slurm-public - id: nodeset_3 @@ -85,7 +85,7 @@ deployment_groups: name: cd112 machine_type: c2d-standard-112 instance_image: - family: slurm-gcp-6-5-hpc-centos-7 + family: slurm-gcp-6-6-hpc-centos-7 project: schedmd-slurm-public instance_image_custom: true enable_smt: true From 7ccd089089dfae819f146b8269da5b60326e6d0f Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 31 Jul 2024 01:06:00 +0000 Subject: [PATCH 076/118] TPU. 
Update JAX version and run in venv --- .../daily-tests/ansible_playbooks/test-slurm-v6-tpu.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-slurm-v6-tpu.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-slurm-v6-tpu.yml index 8308182af8..ef2dadfa03 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-slurm-v6-tpu.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-slurm-v6-tpu.yml @@ -48,7 +48,9 @@ delay: 100 ansible.builtin.command: | srun -N 1 -p tpu bash -c ' - pip install --upgrade 'jax[tpu]>0.3.0' -f https://storage.googleapis.com/jax-releases/libtpu_releases.html + set -e + pip install --upgrade "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html + pip install --upgrade ml_dtypes python3 -c " import sys import jax From 25e7a4e8f8f7364047bc12d1e1bf50ed4da4fe21 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Wed, 31 Jul 2024 23:00:39 +0000 Subject: [PATCH 077/118] Update documentation to reflect rebrand to Cluster Toolkit --- README.md | 4 +-- community/front-end/ofe/docs/admin_guide.md | 12 ++++----- .../fsi-montecarlo-on-batch/README.md | 22 ++++++++-------- .../healthcare-and-life-sciences/README.md | 26 +++++++++---------- .../lysozyme-example/README.md | 4 +-- examples/cae/README.md | 20 +++++++------- .../machine-learning/a3-highgpu-8g/README.md | 16 ++++++------ 7 files changed, 52 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 82b9a50fb1..9586c9461c 100644 --- a/README.md +++ b/README.md @@ -33,8 +33,8 @@ If a self directed path is preferred, you can use the following commands to build the `gcluster` binary: ```shell -git clone https://github.com/GoogleCloudPlatform/hpc-toolkit -cd hpc-toolkit +git clone https://github.com/GoogleCloudPlatform/cluster-toolkit +cd cluster-toolkit make ./gcluster --version ./gcluster --help diff --git a/community/front-end/ofe/docs/admin_guide.md b/community/front-end/ofe/docs/admin_guide.md index 770de22576..5f6f1c7f65 100644 --- a/community/front-end/ofe/docs/admin_guide.md +++ b/community/front-end/ofe/docs/admin_guide.md @@ -23,7 +23,7 @@ administrators, additional Django superusers can be created from the Admin site within TKFE, once it is deployed and running. The TFKE web application server uses the -[Cluster Toolkit](https://github.com/GoogleCloudPlatform/hpc-toolkit) to +[Cluster Toolkit](https://github.com/GoogleCloudPlatform/cluster-toolkit) to provision resources for networks, filesystems and clusters, using a service account that has its credentials registered to TKFE. The service account is used for access management and billing. @@ -66,8 +66,8 @@ Clone the repository, checkout the corresponding branch, and switch to the `frontend` directory as follows: ```bash -git clone https://github.com/GoogleCloudPlatform/hpc-toolkit.git -cd hpc-toolkit +git clone https://github.com/GoogleCloudPlatform/cluster-toolkit.git +cd cluster-toolkit cd community/frontend/ofe ``` @@ -268,7 +268,7 @@ admin has to click *Edit Subnet* to create at least one subnet in the VPC. Once the network and subnet(s) are defined, click the ‘Apply Cloud Changes’ button to trigger creation of the VPC and subnet(s). - + ### Import an existing VPC @@ -495,7 +495,7 @@ files will show errors from the Django web application. Cloud resource deployment log files (from Terraform) are typically shown via the FrontEnd web site. 
If those logs are not being shown, they can be found on the service machine under -`/opt/gcluster/hpc-toolkit/frontend/(clusters|fs|vpc)/...`. +`/opt/gcluster/cluster-toolkit/frontend/(clusters|fs|vpc)/...`. Cluster Toolkit log files will also be found in those directories. The Terraform log files and status files will be down a few directories, based off of the Cluster Number, Deployment ID, and Terraform directory. @@ -517,7 +517,7 @@ missing. Use the [IAM permissions reference](https://cloud.google.com/iam/docs/p to research this and identify additional roles to add to your user account. Before any attempt to redeploy the TKFE, make sure to run -`terraform destroy` in `hpc-toolkit/frontend/tf` to remove cloud resources that +`terraform destroy` in `cluster-toolkit/frontend/tf` to remove cloud resources that have been already created. ### Cluster problems diff --git a/docs/tutorials/fsi-montecarlo-on-batch/README.md b/docs/tutorials/fsi-montecarlo-on-batch/README.md index 3916cfdbe8..dcf6bd7674 100644 --- a/docs/tutorials/fsi-montecarlo-on-batch/README.md +++ b/docs/tutorials/fsi-montecarlo-on-batch/README.md @@ -45,20 +45,20 @@ via a [PubSub BigQuery subscription](https://cloud.google.com/pubsub/docs/bigque 1. Get the Cluster Toolkit configured. -Build the `ghpc` binary: +Build the `gcluster` binary: ```shell -git clone https://github.com/GoogleCloudPlatform/hpc-toolkit -cd hpc-toolkit +git clone https://github.com/GoogleCloudPlatform/cluster-toolkit +cd cluster-toolkit make -./ghpc --version -./ghpc --help +./gcluster --version +./gcluster --help ``` -2\. Run `ghpc` on the blueprint `fsi-montecarlo-on-batch.yaml` +2\. Run `gcluster` on the blueprint `fsi-montecarlo-on-batch.yaml` ```bash -./ghpc create community/examples/fsi-montecarlo-on-batch.yaml \ +./gcluster create community/examples/fsi-montecarlo-on-batch.yaml \ --vars "project_id=${GOOGLE_CLOUD_PROJECT}" ``` @@ -74,14 +74,14 @@ If successful, you will see output similar to:

To deploy your infrastructure please run: -./ghpc deploy fsimontecarlo +./gcluster deploy fsimontecarlo

3\. Deploy the blueprint as instructed: ```bash -./ghpc deploy fsimontecarlo +./gcluster deploy fsimontecarlo ``` If successful, this will prompt you: @@ -228,8 +228,8 @@ ensure you are not billed for any of the Cloud usage. ### Alternatively -The other choice is to run a `ghpc destroy` command. +The other choice is to run a `gcluster destroy` command. ```bash -./ghpc destroy fsimontecarlo +./gcluster destroy fsimontecarlo ``` diff --git a/docs/videos/healthcare-and-life-sciences/README.md b/docs/videos/healthcare-and-life-sciences/README.md index dfd8a231d5..de76730b9c 100644 --- a/docs/videos/healthcare-and-life-sciences/README.md +++ b/docs/videos/healthcare-and-life-sciences/README.md @@ -97,8 +97,8 @@ storage intact and b) you can build software before you deploy your cluster. 1. Clone the repo ```bash - git clone https://github.com/GoogleCloudPlatform/hpc-toolkit.git - cd hpc-toolkit + git clone https://github.com/GoogleCloudPlatform/cluster-toolkit.git + cd cluster-toolkit ``` 1. Build the Cluster Toolkit @@ -114,12 +114,12 @@ storage intact and b) you can build software before you deploy your cluster. the cloud buckets being destroyed, it is recommended you run: ```bash - ./ghpc create examples/hcls-blueprint.yaml -w --vars project_id= --vars bucket_force_delete=true + ./gcluster create examples/hcls-blueprint.yaml -w --vars project_id= --vars bucket_force_delete=true ``` The `bucket_force_delete` variable makes it easier to tear down the deployment. If it is set to the default value of `false`, buckets with - objects (files) will not be deleted and the `./ghpc destroy` command will + objects (files) will not be deleted and the `./gcluster destroy` command will fail partway through. If the data stored in the buckets should be preseverved, remove the @@ -127,10 +127,10 @@ storage intact and b) you can build software before you deploy your cluster. 1. Deploy the `enable_apis` group - Call the following ghpc command to deploy the the hcls blueprint. + Call the following gcluster command to deploy the the hcls blueprint. ```bash - ./ghpc deploy hcls-01 + ./gcluster deploy hcls-01 ``` This will prompt you to **display**, **apply**, **stop**, or **continue** @@ -140,20 +140,20 @@ storage intact and b) you can build software before you deploy your cluster. cluster. > [!WARNING] - > This ghpc command will run through 4 groups (`enable_apis`, `setup`, + > This gcluster command will run through 4 groups (`enable_apis`, `setup`, > `software_installation`, and `cluster`) and prompt you to apply each one. > If the command is cancelled or exited by accident before finishing, it can > be rerun to continue deploying the blueprint. 1. Deploy the `setup` group - The next `ghpc` prompt will ask you to **display**, **apply**, **stop**, or + The next `gcluster` prompt will ask you to **display**, **apply**, **stop**, or **continue** without applying the `setup` group. Select 'apply'. This group will create a network and file systems to be used by the cluster. > [!NOTE] - > At this point do not proceed with the ghpc prompt for the `cluster` group. + > At this point do not proceed with the gcluster prompt for the `cluster` group. > Continue with the steps below before proceeding. This step will create a storage bucket for depositing software. The bucket @@ -163,7 +163,7 @@ storage intact and b) you can build software before you deploy your cluster. Here are two ways to locate the bucket name: - 1. At the end of the `setup` deployment, ghpc should output a line + 1. 
At the end of the `setup` deployment, gcluster should output a line `Outputs:`. Under that there should be a line similar to `gcs_bucket_path_bucket-software = "gs://hcls-user-provided-software-hcls-01-84d0b51e"`, the bucket name is located within the quotes after `gs://` @@ -200,7 +200,7 @@ storage intact and b) you can build software before you deploy your cluster. 1. Deploy the `software_installation` group. Once the file from the prior step has been completely uploaded, you can - return to the ghpc command which will ask you to **display**, **apply**, + return to the gcluster command which will ask you to **display**, **apply**, **stop**, or **continue** without applying the `software_installation` group. Select 'apply'. @@ -221,7 +221,7 @@ storage intact and b) you can build software before you deploy your cluster. 1. Deploy the `cluster` group - The next `ghpc` prompt will ask you to **display**, **apply**, **stop**, or + The next `gcluster` prompt will ask you to **display**, **apply**, **stop**, or **continue** without applying the `cluster` group. Select 'apply'. This deployment group contains the Slurm cluster and the Chrome remote @@ -249,7 +249,7 @@ destroyed first. You can use the following commands to destroy the deployment. > associated costs. ```bash -./ghpc destroy hcls-01 --auto-approve +./gcluster destroy hcls-01 --auto-approve ``` > [!NOTE] diff --git a/docs/videos/healthcare-and-life-sciences/lysozyme-example/README.md b/docs/videos/healthcare-and-life-sciences/lysozyme-example/README.md index 0922f38b49..44c9741506 100644 --- a/docs/videos/healthcare-and-life-sciences/lysozyme-example/README.md +++ b/docs/videos/healthcare-and-life-sciences/lysozyme-example/README.md @@ -49,8 +49,8 @@ button or by any other means. 1. Copy the contents of this directory into the submission directory ```bash - git clone https://github.com/GoogleCloudPlatform/hpc-toolkit.git - cp -r hpc-toolkit/docs/videos/healthcare-and-life-sciences/lysozyme-example/* . + git clone https://github.com/GoogleCloudPlatform/cluster-toolkit.git + cp -r cluster-toolkit/docs/videos/healthcare-and-life-sciences/lysozyme-example/* . ``` 1. Copy the Lysozyme protein into the submission directory diff --git a/examples/cae/README.md b/examples/cae/README.md index 1e50aa964a..6107bf831e 100644 --- a/examples/cae/README.md +++ b/examples/cae/README.md @@ -95,8 +95,8 @@ storage intact and b) you can build software before you deploy your cluster. 1. Clone the repo ```bash - git clone https://github.com/GoogleCloudPlatform/hpc-toolkit.git - cd hpc-toolkit + git clone https://github.com/GoogleCloudPlatform/cluster-toolkit.git + cd cluster-toolkit ``` 1. Build the Cluster Toolkit @@ -109,24 +109,24 @@ storage intact and b) you can build software before you deploy your cluster. id. ```bash - ./ghpc create community/examples/cae-slurm.yaml -w --vars project_id= + ./gcluster create community/examples/cae-slurm.yaml -w --vars project_id= ``` 1. Deploy the `setup` group - Call the following ghpc command to deploy the cae-slurm blueprint. + Call the following gcluster command to deploy the cae-slurm blueprint. ```bash - ./ghpc deploy cae-slurm + ./gcluster deploy cae-slurm ``` - The next `ghpc` prompt will ask you to **display**, **apply**, **stop**, or + The next `gcluster` prompt will ask you to **display**, **apply**, **stop**, or **continue** without applying the `setup` group. Select 'apply'. This group will create a network and file systems to be used by the cluster. 
> [!WARNING] - > This ghpc command will run through 2 deployment groups (3 if you populate + > This gcluster command will run through 2 deployment groups (3 if you populate > & activate the `software_installation` stage) and prompt you to apply each one. > If the command is cancelled or exited by accident before finishing, it can > be rerun to continue deploying the blueprint. @@ -148,13 +148,13 @@ storage intact and b) you can build software before you deploy your cluster. > [Software Installation Patterns](#software-installation-patterns) for more information. If this deployment group is used (needs to be uncomment in the blueprint first), - you can return to the ghpc command which will ask you to **display**, **apply**, + you can return to the gcluster command which will ask you to **display**, **apply**, **stop**, or **continue** without applying the `software_installation` group. Select 'apply'. 1. Deploy the `cluster` group - The next `ghpc` prompt will ask you to **display**, **apply**, **stop**, or + The next `gcluster` prompt will ask you to **display**, **apply**, **stop**, or **continue** without applying the `cluster` group. Select 'apply'. This deployment group contains the Slurm cluster and with compute partitions @@ -182,7 +182,7 @@ commands to destroy the deployment in this reverse order. You will be prompted to confirm the deletion of each stage. ```bash -./ghpc destroy cae-slurm +./gcluster destroy cae-slurm ``` > [!WARNING] diff --git a/examples/machine-learning/a3-highgpu-8g/README.md b/examples/machine-learning/a3-highgpu-8g/README.md index a4ac24646e..86aa5232be 100644 --- a/examples/machine-learning/a3-highgpu-8g/README.md +++ b/examples/machine-learning/a3-highgpu-8g/README.md @@ -22,7 +22,7 @@ Please follow the initial instructions for: Verify that your release of the Cluster Toolkit is 1.31.1 or later. ```shell -ghpc --version +gcluster --version ``` The solution requires several Python packages to be available. We recommend @@ -35,7 +35,7 @@ pip3 install -r \ https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.11.1/scripts/requirements.txt ``` -**Always** activate the environment before running any ghpc commands such as +**Always** activate the environment before running any gcluster commands such as deploy or destroy. ```shell @@ -194,7 +194,7 @@ and a Filestore `/home` filesystem. Run the standard Toolkit workflow at the command line (approx. 5 minutes): ```shell -ghpc deploy ml-slurm-a3-0-base.yaml --auto-approve +gcluster deploy ml-slurm-a3-0-base.yaml --auto-approve ``` Several values will be output to the screen. The output will be similar to: @@ -226,7 +226,7 @@ Build the custom image using ml-slurm-a3-1-image.yaml and the same workflow as above. Run at the command line: ```shell -ghpc deploy ml-slurm-a3-1-image.yaml --auto-approve +gcluster deploy ml-slurm-a3-1-image.yaml --auto-approve ``` The image will take approximately 30 minutes to build. @@ -243,7 +243,7 @@ The image will take approximately 30 minutes to build. Provision the cluster blueprint (approximately 5-10 minutes): ```shell -ghpc deploy ml-slurm-a3-2-cluster.yaml --auto-approve +gcluster deploy ml-slurm-a3-2-cluster.yaml --auto-approve ``` ## Receive Data Path Manager (RxDM) @@ -307,8 +307,8 @@ using an alternative image. 
### Clone the Cluster Toolkit repository containing the NCCL benchmark ```shell -git clone https://github.com/GoogleCloudPlatform/hpc-toolkit -cd hpc-toolkit/examples/machine-learning/a3-highgpu-8g/nccl-tests +git clone https://github.com/GoogleCloudPlatform/cluster-toolkit +cd cluster-toolkit/examples/machine-learning/a3-highgpu-8g/nccl-tests ``` ### Import the PyTorch image from the NVIDIA Container Registry @@ -331,4 +331,4 @@ sbatch run-nccl-tests.sh [consume]: https://cloud.google.com/compute/docs/instances/reservations-consume#consuming_instances_from_any_matching_reservation [tkdeps]: https://cloud.google.com/cluster-toolkit/docs/setup/install-dependencies -[tkinstall]: https://github.com/GoogleCloudPlatform/hpc-toolkit/#quickstart +[tkinstall]: https://github.com/GoogleCloudPlatform/cluster-toolkit/#quickstart From f702ab011227ece0007f64b920efae4a6f0ad72e Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 31 Jul 2024 21:23:00 +0000 Subject: [PATCH 078/118] Babysit. Make CLI UI less verbose --- tools/cloud-build/babysit/cli_ui.py | 15 ++++++++++----- tools/cloud-build/babysit/runner.py | 1 - 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/cloud-build/babysit/cli_ui.py b/tools/cloud-build/babysit/cli_ui.py index 2b16e2c3ca..8831bbe070 100644 --- a/tools/cloud-build/babysit/cli_ui.py +++ b/tools/cloud-build/babysit/cli_ui.py @@ -16,6 +16,7 @@ from typing import Sequence, Dict, Optional import time from enum import Enum +from collections import defaultdict from .core import Status, Build, latest_by_trigger, trig_name @@ -40,7 +41,6 @@ def on_init(self, builds: Sequence[Build]) -> None: if not builds: print("found no builds") else: - print(f"found {len(builds)} builds:") self._render_summary(builds) def on_done(self, builds: Sequence[Build]) -> None: @@ -51,24 +51,29 @@ def on_done(self, builds: Sequence[Build]) -> None: def on_update(self, builds: Sequence[Build]) -> None: for b in builds: if b.status != self._status.get(b.id): - br = self._render_build(b) - sr = self._render_status(self._status.get(b.id)) - print(f"status update: {sr} > {br}") + print(self._render_build(b)) self._change = True self._status[b.id] = b.status def on_action(self, action: str, build: Build) -> None: - print(f"{action} {self._render_build(build)}") + pass def sleep(self, sec: int) -> None: time.sleep(sec) def _render_summary(self, builds: Sequence[Build]) -> None: order_fn = lambda bc: (bc.build.status, trig_name(bc.build)) + cnt = defaultdict(int) ordered = sorted(latest_by_trigger(builds).values(), key=order_fn) for bc in ordered: print(self._render_build(bc.build, bc.count)) + cnt[bc.build.status] += 1 + + print(f"------- TOTAL:{sum(cnt.values())} ", end="") + for s, c in cnt.items(): + print(f"| {self._render_status(s)}: {c} ", end="") + print("") def _color(self) -> bool: if self._no_color: return False diff --git a/tools/cloud-build/babysit/runner.py b/tools/cloud-build/babysit/runner.py index 78e0b453b5..a9c05354ed 100644 --- a/tools/cloud-build/babysit/runner.py +++ b/tools/cloud-build/babysit/runner.py @@ -54,7 +54,6 @@ def get_changed_files_tags(files: Collection[str]) -> set[str]: parts = f.split("/") if len(parts) < 3: continue tags.add(f"m.{parts[2]}") - print(f"Auto tags: {tags}") # TODO: use UI to log return tags @dataclass From 1cde207375955e7a486f3ff3f7f993f8ab6ca542 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Thu, 1 Aug 2024 00:00:58 +0000 Subject: [PATCH 079/118] Update integration tests to use gcluster --- .../base-integration-test.yml | 18 ++++++++--------- 
.../htcondor-integration-test.yml | 20 +++++++++---------- .../multigroup-integration-test.yml | 14 ++++++------- .../slurm-integration-test.yml | 20 +++++++++---------- ...ailure.yml => rescue_gcluster_failure.yml} | 6 +++--- 5 files changed, 39 insertions(+), 39 deletions(-) rename tools/cloud-build/daily-tests/ansible_playbooks/tasks/{rescue_ghpc_failure.yml => rescue_gcluster_failure.yml} (89%) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml index 592637f351..8fd04acac8 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml @@ -14,7 +14,7 @@ --- -- name: "Setup Integration tests for HPC toolkit" +- name: "Setup Integration tests for Cluster Toolkit" hosts: localhost tasks: ## Create SSH Keys @@ -46,10 +46,10 @@ - name: Create Infrastructure and test block: - - name: Create Cluster with GHPC + - name: Create Cluster with gcluster register: deployment changed_when: deployment.changed - ansible.builtin.command: ./ghpc deploy {{ deployment_name }} --auto-approve + ansible.builtin.command: ./gcluster deploy {{ deployment_name }} --auto-approve args: chdir: "{{ workspace }}" environment: @@ -137,10 +137,10 @@ ## Cleanup and fail gracefully rescue: - - name: Capture ghpc stderr + - name: Capture gcluster stderr failed_when: false ansible.builtin.set_fact: - ghpc_stderr: "{{ deployment.stderr | replace('\n',' ') }}" + gcluster_stderr: "{{ deployment.stderr | replace('\n',' ') }}" - name: Gather logs ansible.builtin.include_tasks: @@ -148,9 +148,9 @@ apply: delegate_to: localhost - - name: Include rescue from ghpc failure + - name: Include rescue from gcluster failure ansible.builtin.include_tasks: - file: tasks/rescue_ghpc_failure.yml + file: tasks/rescue_gcluster_failure.yml apply: delegate_to: localhost vars: @@ -177,7 +177,7 @@ vars: timeout_seconds: "{{ startup_timeout_seconds }}" - - name: Run Integration tests for HPC toolkit + - name: Run Integration tests for Cluster Toolkit ansible.builtin.include_tasks: "{{ test }}" vars: remote_node: "{{ remote_node }}" @@ -190,7 +190,7 @@ always: - name: Cleanup firewall and infrastructure ansible.builtin.include_tasks: - file: tasks/rescue_ghpc_failure.yml + file: tasks/rescue_gcluster_failure.yml apply: delegate_to: localhost vars: diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml index 02b7d8a31c..74c7c2ea46 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml @@ -14,7 +14,7 @@ --- -- name: "Setup Integration tests for HPC toolkit" +- name: "Setup Integration tests for Cluster Toolkit" hosts: localhost tasks: ## Create SSH Keys @@ -44,10 +44,10 @@ file: tasks/create_deployment_directory.yml - name: Create Infrastructure and test block: - - name: Execute ghpc deploy + - name: Execute gcluster deploy register: deployment changed_when: deployment.changed - ansible.builtin.command: ./ghpc deploy {{ deployment_name }} --auto-approve + ansible.builtin.command: ./gcluster deploy {{ deployment_name }} --auto-approve args: chdir: "{{ workspace }}" environment: @@ -115,10 +115,10 @@ - delete - "{{ deployment_name }}" - name: Destroy deployment - register: ghpc_destroy - changed_when: 
ghpc_destroy.changed + register: gcluster_destroy + changed_when: gcluster_destroy.changed ignore_errors: true - ansible.builtin.command: ./ghpc destroy {{ deployment_name }} --auto-approve + ansible.builtin.command: ./gcluster destroy {{ deployment_name }} --auto-approve args: chdir: "{{ workspace }}" environment: @@ -158,7 +158,7 @@ port: 9618 delay: 10 timeout: 480 - - name: Run Integration tests for HPC toolkit + - name: Run Integration tests for Cluster Toolkit ansible.builtin.include_tasks: "{{ test }}" vars: access_point: "{{ access_point }}" @@ -180,10 +180,10 @@ - "{{ deployment_name }}" - name: Destroy deployment delegate_to: localhost - register: ghpc_destroy - changed_when: ghpc_destroy.changed + register: gcluster_destroy + changed_when: gcluster_destroy.changed ignore_errors: true - ansible.builtin.command: ./ghpc destroy {{ deployment_name }} --auto-approve + ansible.builtin.command: ./gcluster destroy {{ deployment_name }} --auto-approve args: chdir: "{{ workspace }}" environment: diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml index 6c9aa2127b..939c17bb17 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml @@ -14,7 +14,7 @@ --- -- name: "Multigroup integration test for ghpc deploy command" +- name: "Multigroup integration test for gcluster deploy command" hosts: localhost force_handlers: true vars: @@ -25,20 +25,20 @@ file: tasks/create_deployment_directory.yml - name: Deploy from deployment directory block: - - name: Execute ghpc deploy + - name: Execute gcluster deploy register: deployment changed_when: deployment.changed - ansible.builtin.command: ./ghpc deploy {{ deployment_name }} --auto-approve + ansible.builtin.command: ./gcluster deploy {{ deployment_name }} --auto-approve args: chdir: "{{ workspace }}" environment: TF_IN_AUTOMATION: "TRUE" always: - name: Destroy deployment - register: ghpc_destroy - changed_when: ghpc_destroy.changed + register: gcluster_destroy + changed_when: gcluster_destroy.changed ignore_errors: true - ansible.builtin.command: ./ghpc destroy {{ deployment_name }} --auto-approve + ansible.builtin.command: ./gcluster destroy {{ deployment_name }} --auto-approve args: chdir: "{{ workspace }}" environment: @@ -58,6 +58,6 @@ - packer_group_name is defined - packer_module_id is defined - name: Trigger Cloud Build failure - when: ghpc_destroy is failed or image_deletion is failed + when: gcluster_destroy is failed or image_deletion is failed ansible.builtin.fail: msg: "Failed while destroying deployment or deleting custom VM image" diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml index 0fa3a6e313..4afc25457a 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml @@ -14,7 +14,7 @@ --- -- name: "Setup Integration tests for HPC toolkit" +- name: "Setup Integration tests for Cluster Toolkit" hosts: localhost tasks: ## Create SSH Keys @@ -53,10 +53,10 @@ loop_control: loop_var: pre_deploy_task - - name: Create Cluster with GHPC + - name: Create Cluster with gcluster register: deployment changed_when: deployment.changed - ansible.builtin.command: ./ghpc deploy {{ deployment_name 
}} --auto-approve + ansible.builtin.command: ./gcluster deploy {{ deployment_name }} --auto-approve args: chdir: "{{ workspace }}" environment: @@ -155,10 +155,10 @@ ## Cleanup and fail gracefully rescue: - - name: Capture ghpc stderr + - name: Capture gcluster stderr failed_when: false ansible.builtin.set_fact: - ghpc_stderr: "{{ deployment.stderr }}" + gcluster_stderr: "{{ deployment.stderr }}" - name: Gather logs ansible.builtin.include_tasks: @@ -166,9 +166,9 @@ apply: delegate_to: localhost - - name: Include rescue from ghpc failure + - name: Include rescue from gcluster failure ansible.builtin.include_tasks: - file: tasks/rescue_ghpc_failure.yml + file: tasks/rescue_gcluster_failure.yml apply: delegate_to: localhost vars: @@ -187,7 +187,7 @@ - name: Slurm Test Block vars: ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" - ansible_remote_tmp: "/tmp/ghpc/" + ansible_remote_tmp: "/tmp/gcluster/" block: - name: Wait until host is reachable ansible.builtin.wait_for_connection: @@ -213,7 +213,7 @@ retries: 60 delay: 15 - - name: Run Integration tests for HPC toolkit + - name: Run Integration tests for Cluster Toolkit ansible.builtin.include_tasks: "{{ test }}" vars: login_node: "{{ login_node }}" @@ -307,7 +307,7 @@ - name: Cleanup firewall and infrastructure ansible.builtin.include_tasks: - file: tasks/rescue_ghpc_failure.yml + file: tasks/rescue_gcluster_failure.yml apply: delegate_to: localhost vars: diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_ghpc_failure.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml similarity index 89% rename from tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_ghpc_failure.yml rename to tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml index 9cba1cc3f2..29280d638e 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_ghpc_failure.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml @@ -34,10 +34,10 @@ block: - name: Destroy deployment - register: ghpc_destroy - changed_when: ghpc_destroy.changed + register: gcluster_destroy + changed_when: gcluster_destroy.changed run_once: true - ansible.builtin.command: ./ghpc destroy {{ deployment_name }} --auto-approve + ansible.builtin.command: ./gcluster destroy {{ deployment_name }} --auto-approve args: chdir: "{{ workspace }}" environment: From 297af20848679d1983ae7e1927b0a75012c1a0cf Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 31 Jul 2024 22:39:33 +0000 Subject: [PATCH 080/118] SlurmGCP move to 5.12 --- .../examples/AMD/hpc-amd-slurm-v5-legacy.yaml | 2 +- .../hpc-slurm-chromedesktop-v5-legacy.yaml | 4 ++-- .../hpc-slurm-ubuntu2004-v5-legacy.yaml | 2 +- .../schedmd-slurm-gcp-v5-node-group/README.md | 2 +- .../source_image_logic.tf | 10 +++++----- .../variables.tf | 4 ++-- .../README.md | 2 +- .../main.tf | 2 +- .../variables.tf | 2 +- .../schedmd-slurm-gcp-v5-partition/README.md | 2 +- .../schedmd-slurm-gcp-v5-partition/main.tf | 2 +- .../variables.tf | 2 +- .../schedmd-slurm-gcp-v5-controller/README.md | 6 +++--- .../schedmd-slurm-gcp-v5-controller/main.tf | 4 ++-- .../source_image_logic.tf | 10 +++++----- .../variables.tf | 4 ++-- .../schedmd-slurm-gcp-v5-hybrid/README.md | 2 +- .../schedmd-slurm-gcp-v5-hybrid/main.tf | 2 +- .../schedmd-slurm-gcp-v5-login/README.md | 6 +++--- .../schedmd-slurm-gcp-v5-login/main.tf | 4 ++-- .../source_image_logic.tf | 10 +++++----- .../schedmd-slurm-gcp-v5-login/variables.tf | 4 ++-- 
.../demo-with-cloud-controller-instructions.md | 2 +- .../deploy-instructions.md | 4 ++-- .../on-prem-instructions.md | 18 +++++++++--------- docs/image-building.md | 2 +- examples/cae/cae-slurm-v5-legacy.yaml | 4 ++-- examples/hpc-enterprise-slurm-v5-legacy.yaml | 2 +- examples/image-builder-v5-legacy.yaml | 2 +- .../a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 2 +- examples/ml-slurm-v5-legacy.yaml | 2 +- .../blueprints/lustre-slurm-v5-legacy.yaml | 6 +++--- .../daily-tests/tests/slurm-v5-debian.yml | 2 +- .../daily-tests/tests/slurm-v5-rocky8.yml | 2 +- .../test_configs/node-groups-v5-legacy.yaml | 4 ++-- .../slurm-static-test-v5-legacy.yaml | 8 ++++---- 36 files changed, 74 insertions(+), 74 deletions(-) diff --git a/community/examples/AMD/hpc-amd-slurm-v5-legacy.yaml b/community/examples/AMD/hpc-amd-slurm-v5-legacy.yaml index eb3a0ed56e..c92044511f 100644 --- a/community/examples/AMD/hpc-amd-slurm-v5-legacy.yaml +++ b/community/examples/AMD/hpc-amd-slurm-v5-legacy.yaml @@ -171,7 +171,7 @@ deployment_groups: # these images must match the images used by Slurm modules below because # we are building OpenMPI with PMI support in libraries contained in # Slurm installation - family: slurm-gcp-5-11-hpc-centos-7 + family: slurm-gcp-5-12-hpc-centos-7 project: schedmd-slurm-public - id: low_cost_node_group diff --git a/community/examples/hpc-slurm-chromedesktop-v5-legacy.yaml b/community/examples/hpc-slurm-chromedesktop-v5-legacy.yaml index 3c1011f6a2..1a223d55cc 100644 --- a/community/examples/hpc-slurm-chromedesktop-v5-legacy.yaml +++ b/community/examples/hpc-slurm-chromedesktop-v5-legacy.yaml @@ -22,10 +22,10 @@ vars: region: us-central1 zone: us-central1-c instance_image_crd: - family: slurm-gcp-5-11-debian-11 + family: slurm-gcp-5-12-debian-11 project: schedmd-slurm-public instance_image: - family: slurm-gcp-5-11-hpc-centos-7 + family: slurm-gcp-5-12-hpc-centos-7 project: schedmd-slurm-public # Documentation for each of the modules used below can be found at diff --git a/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml b/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml index dfbcb2ee60..6b1875353a 100644 --- a/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml +++ b/community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml @@ -24,7 +24,7 @@ vars: instance_image: # Please refer to the following link for the latest images: # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - family: slurm-gcp-5-11-ubuntu-2004-lts + family: slurm-gcp-5-12-ubuntu-2004-lts project: schedmd-slurm-public instance_image_custom: true diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md index 865b464993..295954afa3 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md @@ -136,7 +136,7 @@ No modules. | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-11-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-12-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf index a1a13a8a6b..a92bd5fc8d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf @@ -18,11 +18,11 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-5-11-debian-11", - "slurm-gcp-5-11-hpc-rocky-linux-8", - "slurm-gcp-5-11-ubuntu-2004-lts", - "slurm-gcp-5-11-ubuntu-2204-lts-arm64", - "slurm-gcp-5-11-hpc-centos-7" + "slurm-gcp-5-12-debian-11", + "slurm-gcp-5-12-hpc-rocky-linux-8", + "slurm-gcp-5-12-ubuntu-2004-lts", + "slurm-gcp-5-12-ubuntu-2204-lts-arm64", + "slurm-gcp-5-12-hpc-centos-7" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf index 08eb237433..ca45a0333c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5 variable "project_id" { description = "Project in which the HPC deployment will be created." @@ -96,7 +96,7 @@ variable "instance_image" { type = map(string) default = { project = "schedmd-slurm-public" - family = "slurm-gcp-5-11-hpc-centos-7" + family = "slurm-gcp-5-12-hpc-centos-7" } validation { diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md index c936d26b6e..d48e9ebbc0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md @@ -69,7 +69,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.11.1 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.0 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf index 48a4e50eb1..ed9365c4e6 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf @@ -29,7 +29,7 @@ locals { } module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.11.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.0" slurm_cluster_name = local.slurm_cluster_name enable_job_exclusive = var.exclusive diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf index 9167de48e2..d0c1ba162d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5 variable "deployment_name" { description = "Name of the deployment." diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index 5b88c2eda2..47d99a42d7 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -146,7 +146,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.11.1 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.0 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf index 145908b8e2..52e9d0a7d2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf @@ -40,7 +40,7 @@ data "google_compute_zones" "available" { } module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.11.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.0" slurm_cluster_name = local.slurm_cluster_name partition_nodes = var.node_groups diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf index c68ac1fa81..45e87da037 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5 variable "deployment_name" { description = "Name of the deployment." diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index d23146ac2e..c229e00a76 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -215,8 +215,8 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.11.1 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.11.1 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.12.0 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.0 | ## Resources @@ -260,7 +260,7 @@ limitations under the License. | [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-11-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-12-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index e549b2d6dc..bce1a83cf2 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -55,7 +55,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.11.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.12.0" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name @@ -93,7 +93,7 @@ module "slurm_controller_instance" { } module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.11.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.0" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf index a1a13a8a6b..a92bd5fc8d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf @@ -18,11 +18,11 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-5-11-debian-11", - "slurm-gcp-5-11-hpc-rocky-linux-8", - "slurm-gcp-5-11-ubuntu-2004-lts", - "slurm-gcp-5-11-ubuntu-2204-lts-arm64", - "slurm-gcp-5-11-hpc-centos-7" + "slurm-gcp-5-12-debian-11", + "slurm-gcp-5-12-hpc-rocky-linux-8", + "slurm-gcp-5-12-ubuntu-2004-lts", + "slurm-gcp-5-12-ubuntu-2204-lts-arm64", + "slurm-gcp-5-12-hpc-centos-7" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index 86e0e69aab..7dd719db17 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5 variable "access_config" { description = "Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet." @@ -557,7 +557,7 @@ variable "instance_image" { type = map(string) default = { project = "schedmd-slurm-public" - family = "slurm-gcp-5-11-hpc-centos-7" + family = "slurm-gcp-5-12-hpc-centos-7" } validation { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index ba9793fa11..ed64a21351 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -181,7 +181,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.11.1 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.12.0 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf index 8c8ddbaa47..7b8eb1171d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf @@ -28,7 +28,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.11.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.12.0" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 38d2fea7c9..3acccf33fa 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -82,8 +82,8 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.11.1 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.11.1 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.12.0 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.0 | ## Resources @@ -113,7 +113,7 @@ limitations under the License. | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm login node VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-11-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm login node VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-12-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf index 0ff74246e7..c925a1c229 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf @@ -50,7 +50,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.11.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.0" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward @@ -88,7 +88,7 @@ module "slurm_login_template" { } module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.11.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.12.0" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf index a1a13a8a6b..a92bd5fc8d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf @@ -18,11 +18,11 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-5-11-debian-11", - "slurm-gcp-5-11-hpc-rocky-linux-8", - "slurm-gcp-5-11-ubuntu-2004-lts", - "slurm-gcp-5-11-ubuntu-2204-lts-arm64", - "slurm-gcp-5-11-hpc-centos-7" + "slurm-gcp-5-12-debian-11", + "slurm-gcp-5-12-hpc-rocky-linux-8", + "slurm-gcp-5-12-ubuntu-2004-lts", + "slurm-gcp-5-12-ubuntu-2204-lts-arm64", + "slurm-gcp-5-12-hpc-centos-7" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf index 90be6b467d..54749e9e8e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1 +# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5 variable "project_id" { type = string @@ -296,7 +296,7 @@ variable "instance_image" { type = map(string) default = { project = "schedmd-slurm-public" - family = "slurm-gcp-5-11-hpc-centos-7" + family = "slurm-gcp-5-12-hpc-centos-7" } validation { diff --git a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md index e7de6c1e44..3127520def 100644 --- a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md +++ b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md @@ -22,7 +22,7 @@ for use with an on-premise slurm-cluster. > further testing is done, documentation on applying the hybrid module to > on-premise slurm clusters will be added and expanded. 
-[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 ## Definitions diff --git a/docs/hybrid-slurm-cluster/deploy-instructions.md b/docs/hybrid-slurm-cluster/deploy-instructions.md index 79e6bfbcb1..d2ece80fce 100644 --- a/docs/hybrid-slurm-cluster/deploy-instructions.md +++ b/docs/hybrid-slurm-cluster/deploy-instructions.md @@ -264,8 +264,8 @@ sudo systemctl restart slurmctld If the restart did not succeed, the logs at `/var/log/slurm/slurmctld.log` should point you in the right direction. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1 -[slurm-gcp-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.11.1/docs/hybrid.md +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 +[slurm-gcp-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/docs/hybrid.md [demo-with-cloud-controller-instructions.md]: ./demo-with-cloud-controller-instructions.md ## Validate the Hybrid Cluster diff --git a/docs/hybrid-slurm-cluster/on-prem-instructions.md b/docs/hybrid-slurm-cluster/on-prem-instructions.md index 6a6300f57f..91505dc401 100644 --- a/docs/hybrid-slurm-cluster/on-prem-instructions.md +++ b/docs/hybrid-slurm-cluster/on-prem-instructions.md @@ -39,9 +39,9 @@ detail, as well as how to customize many of these assumptions to fit your needs. deployments in their [hybrid.md] documentation. [hybridmodule]: ../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 [slurm\_controller\_hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/master/terraform/slurm_cluster/modules/slurm_controller_hybrid -[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.11.1/docs/hybrid.md +[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/docs/hybrid.md ### NFS Mounts @@ -224,7 +224,7 @@ image created with slurm 21.08.8: node_count_dynamic_max: 20 instance_image: project: $(vars.project_id) - family: slurm-gcp-5-11-hpc-centos-7 + family: slurm-gcp-5-12-hpc-centos-7 - id: compute-partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition @@ -235,12 +235,12 @@ image created with slurm 21.08.8: partition_name: compute ``` -[slurmgcppacker]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1/packer -[example.pkrvars.hcl]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1/packer/example.pkrvars.hcl -[slurmversion]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.11.1/packer/variables.pkr.hcl#L97 -[`service_account_scopes`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.11.1/packer/variables.pkr.hcl#L166 -[`munge_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.11.1/ansible/roles/munge/defaults/main.yml#L17 -[`slurm_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.11.1/ansible/roles/slurm/defaults/main.yml#L31 +[slurmgcppacker]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/packer +[example.pkrvars.hcl]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/packer/example.pkrvars.hcl +[slurmversion]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/packer/variables.pkr.hcl#L97 +[`service_account_scopes`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/packer/variables.pkr.hcl#L166 +[`munge_user`]: 
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/ansible/roles/munge/defaults/main.yml#L17 +[`slurm_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/ansible/roles/slurm/defaults/main.yml#L31 ## On Premise Setup diff --git a/docs/image-building.md b/docs/image-building.md index 78d1014cec..488c1b19ef 100644 --- a/docs/image-building.md +++ b/docs/image-building.md @@ -168,7 +168,7 @@ deployment_groups: - group: packer modules: - id: custom-image - source: github.com/GoogleCloudPlatform/slurm-gcp//packer?ref=5.11.1&depth=1 + source: github.com/GoogleCloudPlatform/slurm-gcp//packer?ref=5.12.0&depth=1 kind: packer settings: use_iap: true diff --git a/examples/cae/cae-slurm-v5-legacy.yaml b/examples/cae/cae-slurm-v5-legacy.yaml index f6ef73ddef..e1a5411252 100644 --- a/examples/cae/cae-slurm-v5-legacy.yaml +++ b/examples/cae/cae-slurm-v5-legacy.yaml @@ -40,10 +40,10 @@ vars: # for a list of valid family options with Slurm; note: the image types for the compute nodes # and the Chrome Remote Desktop (CRD) need to have the same Slurm base. instance_image: - family: slurm-gcp-5-11-hpc-centos-7 + family: slurm-gcp-5-12-hpc-centos-7 project: schedmd-slurm-public crd_instance_image: - family: slurm-gcp-5-11-debian-11 # must be Debian for CRD + family: slurm-gcp-5-12-debian-11 # must be Debian for CRD project: schedmd-slurm-public # Documentation for each of the modules used below can be found at diff --git a/examples/hpc-enterprise-slurm-v5-legacy.yaml b/examples/hpc-enterprise-slurm-v5-legacy.yaml index 6b24a70a06..7c79b818ec 100644 --- a/examples/hpc-enterprise-slurm-v5-legacy.yaml +++ b/examples/hpc-enterprise-slurm-v5-legacy.yaml @@ -25,7 +25,7 @@ vars: slurm_image: # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm - family: slurm-gcp-5-11-hpc-centos-7 + family: slurm-gcp-5-12-hpc-centos-7 project: schedmd-slurm-public # If image above is changed to use custom image, then setting below must be set to true instance_image_custom: false diff --git a/examples/image-builder-v5-legacy.yaml b/examples/image-builder-v5-legacy.yaml index 0c7f9e73d9..c48627f85d 100644 --- a/examples/image-builder-v5-legacy.yaml +++ b/examples/image-builder-v5-legacy.yaml @@ -60,7 +60,7 @@ deployment_groups: settings: source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-5-11-hpc-centos-7 + source_image_family: slurm-gcp-5-12-hpc-centos-7 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size) diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index 46f9ae59db..6063b2eea8 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -92,7 +92,7 @@ deployment_groups: set -e -o pipefail ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 5.11.1 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 5.12.0 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/ml-slurm-v5-legacy.yaml 
b/examples/ml-slurm-v5-legacy.yaml index ebecd1ccbd..6c0fb8aa30 100644 --- a/examples/ml-slurm-v5-legacy.yaml +++ b/examples/ml-slurm-v5-legacy.yaml @@ -171,7 +171,7 @@ deployment_groups: omit_external_ip: false source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-5-11-debian-11 + source_image_family: slurm-gcp-5-12-debian-11 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size_gb) diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-slurm-v5-legacy.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-slurm-v5-legacy.yaml index a51d7f0899..e7e9d5e09e 100644 --- a/tools/cloud-build/daily-tests/blueprints/lustre-slurm-v5-legacy.yaml +++ b/tools/cloud-build/daily-tests/blueprints/lustre-slurm-v5-legacy.yaml @@ -27,10 +27,10 @@ vars: # on_host_maintenance: MIGRATE num_nodes: 1 centos_image: - family: slurm-gcp-5-11-hpc-centos-7 + family: slurm-gcp-5-12-hpc-centos-7 project: schedmd-slurm-public rocky_image: - family: slurm-gcp-5-11-hpc-rocky-linux-8 + family: slurm-gcp-5-12-hpc-rocky-linux-8 project: schedmd-slurm-public deployment_groups: @@ -81,7 +81,7 @@ deployment_groups: # settings: # node_count_dynamic_max: $(vars.num_nodes) # instance_image: - # family: slurm-gcp-5-11-ubuntu-2004-lts + # family: slurm-gcp-5-12-ubuntu-2004-lts # project: schedmd-slurm-public # - id: ubuntu_partition diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml index 04c3a4ecec..002730c1ae 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v5-debian.yml @@ -22,7 +22,7 @@ slurm_cluster_name: "debiv5{{ build[0:4] }}" cli_deployment_vars: network_name: "{{ network }}" - instance_image: "{family: slurm-gcp-5-11-debian-11, project: schedmd-slurm-public}" + instance_image: "{family: slurm-gcp-5-12-debian-11, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml index f78dfea24b..7120a56973 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v5-rocky8.yml @@ -22,7 +22,7 @@ slurm_cluster_name: "rock8{{ build[0:5] }}" cli_deployment_vars: network_name: "{{ network }}" - instance_image: "{family: slurm-gcp-5-11-hpc-rocky-linux-8, project: schedmd-slurm-public}" + instance_image: "{family: slurm-gcp-5-12-hpc-rocky-linux-8, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c diff --git a/tools/validate_configs/test_configs/node-groups-v5-legacy.yaml b/tools/validate_configs/test_configs/node-groups-v5-legacy.yaml index de6747b481..9dcd1332bc 100644 --- a/tools/validate_configs/test_configs/node-groups-v5-legacy.yaml +++ b/tools/validate_configs/test_configs/node-groups-v5-legacy.yaml @@ -61,7 +61,7 @@ deployment_groups: name: c30 machine_type: c2-standard-30 instance_image: - family: slurm-gcp-5-11-debian-11 + family: slurm-gcp-5-12-debian-11 project: schedmd-slurm-public instance_image_custom: true @@ -80,7 +80,7 @@ deployment_groups: name: cd112 machine_type: c2d-standard-112 instance_image: - family: slurm-gcp-5-11-hpc-centos-7 + family: slurm-gcp-5-12-hpc-centos-7 project: schedmd-slurm-public 
instance_image_custom: true enable_smt: true diff --git a/tools/validate_configs/test_configs/slurm-static-test-v5-legacy.yaml b/tools/validate_configs/test_configs/slurm-static-test-v5-legacy.yaml index 31a08e5626..46e7eb37f0 100644 --- a/tools/validate_configs/test_configs/slurm-static-test-v5-legacy.yaml +++ b/tools/validate_configs/test_configs/slurm-static-test-v5-legacy.yaml @@ -25,10 +25,10 @@ vars: instance_image: # Please refer to the following link for the latest images: # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - # family: slurm-gcp-5-11-ubuntu-2004-lts - # family: slurm-gcp-5-11-hpc-centos-7 - family: slurm-gcp-5-11-hpc-rocky-linux-8 - # family: slurm-gcp-5-11-debian-11 + # family: slurm-gcp-5-12-ubuntu-2004-lts + # family: slurm-gcp-5-12-hpc-centos-7 + family: slurm-gcp-5-12-hpc-rocky-linux-8 + # family: slurm-gcp-5-12-debian-11 project: schedmd-slurm-public instance_image_custom: true enable_reconfigure: true From 8445a982600c4ecfb88c3fe748b66771c1a5d5a0 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 31 Jul 2024 20:17:22 +0000 Subject: [PATCH 081/118] Stop using SlurmgGCP nodeset module --- .../schedmd-slurm-gcp-v6-nodeset/main.tf | 8 +- .../schedmd-slurm-gcp-v6-partition/README.md | 2 +- .../variables.tf | 103 ++---------------- .../schedmd-slurm-gcp-v6-controller/README.md | 3 +- .../modules/slurm_files/main.tf | 21 ++-- .../modules/slurm_files/variables.tf | 2 +- .../partition.tf | 35 +++--- .../slurm_files.tf | 9 +- .../variables.tf | 6 +- 9 files changed, 50 insertions(+), 139 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index be62e6da1f..ff66192090 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -91,10 +91,12 @@ locals { reservation_name = local.reservation_name maintenance_interval = var.maintenance_interval - zones = toset(concat([var.zone], tolist(var.zones))) zone_target_shape = var.zone_target_shape - startup_script = local.ghpc_startup_script - network_storage = var.network_storage + zone_policy_allow = toset(concat([var.zone], tolist(var.zones))) + zone_policy_deny = toset([]) + + startup_script = local.ghpc_startup_script + network_storage = var.network_storage } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md index 08cfb3f5ac..33beb3418f 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md @@ -85,7 +85,7 @@ No resources. | [exclusive](#input\_exclusive) | Exclusive job access to nodes. | `bool` | `true` | no | | [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.
If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no | | [network\_storage](#input\_network\_storage) | DEPRECATED |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
})), [])
network_tier = optional(string, "STANDARD")
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
subnetwork_self_link = string
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | A list of nodesets.
For type definition see community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf::nodeset | `list(any)` | `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf index 88f9ba262d..807ad3847e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf @@ -47,104 +47,15 @@ variable "exclusive" { } variable "nodeset" { - description = "Define nodesets, as a list." - type = list(object({ - node_count_static = optional(number, 0) - node_count_dynamic_max = optional(number, 1) - node_conf = optional(map(string), {}) - nodeset_name = string - additional_disks = optional(list(object({ - disk_name = optional(string) - device_name = optional(string) - disk_size_gb = optional(number) - disk_type = optional(string) - disk_labels = optional(map(string), {}) - auto_delete = optional(bool, true) - boot = optional(bool, false) - })), []) - bandwidth_tier = optional(string, "platform_default") - can_ip_forward = optional(bool, false) - disable_smt = optional(bool, false) - disk_auto_delete = optional(bool, true) - disk_labels = optional(map(string), {}) - disk_size_gb = optional(number) - disk_type = optional(string) - enable_confidential_vm = optional(bool, false) - enable_placement = optional(bool, false) - enable_oslogin = optional(bool, true) - enable_shielded_vm = optional(bool, false) - gpu = optional(object({ - count = number - type = string - })) - instance_template = optional(string) - labels = optional(map(string), {}) - machine_type = optional(string) - maintenance_interval = optional(string) - metadata = optional(map(string), {}) - min_cpu_platform = optional(string) - network_storage = optional(list(object({ - server_ip = string - remote_mount = string - local_mount = string - fs_type = string - mount_options = string - })), []) - network_tier = optional(string, "STANDARD") - on_host_maintenance = optional(string) - preemptible = optional(bool, false) - region = optional(string) - service_account = optional(object({ - email = optional(string) - scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"]) - })) - shielded_instance_config = optional(object({ - enable_integrity_monitoring = optional(bool, true) - enable_secure_boot = optional(bool, true) - enable_vtpm = optional(bool, true) - })) - source_image_family = optional(string) - source_image_project = optional(string) - source_image = optional(string) - additional_networks = optional(list(object({ - network = string - subnetwork = string - subnetwork_project = string - network_ip = string - nic_type = string - stack_type = string - queue_count = number - access_config = list(object({ - nat_ip = string - network_tier = string - })) - ipv6_access_config = list(object({ - network_tier = string - })) - alias_ip_range = list(object({ - ip_cidr_range = string - subnetwork_range_name = string - })) - }))) - access_config = optional(list(object({ - nat_ip = string - network_tier = string - }))) - subnetwork_self_link = string - spot = optional(bool, false) - tags = optional(list(string), []) - termination_action = optional(string) - zones = optional(list(string), []) - zone_target_shape = optional(string, "ANY_SINGLE_ZONE") - reservation_name = optional(string) - startup_script = optional(list(object({ - filename = string - content = string })), []) - })) - default = [] + description = <<-EOD + A list of nodesets. 
+ For type definition see community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf::nodeset + EOD + type = list(any) + default = [] validation { - condition = length(distinct([for x in var.nodeset : x.nodeset_name])) == length(var.nodeset) + condition = length(distinct(var.nodeset[*].nodeset_name)) == length(var.nodeset) error_message = "All nodesets must have a unique name." } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 775e97412f..7e130718af 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -201,7 +201,6 @@ limitations under the License. | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | | [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.0 | | [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.0 | -| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.6.0 | | [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.0 | | [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.6.0 | @@ -273,7 +272,7 @@ limitations under the License. | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
zone_policy_allow = optional(set(string), [])
zone_policy_deny = optional(set(string), [])
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index 40a38c5409..7e7f39fa0f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -66,10 +66,11 @@ locals { prolog_scripts = [for k, v in google_storage_bucket_object.prolog_scripts : k] epilog_scripts = [for k, v in google_storage_bucket_object.epilog_scripts : k] cloud_parameters = var.cloud_parameters - partitions = local.partitions - nodeset = local.nodeset - nodeset_dyn = local.nodeset_dyn - nodeset_tpu = local.nodeset_tpu + + partitions = { for p in var.partitions : p.partition_name => p } + nodeset = { for n in var.nodeset : n.nodeset_name => n } + nodeset_dyn = { for n in var.nodeset_dyn : n.nodeset_name => n } + nodeset_tpu = { for n in var.nodeset_tpu[*].nodeset : n.nodeset_name => n } # hybrid hybrid = var.enable_hybrid @@ -95,15 +96,9 @@ locals { config_yaml = "config.yaml" config_yaml_bucket = format("%s/%s", local.bucket_dir, local.config_yaml) - partitions = { for p in var.partitions[*].partition : p.partition_name => p } - - nodeset = { for n in var.nodeset[*].nodeset : n.nodeset_name => n } - nodeset_dyn = { for n in var.nodeset_dyn[*].nodeset : n.nodeset_name => n } - nodeset_tpu = { for n in var.nodeset_tpu[*].nodeset : n.nodeset_name => n } - - x_nodeset = toset([for k, v in local.nodeset : v.nodeset_name]) - x_nodeset_dyn = toset([for k, v in local.nodeset_dyn : v.nodeset_name]) - x_nodeset_tpu = toset([for k, v in local.nodeset_tpu : v.nodeset_name]) + x_nodeset = toset(var.nodeset[*].nodeset_name) + x_nodeset_dyn = toset(var.nodeset_dyn[*].nodeset_name) + x_nodeset_tpu = toset(var.nodeset_tpu[*].nodeset.nodeset_name) x_nodeset_overlap = setintersection([], local.x_nodeset, local.x_nodeset_dyn, local.x_nodeset_tpu) etc_dir = abspath("${path.module}/etc") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index 84ab6fa056..bc0a57a486 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -313,7 +313,7 @@ variable "partitions" { validation { condition = alltrue([ - for x in var.partitions[*].partition : can(regex("^[a-z](?:[a-z0-9]*)$", x.partition_name)) + for n in var.partitions[*].partition_name : can(regex("^[a-z](?:[a-z0-9]*)$", n)) ]) error_message = "Items 'partition_name' must be a match of regex '^[a-z](?:[a-z0-9]*)$'." 
} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index bd4b9486b9..6084741e97 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -24,7 +24,7 @@ locals { } # NODESET -# TODO: remove dependency on slurm-gcp repo, move to local nodeset module +# TODO: remove dependency on slurm-gcp repo, move to local template module module "slurm_nodeset_template" { source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.0" for_each = local.nodeset_map @@ -66,23 +66,22 @@ module "slurm_nodeset_template" { tags = concat([local.slurm_cluster_name], each.value.tags) } -module "slurm_nodeset" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.6.0" - for_each = local.nodeset_map - - instance_template_self_link = module.slurm_nodeset_template[each.key].self_link - - enable_placement = each.value.enable_placement - maintenance_interval = each.value.maintenance_interval - network_tier = each.value.network_tier - node_count_dynamic_max = each.value.node_count_dynamic_max - node_count_static = each.value.node_count_static - nodeset_name = each.value.nodeset_name - node_conf = each.value.node_conf - subnetwork_self_link = each.value.subnetwork_self_link - zones = each.value.zones - zone_target_shape = each.value.zone_target_shape - reservation_name = each.value.reservation_name +locals { + nodesets = [for name, ns in local.nodeset_map : { + nodeset_name = ns.nodeset_name + node_conf = ns.node_conf + instance_template = module.slurm_nodeset_template[ns.nodeset_name].self_link + node_count_dynamic_max = ns.node_count_dynamic_max + node_count_static = ns.node_count_static + subnetwork = ns.subnetwork_self_link + reservation_name = ns.reservation_name + maintenance_interval = ns.maintenance_interval + enable_placement = ns.enable_placement + network_storage = ns.network_storage + zone_target_shape = ns.zone_target_shape + zone_policy_allow = ns.zone_policy_allow + zone_policy_deny = ns.zone_policy_deny + }] } # NODESET TPU diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index b3a8e27987..c8a8eb8a1c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -176,10 +176,13 @@ module "slurm_files" { ] login_network_storage = var.login_network_storage - partitions = [for p in var.partitions : { partition : p }] - nodeset = values(module.slurm_nodeset)[*] + partitions = var.partitions + + nodeset = local.nodesets + nodeset_dyn = values(local.nodeset_dyn_map) + # Use legacy format for now nodeset_tpu = values(module.slurm_nodeset_tpu)[*] - nodeset_dyn = [for ns in values(local.nodeset_dyn_map) : { nodeset : ns }] + depends_on = [module.bucket] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 2a7cef0009..73a6951164 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -268,12 +268,14 @@ variable "nodeset" { spot = 
optional(bool, false) tags = optional(list(string), []) termination_action = optional(string) - zones = optional(list(string), []) - zone_target_shape = optional(string, "ANY_SINGLE_ZONE") reservation_name = optional(string) startup_script = optional(list(object({ filename = string content = string })), []) + + zone_target_shape = optional(string, "ANY_SINGLE_ZONE") + zone_policy_allow = optional(set(string), []) + zone_policy_deny = optional(set(string), []) })) default = [] } From 7c8fcc734d48f8f3e80177e5a67e4d7b5e1d7adb Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Wed, 31 Jul 2024 13:57:05 -0700 Subject: [PATCH 082/118] Point to the moved repo --- tools/cloud-build/provision/README.md | 2 +- tools/cloud-build/provision/pr-go-build-test.tf | 2 +- tools/cloud-build/provision/pr-ofe-test.tf | 2 +- tools/cloud-build/provision/pr-ofe.tf | 2 +- tools/cloud-build/provision/pr-tests.tf | 2 +- tools/cloud-build/provision/pr-validation.tf | 2 +- tools/cloud-build/provision/variables.tf | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/cloud-build/provision/README.md b/tools/cloud-build/provision/README.md index da2dd7acd8..946d954dd7 100644 --- a/tools/cloud-build/provision/README.md +++ b/tools/cloud-build/provision/README.md @@ -66,7 +66,7 @@ When prompted for project, use integration test project. |------|-------------|------|---------|:--------:| | [project\_id](#input\_project\_id) | GCP project ID | `string` | `"hpc-toolkit-dev"` | no | | [region](#input\_region) | GCP region | `string` | `"us-central1"` | no | -| [repo\_uri](#input\_repo\_uri) | URI of GitHub repo | `string` | `"https://github.com/GoogleCloudPlatform/hpc-toolkit"` | no | +| [repo\_uri](#input\_repo\_uri) | URI of GitHub repo | `string` | `"https://github.com/GoogleCloudPlatform/cluster-toolkit"` | no | | [zone](#input\_zone) | GCP zone | `string` | `"us-central1-c"` | no | ## Outputs diff --git a/tools/cloud-build/provision/pr-go-build-test.tf b/tools/cloud-build/provision/pr-go-build-test.tf index 9b6f642958..32da3ff0ed 100644 --- a/tools/cloud-build/provision/pr-go-build-test.tf +++ b/tools/cloud-build/provision/pr-go-build-test.tf @@ -29,7 +29,7 @@ resource "google_cloudbuild_trigger" "pr_go_build_test" { github { owner = "GoogleCloudPlatform" - name = "hpc-toolkit" + name = "cluster-toolkit" pull_request { branch = ".*" comment_control = "COMMENTS_ENABLED_FOR_EXTERNAL_CONTRIBUTORS_ONLY" diff --git a/tools/cloud-build/provision/pr-ofe-test.tf b/tools/cloud-build/provision/pr-ofe-test.tf index 04a4cb33bc..d0d4531f44 100644 --- a/tools/cloud-build/provision/pr-ofe-test.tf +++ b/tools/cloud-build/provision/pr-ofe-test.tf @@ -26,7 +26,7 @@ resource "google_cloudbuild_trigger" "pr_ofe_test" { github { owner = "GoogleCloudPlatform" - name = "hpc-toolkit" + name = "cluster-toolkit" pull_request { branch = ".*" comment_control = "COMMENTS_ENABLED_FOR_EXTERNAL_CONTRIBUTORS_ONLY" diff --git a/tools/cloud-build/provision/pr-ofe.tf b/tools/cloud-build/provision/pr-ofe.tf index 6663f265a6..32e34c210f 100644 --- a/tools/cloud-build/provision/pr-ofe.tf +++ b/tools/cloud-build/provision/pr-ofe.tf @@ -20,7 +20,7 @@ resource "google_cloudbuild_trigger" "pr_ofe_venv" { github { owner = "GoogleCloudPlatform" - name = "hpc-toolkit" + name = "cluster-toolkit" pull_request { branch = ".*" comment_control = "COMMENTS_ENABLED_FOR_EXTERNAL_CONTRIBUTORS_ONLY" diff --git a/tools/cloud-build/provision/pr-tests.tf b/tools/cloud-build/provision/pr-tests.tf index 892723f598..cd7b322fd4 100644 --- 
a/tools/cloud-build/provision/pr-tests.tf +++ b/tools/cloud-build/provision/pr-tests.tf @@ -24,7 +24,7 @@ resource "google_cloudbuild_trigger" "pr_test" { github { owner = "GoogleCloudPlatform" - name = "hpc-toolkit" + name = "cluster-toolkit" pull_request { branch = ".*" comment_control = "COMMENTS_ENABLED_FOR_EXTERNAL_CONTRIBUTORS_ONLY" diff --git a/tools/cloud-build/provision/pr-validation.tf b/tools/cloud-build/provision/pr-validation.tf index c8f3677f55..93c5eb049d 100644 --- a/tools/cloud-build/provision/pr-validation.tf +++ b/tools/cloud-build/provision/pr-validation.tf @@ -20,7 +20,7 @@ resource "google_cloudbuild_trigger" "pr_validation" { github { owner = "GoogleCloudPlatform" - name = "hpc-toolkit" + name = "cluster-toolkit" pull_request { branch = ".*" comment_control = "COMMENTS_ENABLED_FOR_EXTERNAL_CONTRIBUTORS_ONLY" diff --git a/tools/cloud-build/provision/variables.tf b/tools/cloud-build/provision/variables.tf index fba322aa71..a67fbfb77d 100644 --- a/tools/cloud-build/provision/variables.tf +++ b/tools/cloud-build/provision/variables.tf @@ -35,5 +35,5 @@ variable "zone" { variable "repo_uri" { description = "URI of GitHub repo" type = string - default = "https://github.com/GoogleCloudPlatform/hpc-toolkit" + default = "https://github.com/GoogleCloudPlatform/cluster-toolkit" } From f577018c50299fdd2e4e99f5aba7a1efcbbb6c4b Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Thu, 1 Aug 2024 10:52:08 -0700 Subject: [PATCH 083/118] Improve error handling in install-daos-client.sh script --- .../file-system/parallelstore/scripts/install-daos-client.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/file-system/parallelstore/scripts/install-daos-client.sh b/modules/file-system/parallelstore/scripts/install-daos-client.sh index 6fe3da1d41..e59f2cfeb9 100644 --- a/modules/file-system/parallelstore/scripts/install-daos-client.sh +++ b/modules/file-system/parallelstore/scripts/install-daos-client.sh @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -set -e +set -e -o pipefail # Parse access_points. for arg in "$@"; do From a2db4819302561336024f064f5f126d86bd750e1 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Thu, 1 Aug 2024 10:52:52 -0700 Subject: [PATCH 084/118] Improve error handling in pre-existing-ns install-daos-client.sh script --- .../pre-existing-network-storage/scripts/install-daos-client.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh index 6fe3da1d41..e59f2cfeb9 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh +++ b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -set -e +set -e -o pipefail # Parse access_points. 
for arg in "$@"; do

From 670cd4130a6f76c875c92924214f215c4ce0d89c Mon Sep 17 00:00:00 2001
From: Ivan Orlov
Date: Wed, 31 Jul 2024 21:01:02 +0000
Subject: [PATCH 085/118] Add `instance_properties` var to `nodeset`

---
 .../compute/schedmd-slurm-gcp-v6-nodeset/README.md | 1 +
 .../compute/schedmd-slurm-gcp-v6-nodeset/main.tf | 1 +
 .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 13 +++++++++++++
 .../schedmd-slurm-gcp-v6-controller/README.md | 2 +-
 .../modules/slurm_files/scripts/resume.py | 3 +++
 .../schedmd-slurm-gcp-v6-controller/partition.tf | 1 +
 .../schedmd-slurm-gcp-v6-controller/variables.tf | 1 +
 7 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md
index 6559121e1f..874551c5ea 100644
--- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md
@@ -175,6 +175,7 @@ No modules.
| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | +| [instance\_properties](#input\_instance\_properties) | Override the instance properties. Used to test features not supported by Slurm GCP,
recommended for advanced usage only.
See https://cloud.google.com/compute/docs/reference/rest/v1/regionInstances/bulkInsert
If any sub-field (e.g. scheduling) is set, it will override the values computed by
SlurmGCP and ignoring values of provided vars. | `any` | `null` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for compute nodes. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index ff66192090..1553866557 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -90,6 +90,7 @@ locals { termination_action = try(var.spot_instance_config.termination_action, null) reservation_name = local.reservation_name maintenance_interval = var.maintenance_interval + instance_properties = var.instance_properties zone_target_shape = var.zone_target_shape zone_policy_allow = toset(concat([var.zone], tolist(var.zones))) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 3abfdca56e..b4c0749759 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -480,3 +480,16 @@ variable "network_storage" { })) default = [] } + + +variable "instance_properties" { + description = <<-EOD + Override the instance properties. Used to test features not supported by Slurm GCP, + recommended for advanced usage only. + See https://cloud.google.com/compute/docs/reference/rest/v1/regionInstances/bulkInsert + If any sub-field (e.g. scheduling) is set, it will override the values computed by + SlurmGCP and ignoring values of provided vars. + EOD + type = any + default = null +} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 7e130718af..d258413fdd 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -272,7 +272,7 @@ limitations under the License. | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
zone_policy_allow = optional(set(string), [])
zone_policy_deny = optional(set(string), [])
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties = optional(any, null)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
zone_policy_allow = optional(set(string), [])
zone_policy_deny = optional(set(string), [])
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 242eb84a23..649228d09f 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -118,6 +118,9 @@ def instance_properties(nodeset, model, placement_group, labels=None): props.scheduling = props.scheduling or {} props.scheduling["maintenanceInterval"] = nodeset.maintenance_interval + # Override with properties explicit specified in the nodeset + props.update(nodeset.get("instance_properties", {})) + return props diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 6084741e97..23a9f414ee 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -76,6 +76,7 @@ locals { subnetwork = ns.subnetwork_self_link reservation_name = ns.reservation_name maintenance_interval = ns.maintenance_interval + instance_properties = ns.instance_properties enable_placement = ns.enable_placement network_storage = ns.network_storage zone_target_shape = ns.zone_target_shape diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 73a6951164..90377d78cb 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -213,6 +213,7 @@ variable "nodeset" { labels = optional(map(string), {}) machine_type = optional(string) maintenance_interval = optional(string) + instance_properties = optional(any, null) metadata = optional(map(string), {}) min_cpu_platform = optional(string) network_tier = optional(string, "STANDARD") From ec524e07a4e3c8312924e56378ae33fa76aac815 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Fri, 2 Aug 2024 10:27:00 +0000 Subject: [PATCH 086/118] note about H3 availability --- examples/hpc-enterprise-slurm.yaml | 1 + examples/hpc-slurm.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index bea609e2cb..21dc9e15f9 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -238,6 +238,7 @@ deployment_groups: use: [network, compute_sa] settings: node_count_dynamic_max: 16 + # Note that H3 is available in only specific zones. https://cloud.google.com/compute/docs/regions-zones machine_type: h3-standard-88 bandwidth_tier: gvnic_enabled # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_network instance_image: $(vars.slurm_image) diff --git a/examples/hpc-slurm.yaml b/examples/hpc-slurm.yaml index a905b08189..c425c041df 100644 --- a/examples/hpc-slurm.yaml +++ b/examples/hpc-slurm.yaml @@ -76,6 +76,7 @@ deployment_groups: use: [network] settings: node_count_dynamic_max: 20 + # Note that H3 is available in only specific zones. 
https://cloud.google.com/compute/docs/regions-zones machine_type: h3-standard-88 # H3 does not support pd-ssd and pd-standard # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_disks From ea81ac9cef7b5f05fa0fa5d97df02f18d7e3f88b Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 2 Aug 2024 14:36:41 +0000 Subject: [PATCH 087/118] Fix application of `nodeset.instance_properties` --- .../modules/slurm_files/scripts/resume.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 649228d09f..0f3f97c0cf 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -119,7 +119,7 @@ def instance_properties(nodeset, model, placement_group, labels=None): props.scheduling["maintenanceInterval"] = nodeset.maintenance_interval # Override with properties explicit specified in the nodeset - props.update(nodeset.get("instance_properties", {})) + props.update(nodeset.get("instance_properties") or {}) return props From 0571bb2f0e1bb9b5720cb2c94ef1804661179291 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 2 Aug 2024 14:54:51 +0000 Subject: [PATCH 088/118] Stop checking `SuspenExcStates` in `slurmsync` --- .../modules/slurm_files/scripts/slurmsync.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 993156cd7b..9d67314c68 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -167,19 +167,6 @@ def _find_tpu_node_status(nodename, state): return NodeStatus.unchanged - -def allow_power_down(state): - config = run(f"{lkp.scontrol} show config").stdout.rstrip() - m = re.search(r"SuspendExcStates\s+=\s+(?P[\w\(\)]+)", config) - if not m: - log.warning("SuspendExcStates not found in Slurm config") - return True - states = set(m.group("states").split(",")) - if "(null)" in states or bool(state & state.flags.union(state.base)): - return False - return True - - def find_node_status(nodename): """Determine node/instance status that requires action""" state = lkp.slurm_node(nodename) @@ -207,7 +194,7 @@ def find_node_status(nodename): return NodeStatus.unbacked if state.base != "DOWN" and not power_flags: return NodeStatus.unbacked - if state.base == "DOWN" and not power_flags and allow_power_down(state): + if state.base == "DOWN" and not power_flags: return NodeStatus.power_down if "POWERED_DOWN" in state.flags and lkp.is_static_node(nodename): return NodeStatus.resume From 3fec0904e811306dff34525abfa818eb4bc3c859 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 2 Aug 2024 10:09:46 -0500 Subject: [PATCH 089/118] Update Slurm-GCP 5.11.1 links to 5.12.0 --- .../schedmd-slurm-gcp-v5-controller/README.md | 14 +++++++------- .../schedmd-slurm-gcp-v5-hybrid/README.md | 12 ++++++------ .../scheduler/schedmd-slurm-gcp-v5-login/README.md | 10 +++++----- examples/README.md | 4 ++-- examples/machine-learning/a3-highgpu-8g/README.md 
| 2 +- modules/README.md | 2 +- 6 files changed, 22 insertions(+), 22 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index d15b8ac45d..7a481df2b3 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -17,14 +17,14 @@ controller for optimal performance at different scales. > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.11.1/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt > ``` -[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1 -[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. -[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.11.1/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/scripts/requirements.txt [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions [enable\_reconfigure]: #input\_enable\_reconfigure @@ -94,12 +94,12 @@ This option has some additional requirements: development environment deploying the cluster. One can use following commands: ```bash - pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.11.1/scripts/requirements.txt + pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt ``` For more information, see the [description][optdeps] of this module. -[optdeps]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1/terraform/slurm_cluster#optional +[optdeps]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster#optional ## Custom Images diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index 86060e03c1..8e899dbd3b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -38,7 +38,7 @@ manually. This will require addition configuration and verification of permissions. For more information see the [hybrid.md] documentation on [slurm-gcp]. 
-[slurm-controller-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1/terraform/slurm_cluster/modules/slurm_controller_hybrid +[slurm-controller-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_controller_hybrid > **_NOTE:_** The hybrid module requires the following dependencies to be > installed on the system deploying the module: @@ -58,15 +58,15 @@ permissions. For more information see the [hybrid.md] documentation on [pyyaml]: https://pypi.org/project/PyYAML/ [google-api-python-client]: https://pypi.org/project/google-api-python-client/ [google-cloud-pubsub]: https://pypi.org/project/google-cloud-pubsub/ -[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.11.1/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/scripts/requirements.txt ### Manual Configuration This module *does not* complete the installation of hybrid partitions on your slurm cluster. After deploying, you must follow the steps listed out in the [hybrid.md] documentation under [manual steps]. -[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.11.1/docs/hybrid.md -[manual steps]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.11.1/docs/hybrid.md#manual-configurations +[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/docs/hybrid.md +[manual steps]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/docs/hybrid.md#manual-configurations ### Example Usage The hybrid module can be added to a blueprint as follows: @@ -146,10 +146,10 @@ strongly advise only using versions 21 or 22 when using this module. Attempting to use this module with any version older than 21 may lead to unexpected results. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 [pre-existing-network-storage]: ../../../../modules/file-system/pre-existing-network-storage/ [schedmd-slurm-gcp-v5-partition]: ../../compute/schedmd-slurm-gcp-v5-partition/ -[packer templates]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1/packer +[packer templates]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/packer ## License diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index b5881bde7b..070f9a457d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). 
-[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1 -[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -46,8 +46,8 @@ The Cluster Toolkit team maintains the wrapper around the [slurm-on-gcp] terrafo modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1 -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0#slurm-on-google-cloud-platform ## License diff --git a/examples/README.md b/examples/README.md index 225fbcac9d..b10c2c7b74 100644 --- a/examples/README.md +++ b/examples/README.md @@ -217,7 +217,7 @@ the experimental badge (![experimental-badge]). > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.11.1/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt > ``` Creates a basic auto-scaling Slurm cluster with mostly default settings. The @@ -1150,7 +1150,7 @@ The blueprint contains 3 groups: > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.11.1/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt > ``` Similar to the [hpc-slurm-v5-legacy.yaml] example, but using Ubuntu 20.04 instead of CentOS 7. diff --git a/examples/machine-learning/a3-highgpu-8g/README.md b/examples/machine-learning/a3-highgpu-8g/README.md index 86aa5232be..e4a0ad6bc4 100644 --- a/examples/machine-learning/a3-highgpu-8g/README.md +++ b/examples/machine-learning/a3-highgpu-8g/README.md @@ -32,7 +32,7 @@ installing them in a Python virtual environment: python3 -m venv toolkit-a3 source toolkit-a3/bin/activate pip3 install -r \ - https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.11.1/scripts/requirements.txt + https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt ``` **Always** activate the environment before running any gcluster commands such as diff --git a/modules/README.md b/modules/README.md index e4e44f91d2..3796a66b67 100644 --- a/modules/README.md +++ b/modules/README.md @@ -227,7 +227,7 @@ Pub/Sub subscription. 
Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [schedmd-slurm-gcp-v5-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md [schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md [schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1 +[slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 [slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.8 [pbspro-client]: ../community/modules/scheduler/pbspro-client/README.md [pbspro-server]: ../community/modules/scheduler/pbspro-server/README.md From b1586c527841986964a430ce13795a95d3b81cb4 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 2 Aug 2024 20:59:24 +0000 Subject: [PATCH 090/118] SlurmGCP. Use config-independent storage client. --- .../slurm_files/scripts/tests/test_util.py | 1 - .../modules/slurm_files/scripts/util.py | 47 +++++++++---------- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index cf255669aa..f74804250a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -120,7 +120,6 @@ def test_to_hostlist_fast(names, expected): "alpha", ClientOptions(api_endpoint="https://tpu.googleapis.com/alpha/"), ), - (None, None, ClientOptions()), ], ) def test_create_client_options( diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 626c052223..b5d0beecf6 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -161,9 +161,7 @@ def universe_domain() -> str: def endpoint_version(api: ApiEndpoint) -> Optional[str]: - if api and api.value in lkp.endpoint_versions: - return lkp.endpoint_versions[api.value] - return None + return lkp.endpoint_versions.get(api.value, None) @lru_cache(maxsize=1) @@ -180,17 +178,17 @@ def get_credentials() -> Optional[service_account.Credentials]: return credentials -def create_client_options(api: Optional[ApiEndpoint] = None) -> ClientOptions: +def create_client_options(api: ApiEndpoint) -> ClientOptions: """Create client options for cloud endpoints""" ver = endpoint_version(api) ud = universe_domain() options = {} if ud and ud != DEFAULT_UNIVERSE_DOMAIN: options["universe_domain"] = ud - if api and ver: + if ver: options["api_endpoint"] = f"https://{api.value}.{ud}/{ver}/" co = ClientOptions(**options) - log.debug(f"Using ClientOptions = {co} for API: {api}") + log.debug(f"Using ClientOptions = {co} for API: {api.value}") return co logging.basicConfig(level=logging.INFO, stream=sys.stdout) @@ -275,27 +273,19 @@ def map_with_futures(func, seq): yield res -def blob_get(file, project=None): - if project is None: - project = lkp.project +def blob_get(file): uri = instance_metadata("attributes/slurm_bucket_path") bucket_name, path = 
parse_bucket_uri(uri) blob_name = f"{path}/{file}" - co = create_client_options(ApiEndpoint.STORAGE) - storage_client = storage.Client(project=project, client_options=co) - return storage_client.get_bucket(bucket_name).blob(blob_name) + return storage_client().get_bucket(bucket_name).blob(blob_name) -def blob_list(prefix="", delimiter=None, project=None): - if project is None: - project = lkp.project +def blob_list(prefix="", delimiter=None): uri = instance_metadata("attributes/slurm_bucket_path") bucket_name, path = parse_bucket_uri(uri) blob_prefix = f"{path}/{prefix}" - co = create_client_options(ApiEndpoint.STORAGE) - storage_client = storage.Client(project=project, client_options=co) # Note: The call returns a response only when the iterator is consumed. - blobs = storage_client.list_blobs( + blobs = storage_client().list_blobs( bucket_name, prefix=blob_prefix, delimiter=delimiter ) return [blob for blob in blobs] @@ -371,17 +361,14 @@ def reservation_resource_policies(reservation): return [u.split("/")[-1] for u in reservation.get("resourcePolicies", {}).values()] -def compute_service(credentials=None, user_agent=USER_AGENT, version="beta"): +def compute_service(version="beta"): """Make thread-safe compute service handle creates a new Http for each request """ - credentials = get_credentials() def build_request(http, *args, **kwargs): - new_http = httplib2.Http() - if user_agent is not None: - new_http = set_user_agent(new_http, user_agent) + new_http = set_user_agent(httplib2.Http(), USER_AGENT) if credentials is not None: new_http = google_auth_httplib2.AuthorizedHttp(credentials, http=new_http) return googleapiclient.http.HttpRequest(new_http, *args, **kwargs) @@ -401,6 +388,16 @@ def build_request(http, *args, **kwargs): discoveryServiceUrl=disc_url, ) +def storage_client() -> storage.Client: + """ + Config-independent storage client + """ + ud = universe_domain() + co = {} + if ud and ud != DEFAULT_UNIVERSE_DOMAIN: + co["universe_domain"] = ud + return storage.Client(client_options=ClientOptions(**co)) + def load_config_data(config): """load dict-like data into a config object""" @@ -823,9 +820,7 @@ def project_metadata(key): def bucket_blob_download(bucket_name, blob_name): - co = create_client_options("storage") - storage_client = storage.Client(client_options=co) - bucket = storage_client.bucket(bucket_name) + bucket = storage_client().bucket(bucket_name) blob = bucket.blob(blob_name) contents = None with tempfile.NamedTemporaryFile(mode="w+t") as tmp: From 7e359950ffa0079052e680f6a541ceec88b46f53 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 2 Aug 2024 21:14:05 +0000 Subject: [PATCH 091/118] Enable local SSD formatting solution to set POSIX permissions --- modules/scripts/startup-script/README.md | 2 +- modules/scripts/startup-script/main.tf | 1 + modules/scripts/startup-script/variables.tf | 15 +++++++++++---- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index d4d8cd87db..dbfb3f8e4c 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -298,7 +298,7 @@ No modules. | [install\_docker](#input\_install\_docker) | Install Docker command line tool and daemon. | `bool` | `false` | no | | [install\_stackdriver\_agent](#input\_install\_stackdriver\_agent) | Run Google Stackdriver Agent installation script if set to true. Preferred over ops agent for performance. 
| `bool` | `false` | no | | [labels](#input\_labels) | Labels for the created GCS bucket. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_filesystem](#input\_local\_ssd\_filesystem) | Create and mount a filesystem from local SSD disks (data will be lost if VMs are powered down without enabling migration); enable by setting mountpoint field to a valid directory path. |
object({
fs_type = optional(string, "ext4")
mountpoint = optional(string, "")
})
|
{
"fs_type": "ext4",
"mountpoint": ""
}
| no | +| [local\_ssd\_filesystem](#input\_local\_ssd\_filesystem) | Create and mount a filesystem from local SSD disks (data will be lost if VMs are powered down without enabling migration); enable by setting mountpoint field to a valid directory path. |
object({
fs_type = optional(string, "ext4")
mountpoint = optional(string, "")
permissions = optional(string, "0755")
})
|
{
"fs_type": "ext4",
"mountpoint": "",
"permissions": "0755"
}
| no | | [prepend\_ansible\_installer](#input\_prepend\_ansible\_installer) | DEPRECATED. Use `install_ansible=false` to prevent ansible installation. | `bool` | `null` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created | `string` | n/a | yes | | [region](#input\_region) | The region to deploy to | `string` | n/a | yes | diff --git a/modules/scripts/startup-script/main.tf b/modules/scripts/startup-script/main.tf index 25e403dc09..8a6c1dd6cb 100644 --- a/modules/scripts/startup-script/main.tf +++ b/modules/scripts/startup-script/main.tf @@ -107,6 +107,7 @@ locals { args = join(" ", [ "-e mountpoint=${var.local_ssd_filesystem.mountpoint}", "-e fs_type=${var.local_ssd_filesystem.fs_type}", + "-e mode=${var.local_ssd_filesystem.permissions}", ]) }, ] diff --git a/modules/scripts/startup-script/variables.tf b/modules/scripts/startup-script/variables.tf index 2b78d96c06..3975a69614 100644 --- a/modules/scripts/startup-script/variables.tf +++ b/modules/scripts/startup-script/variables.tf @@ -129,8 +129,9 @@ variable "install_docker" { variable "local_ssd_filesystem" { description = "Create and mount a filesystem from local SSD disks (data will be lost if VMs are powered down without enabling migration); enable by setting mountpoint field to a valid directory path." type = object({ - fs_type = optional(string, "ext4") - mountpoint = optional(string, "") + fs_type = optional(string, "ext4") + mountpoint = optional(string, "") + permissions = optional(string, "0755") }) validation { @@ -143,9 +144,15 @@ variable "local_ssd_filesystem" { error_message = "To enable local SSD filesystems, var.local_ssd_filesystem.mountpoint must be set to an absolute path to a mountpoint." } + validation { + condition = length(regexall("^[0-7]{3,4}$", var.local_ssd_filesystem.permissions)) > 0 + error_message = "The POSIX permissions for the mountpoint must be represented as a 3 or 4-digit octal" + } + default = { - fs_type = "ext4" - mountpoint = "" + fs_type = "ext4" + mountpoint = "" + permissions = "0755" } nullable = false From dca4b923f4d3a1bdd525f3ce265a814b46636590 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 2 Aug 2024 21:27:58 +0000 Subject: [PATCH 092/118] Move a3-highgpu-8g v5 solution to legacy subdirectory --- examples/machine-learning/a3-highgpu-8g/{ => v5-legacy}/README.md | 0 .../ml-slurm-a3-0-base-v5-legacy.yaml} | 0 .../ml-slurm-a3-1-image-v5-legacy.yaml} | 0 .../ml-slurm-a3-2-cluster-v5-legacy.yaml} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename examples/machine-learning/a3-highgpu-8g/{ => v5-legacy}/README.md (100%) rename examples/machine-learning/a3-highgpu-8g/{ml-slurm-a3-0-base.yaml => v5-legacy/ml-slurm-a3-0-base-v5-legacy.yaml} (100%) rename examples/machine-learning/a3-highgpu-8g/{ml-slurm-a3-1-image.yaml => v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml} (100%) rename examples/machine-learning/a3-highgpu-8g/{ml-slurm-a3-2-cluster.yaml => v5-legacy/ml-slurm-a3-2-cluster-v5-legacy.yaml} (100%) diff --git a/examples/machine-learning/a3-highgpu-8g/README.md b/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md similarity index 100% rename from examples/machine-learning/a3-highgpu-8g/README.md rename to examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-0-base.yaml b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-0-base-v5-legacy.yaml similarity index 100% rename from examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-0-base.yaml 
rename to examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-0-base-v5-legacy.yaml diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml similarity index 100% rename from examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml rename to examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-2-cluster-v5-legacy.yaml similarity index 100% rename from examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml rename to examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-2-cluster-v5-legacy.yaml From 04c764b3d977fdabbbb554915550f9165beaeefa Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 2 Aug 2024 21:27:58 +0000 Subject: [PATCH 093/118] Update a3-highgpu-8g README - include legacy warnings and instructions --- .../a3-highgpu-8g/v5-legacy/README.md | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md b/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md index e4a0ad6bc4..4a09fd81ed 100644 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md +++ b/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md @@ -1,5 +1,12 @@ # Objective +> [!CAUTION] +> This solution is built upon "legacy" blueprints using Slurm-GCP v5. The +> [solution using v6](../README.md) is recommended for all new deployments. +> The legacy solution is presented for customers with existing deployments. We +> recommend maintaining existing deployments with v1.37.0 of the Toolkit as this +> combination is tested nightly. + This document will guide you to successfully provisioning a Slurm cluster with a3-highgpu-8g compute nodes running NVIDIA H100 GPUs. @@ -19,7 +26,8 @@ Please follow the initial instructions for: - Installing Cluster Toolkit [dependencies][tkdeps] (Go, Terraform, Packer) - Installing the Cluster [Toolkit][tkinstall] -Verify that your release of the Cluster Toolkit is 1.31.1 or later. +Verify that your release of the Cluster Toolkit is greater than 1.31.1 and less +than or equal to 1.37.0. ```shell gcluster --version @@ -46,11 +54,11 @@ source /absolute/path/to/toolkit-a3/bin/activate The solution is split into 3 Cluster Toolkit blueprints: -1. Provision 5 VPCs (1 system network, 4 GPU networks) and 1 Filestore for -mounting `/home` across the cluster +1. Provision 1 system network and 1 Filestore instance for mounting `/home` +across the cluster. 2. Build a custom image installing Slurm in an Ubuntu 20.04 image. The image runs a kernel patched with performance enhancements for the a3-highgpu-8g VM. -3. Provision a Slurm cluster using the custom image +3. Provision 4 GPU networks and a Slurm cluster using the custom image. The 1st and 2nd blueprints should be provisioned once and rarely need further modification. This approach separates the lifecycle of a Filestore instance from @@ -189,9 +197,9 @@ size. Recall that there are 8 NVIDIA H100 GPUs per a3-highgpu-8g VM. ## Cluster creation -The blueprint `ml-slurm-a3-0-base.yaml` will create 5 VPCs (1 system, 4 GPU) -and a Filestore `/home` filesystem. Run the standard Toolkit workflow at the -command line (approx. 
5 minutes): +The blueprint `ml-slurm-a3-0-base.yaml` will create 1 system network and a +Filestore `/home` filesystem. Run the standard Toolkit workflow at the command +line (approx. 5 minutes): ```shell gcluster deploy ml-slurm-a3-0-base.yaml --auto-approve From f2b10c935af2ea01bf5c85614ee161e580ab202a Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 2 Aug 2024 21:27:58 +0000 Subject: [PATCH 094/118] Add Slurm-GCP v6 solution for provisioning a3-highgpu-8g cluster --- .../machine-learning/a3-highgpu-8g/README.md | 360 ++++++++++++++++++ .../a3-highgpu-8g/ml-slurm-a3-0-base.yaml | 61 +++ .../a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 231 +++++++++++ .../a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml | 257 +++++++++++++ .../tests/ml-a3-highgpu-slurm-cluster.yml | 2 - 5 files changed, 909 insertions(+), 2 deletions(-) create mode 100644 examples/machine-learning/a3-highgpu-8g/README.md create mode 100644 examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-0-base.yaml create mode 100644 examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml create mode 100644 examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml diff --git a/examples/machine-learning/a3-highgpu-8g/README.md b/examples/machine-learning/a3-highgpu-8g/README.md new file mode 100644 index 0000000000..38a967fbe5 --- /dev/null +++ b/examples/machine-learning/a3-highgpu-8g/README.md @@ -0,0 +1,360 @@ +# Objective + +This document will guide you to successfully provisioning a Slurm cluster with +a3-highgpu-8g compute nodes running NVIDIA H100 GPUs. + +## Before starting + +> [!IMPORTANT] +> Before beginning, submit a request to your Google Cloud representative for +> access to the Deep Learning VM Image for a3-highgpu-8g. It is currently +> available only by Private Preview request. This image contains patches that +> significantly enhance the network performance of workloads that span multiple +> a3-highgpu-8g VMs. You will use the image ID in the steps shown below. + +## Upgrading from the v5 "legacy" solution to v6 +There is no direct path for upgrading the Slurm-GCP v5 solution in-place to v6. +The recommended path requires temporarily bringing down your v5 cluster and +replacing it with the v6 solution described in this document. + +> [!NOTE] +> The `ml-slurm-a3-0-base.yaml` blueprint is identical for the "legacy" v5 and +> v6 solutions. If you are upgrading from v5 to v6, do not destroy the v5 base +> blueprint or re-deploy the v6 base blueprint. Simply copy the Filestore IP +> address as instructed below. + +We recommend using `gcluster destroy` to destroy the deployments provisioned by the +v5 legacy blueprints: + +- [Legacy v5 image building blueprint](v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml) +- [Legacy v5 cluster provisioning blueprint](v5-legacy/ml-slurm-a3-2-cluster-v5-legacy.yaml) + +Then follow the instructions below while skipping the re-deployment of the base +blueprint. + +## Required setup + +Please follow the initial instructions for: + +- Installing Cluster Toolkit [dependencies][tkdeps] (Go, Terraform, Packer) +- Installing the Cluster [Toolkit][tkinstall] + +Verify that your release of the Cluster Toolkit is 1.37.0 or later. + +```shell +gcluster --version +``` + +The solution requires several Python packages to be available. 
We recommend +installing them in a Python virtual environment: + +```shell +python3 -m venv toolkit-a3 +source toolkit-a3/bin/activate +pip3 install -r \ + https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/6.5.13/scripts/requirements.txt +``` + +**Always** activate the environment before running any gcluster commands such as +deploy or destroy. + +```shell +source /absolute/path/to/toolkit-a3/bin/activate +``` + +## Top-Level Design of Solution + +The solution is split into 3 Cluster Toolkit blueprints: + +1. Provision 1 system network and 1 Filestore instance for mounting `/home` +across the cluster. +2. Build a custom image installing Slurm in an Ubuntu 20.04 image. The image +runs a kernel patched with performance enhancements for the a3-highgpu-8g VM. +3. Provision 4 GPU networks and a Slurm cluster using the custom image. + +The 1st and 2nd blueprints should be provisioned once and rarely need further +modification. This approach separates the lifecycle of a Filestore instance from +the lifecycle of the cluster, allowing the cluster to be deleted while retaining +access to data and home directories. The 3rd cluster blueprint may be more +frequently updated and re-provisioned as discussed below. + +## First time considerations + +> [!IMPORTANT] +> These steps do not need to be repeated when a cluster is re-provisioned. They +> are initial setup steps in a project. + +Replace the values for `PROJECT_ID`, `REGION`, and `ZONE` with the project, +region, and zone in which you have an a3-highgpu-8g allocation. The value for +`BUCKET` must be unique and will be used to create a new bucket. After replacing +the values, execute them so that they automatically populate parameters in the +commands shown below. Note that each a3-highgpu-8g VM (`N_VMS`) contains 8 NVIDIA +H100 GPUs. + +```shell +export PROJECT_ID=customer-project-id +export BUCKET=customer-bucket +export REGION=customer-region +export ZONE=customer-zone +export N_VMS=32 +``` + +### Saving Terraform state +Create a bucket with versioning enabled to store Terraform state: + +```shell +gcloud storage buckets create gs://${BUCKET} --project=${PROJECT_ID} \ + --default-storage-class=STANDARD --location=${REGION} \ + --uniform-bucket-level-access +gcloud storage buckets update gs://${BUCKET} --versioning +``` + +Modify all 3 blueprints to configure the new bucket to serve as a Terraform +remote backend: + +```yaml +terraform_backend_defaults: + type: gcs + configuration: + bucket: customer-bucket # modify to bucket created above +``` + +### Set default values + +Modify the the deployment variables `project_id`, `region`, `zone`, in the +`vars` block of all 3 blueprints: + +```yaml + project_id: customer-project + region: customer-region + zone: customer-zone +``` + +### Set kernel-patched OS image + +Obtain values for `source_image_project_id` and `source_image` from your Google +Cloud representative. Set them at approximately lines 33 and 34 of +`ml-slurm-a3-1-image.yaml`. + +```yaml + source_image_project_id: source-image-project-id # use value supplied by Google Cloud staff + source_image: source-image-name # use value supplied by Google Cloud staff +``` + +### Reservation created by Google + +> [!IMPORTANT] +> If you have ***not*** received a VM reservation from Google Cloud staff, then +> skip this step and proceed to [manual reservation creation](#manual-creation-of-reservation). 
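
Before editing the blueprint, it can be worth confirming that the reservation is actually visible in your project. A minimal sanity check, assuming `reservation-name-provided-by-google` is replaced with the name supplied by Google Cloud staff, might look like:

```shell
# Hypothetical check: confirm the reservation exists and inspect its size/status.
# Replace the reservation name with the value provided by Google Cloud staff.
gcloud compute reservations describe reservation-name-provided-by-google \
  --project=${PROJECT_ID} --zone=${ZONE} \
  --format="yaml(specificReservation.count, status)"
```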
+ +Set the deployment variable `a3_reservation_name` at approximately line 38 of +`ml-slurm-a3-2-cluster.yaml` to the reservation name provided by Google. The +value for `a3_maintenance_interval` should also be set as directed by Google +staff. A common setting is `PERIODIC`, shown below, but this value must be +confirmed with Google staff. + +```yaml + # a3_reservation_name must be specified; if Google staff have provided you + # with a reservation name, use it. Otherwise supply user-created reservation. + a3_reservation_name: reservation-name-provided-by-google + # a3_maintenance_interval should be empty string by default; if Google staff + # have created a reservation, they will also provide a3_maintenance_interval + a3_maintenance_interval: PERIODIC +``` + +### Manual creation of reservation + +> [!IMPORTANT] +> If you received a VM reservation from Google Cloud staff, then skip this step +> after confirming that you followed the instructions in [reservation created by +> Google](#reservation-created-by-google). + +We recommend creating a reservation to ensure reliable access to re-create VMs +if you need to redeploy or otherwise maintain your cluster. + +```shell +gcloud compute reservations create a3-reservation-0 \ + --project=${PROJECT_ID} \ + --machine-type=a3-highgpu-8g \ + --vm-count=${N_VMS} \ + --zone=${ZONE} \ + --require-specific-reservation \ + --log-http +``` + +This reservation be must be specified when creating VMs with matching parameters +(e.g. a3-highgpu-8g VM in configured zone). If you executed the command above +without modification, you may leave `a3_reservation_name` and +`a3_maintenance_interval` at their default values in +`ml-slurm-a3-2-cluster.yaml`. Otherwise, ensure that the reservation name in the +blueprint matches the name of the user-created reservation. + +```yaml + # a3_reservation_name must be specified; if Google staff have provided you + # with a reservation name, use it. Otherwise supply user-created reservation. + a3_reservation_name: a3-reservation-0 + # a3_maintenance_interval should be empty string by default; if Google staff + # have created a reservation, they will also provide a3_maintenance_interval + a3_maintenance_interval: "" +``` + +### Set cluster size + +At approximately line 37 of `ml-slurm-a3-2-cluster.yaml`, set the static cluster +size. Recall that there are 8 NVIDIA H100 GPUs per a3-highgpu-8g VM. + +```yaml + a3_static_cluster_size: 32 +``` + +## Cluster creation + +> [!NOTE] +> The `ml-slurm-a3-0-base.yaml` blueprint is identical for the "legacy" v5 and +> v6 solutions. If you are upgrading from v5 to v6, do not destroy the v5 base +> blueprint or re-deploy the v6 base blueprint. Simply copy the Filestore IP +> address as instructed below. + +The blueprint `ml-slurm-a3-0-base.yaml` will create 1 system network and a +Filestore `/home` filesystem. Run the standard Toolkit workflow at the command +line (approx. 5 minutes): + +```shell +gcluster deploy ml-slurm-a3-0-base.yaml --auto-approve +``` + +Several values will be output to the screen. 
The output will be similar to: + +```hcl +network_name_sysnet = "sys-net" +network_storage_homefs = { + "client_install_runner" = { + "destination" = "install-nfs_home.sh" + "source" = "modules/embedded/modules/file-system/filestore/scripts/install-nfs-client.sh" + "type" = "shell" + } + "fs_type" = "nfs" + "local_mount" = "/home" + "mount_options" = "defaults,_netdev" + "mount_runner" = { + "args" = "\"10.224.153.226\" \"/nfsshare\" \"/home\" \"nfs\" \"defaults,_netdev\"" + "destination" = "mount_home.sh" + "source" = "modules/embedded/modules/file-system/filestore/scripts/mount.sh" + "type" = "shell" + } + "remote_mount" = "/nfsshare" + "server_ip" = "10.224.153.226" +} +subnetwork_name_sysnet = "sys-subnet" +``` + +Build the custom image using ml-slurm-a3-1-image.yaml and the same workflow +as above. Run at the command line: + +```shell +gcluster deploy ml-slurm-a3-1-image.yaml --auto-approve +``` + +The image will take approximately 30 minutes to build. + +> [!IMPORTANT] +> You must modify `ml-slurm-a3-2-cluster.yaml` to update the IP address of the +> Filestore instance for `/home`. Your IP address will differ from that shown +> below and must match the output from deploying the base blueprint above: +> +> ```yaml +> server_ip_homefs: 10.224.153.226 +> ``` + +Provision the cluster blueprint (approximately 5-10 minutes): + +```shell +gcluster deploy ml-slurm-a3-2-cluster.yaml --auto-approve +``` + +## Receive Data Path Manager (RxDM) + +To achieve optimal application performance, an additional service called the +"Receive Data Path Manager" (RxDM) must run with the same lifetime as the job. +Additionally, a NCCL plugin must be installed into the execution environment of +the workload. Both the RxDM and plugin are distributed by Docker container +images. + +This blueprint includes a Slurm "Prolog" and "Epilog" script that will run +before and after every job running on more than 1 a3-highgpu-8g compute node. +The Prolog will perform the following actions: + +- Install the NCCL plugin into /var/lib of the host +- Run the RxDM service + - This is a long-lived service that runs alongside the job + - Mounts `/var/lib/nvidia/lib64` into `/usr/lib/nvidia/lib64` of the container + - Mount `/opt/tcpdirect_benchmark/` from the host into the container so that a + textproto file defining the mapping from GPU to NIC is available. This file + is present in the Deep Learning VM (DLVM) images that contain TCPDirect + patches. + - Mount `/run/tcpx-${SLURM_JOB_ID}` from the container into the host. This is + set to the environment variables `${UDS_PATH}` in the script. This directory + contains Unix socket files that implement a TCPx interface available to the + user workload at `${UDS_PATH}`. The job must be configured to be aware of this + path using `NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX` environment variable! + +The Epilog will + +- Stop the RxDM service +- Prune any stopped containers (freeing up disk space) +- Remove the directory at `${UDS_PATH}` + +## Jobs using the RxDM / TCPx + +Jobs that are running across multiple a3-highgpu-8g VMs will benefit from using +the RxDM and the NCCL plugin. An example containerized job is located at +`/opt/apps/scripts/run-nccl-tests.sh`. 
In addition to setting standard NCCL +configuration values, a job must: + +- Set `NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX` to `${UDS_PATH}` +- Set the `LD_LIBRARY_PATH` to include `/var/lib/tcpx/lib64` and `/usr/local/nvidia/lib64` + +If job is containerized + +- Mount `${UDS_PATH}` into the container at the same path +- Mount `/var/lib/tcpx/lib64` to `/var/lib/tcpx/lib64` in the container (to make the + NCCL plugin available) +- Paths can be modified if `LD_LIBRARY_PATH` is likewise modified + +## Example workload (NCCL benchmark) + +The example workload below demonstrates the pattern recommended in Activating +the Receive Data Path Manager during jobs while running the standard nccl-tests +benchmark. It assumes the availability of a GPU/NIC topology file at +`/opt/tcpdirect_benchmark/gpu_rxq_configuration.textproto`. This file is built +into the DLVM images used by this solution, but may need to be provided if +using an alternative image. + +### Clone the Cluster Toolkit repository containing the NCCL benchmark + +```shell +git clone https://github.com/GoogleCloudPlatform/cluster-toolkit +cd cluster-toolkit/examples/machine-learning/a3-highgpu-8g/nccl-tests +``` + +### Import the PyTorch image from the NVIDIA Container Registry + +```shell +bash import_pytorch_container.sh +``` + +### Build NCCL + +```shell +sbatch build-nccl-tests.sh +``` + +### Run NCCL tests + +```shell +sbatch run-nccl-tests.sh +``` + +[consume]: https://cloud.google.com/compute/docs/instances/reservations-consume#consuming_instances_from_any_matching_reservation +[tkdeps]: https://cloud.google.com/cluster-toolkit/docs/setup/install-dependencies +[tkinstall]: https://github.com/GoogleCloudPlatform/cluster-toolkit/#quickstart diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-0-base.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-0-base.yaml new file mode 100644 index 0000000000..79c06980d3 --- /dev/null +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-0-base.yaml @@ -0,0 +1,61 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +blueprint_name: slurm-a3-base + +terraform_backend_defaults: + type: gcs + configuration: + bucket: customer-tf-state-bucket + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: slurm-a3-base + region: customer-region + zone: customer-zone + sys_net_range: 172.16.0.0/16 + filestore_ip_range: 192.168.0.0/29 + +deployment_groups: +- group: primary + modules: + - id: sysnet + source: modules/network/vpc + settings: + network_name: $(vars.deployment_name)-sysnet + network_address_range: $(vars.sys_net_range) + mtu: 8244 + # using explicit var.subnetworks to allow for easier addition + # of multiple system subnetworks in other regions + subnetworks: + - subnet_name: $(vars.deployment_name)-sysnet-subnet + subnet_region: $(vars.region) + new_bits: 4 + subnet_private_access: true + description: primary subnetwork in $(vars.deployment_name)-sysnet + outputs: + - network_name + - subnetwork_name + + - id: homefs + source: modules/file-system/filestore + use: + - sysnet + settings: + filestore_tier: BASIC_SSD + size_gb: 2560 + local_mount: /home + reserved_ip_range: $(vars.filestore_ip_range) + outputs: + - network_storage diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml new file mode 100644 index 0000000000..e14540f0ed --- /dev/null +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -0,0 +1,231 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +blueprint_name: slurm-a3-image + +terraform_backend_defaults: + type: gcs + configuration: + bucket: customer-tf-state-bucket + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: slurm-a3-image + region: customer-region + zone: customer-zone + disk_size: 200 + final_image_family: slurm-dlvm + network_name_system: slurm-a3-base-sysnet + subnetwork_name_system: slurm-a3-base-sysnet-subnet + slurm_cluster_name: slurm0 + source_image_project_id: source-image-project-id # use value supplied by Google Cloud staff + source_image: source-image-name # use value supplied by Google Cloud staff + +deployment_groups: +- group: build_script + modules: + - id: sysnet + source: modules/network/pre-existing-vpc + settings: + network_name: $(vars.network_name_system) + subnetwork_name: $(vars.subnetwork_name_system) + + - id: image_build_script + source: modules/scripts/startup-script + settings: + install_ansible: true + install_docker: true + enable_docker_world_writable: true + configure_ssh_host_patterns: + - 10.0.0.* + - 10.1.0.* + - 10.2.0.* + - 10.3.0.* + - $(vars.slurm_cluster_name)* + runners: + - type: shell + destination: workaround_apt_change.sh + content: | + #!/bin/bash + # this script is no longer necessary on the most recent TCPX A3 + # images, however it is included for backwards compatibility + set -e -o pipefail + rm -f /etc/apt/sources.list.d/kubernetes.list + apt-get update --allow-releaseinfo-change + - type: shell + destination: disable_dlvm_builtin_services.sh + content: | + #!/bin/bash + # many extra services are being started via /etc/rc.local; disable + # them on future boots of image + echo -e '#!/bin/bash\n/usr/bin/nvidia-persistenced --user root\nexit 0' > /etc/rc.local + # disable jupyter and notebooks-collection-agent services + systemctl stop jupyter.service notebooks-collection-agent.service + systemctl disable jupyter.service notebooks-collection-agent.service + - type: data + destination: /var/tmp/slurm_vars.json + content: | + { + "reboot": false, + "install_cuda": false, + "install_gcsfuse": true, + "install_lustre": true, + "install_ompi": true, + "monitoring_agent": "cloud-ops", + "nvidia_version": "latest", + "slurm_version": "23.11.8" + } + - type: shell + destination: install_slurm.sh + content: | + #!/bin/bash + set -e -o pipefail + ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents + ansible-pull \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.5.13 \ + -i localhost, --limit localhost --connection=local \ + -e @/var/tmp/slurm_vars.json \ + ansible/playbook.yml + # this duplicates the ulimits configuration of the HPC VM Image + - type: data + destination: /etc/security/limits.d/99-unlimited.conf + content: | + * - memlock unlimited + * - nproc unlimited + * - stack unlimited + * - nofile 1048576 + * - cpu unlimited + * - rtprio unlimited + - type: data + destination: /etc/systemd/system/slurmd.service.d/file_ulimit.conf + content: | + [Service] + LimitNOFILE=infinity + - type: data + destination: /etc/systemd/system/delay-a3.service + content: | + [Unit] + Description=Delay A3 boot until all network interfaces are routable + After=network-online.target + Wants=network-online.target + Before=google-startup-scripts.service + + [Service] + ExecCondition=/bin/bash -c '/usr/bin/curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/machine-type | grep -q "/a3-highgpu-8g$"' + ExecStart=/usr/lib/systemd/systemd-networkd-wait-online -i enp6s0 -i enp12s0 -i enp134s0 -i 
enp140s0 -o routable --timeout=120 + ExecStartPost=/bin/sleep 10 + + [Install] + WantedBy=multi-user.target + - type: shell + destination: enable_delay_a3.sh + content: | + #!/bin/bash + set -e -o pipefail + # workaround b/309016676 (systemd-resolved restarts 4 times causing DNS + # resolution failures during google-startup-scripts.service) + systemctl daemon-reload + systemctl enable delay-a3.service + - type: data + destination: /etc/enroot/enroot.conf + content: | + ENROOT_RUNTIME_PATH /mnt/localssd/${UID}/enroot/runtime + ENROOT_CACHE_PATH /mnt/localssd/${UID}/enroot/cache + ENROOT_DATA_PATH /mnt/localssd/${UID}/enroot/data + - type: ansible-local + destination: configure_gpu_monitoring.yml + content: | + --- + - name: Install NVIDIA DCGM and Configure Ops Agent + hosts: all + become: true + vars: + distribution: "{{ ansible_distribution | lower }}{{ ansible_distribution_version | replace('.','') }}" + package_url: https://developer.download.nvidia.com/compute/cuda/repos/{{ distribution }}/x86_64/cuda-keyring_1.1-1_all.deb + package_filename: /tmp/{{ package_url | basename }} + enable_ops_agent: true + enable_nvidia_dcgm: false + tasks: + - name: Download NVIDIA repository package + ansible.builtin.get_url: + url: "{{ package_url }}" + dest: "{{ package_filename }}" + - name: Install NVIDIA repository package + ansible.builtin.apt: + deb: "{{ package_filename }}" + state: present + - name: Install NVIDIA DCGM + ansible.builtin.apt: + name: + - datacenter-gpu-manager + - libnvidia-nscq-550 + update_cache: true + post_tasks: + - name: Enable Google Cloud Ops Agent + ansible.builtin.service: + name: google-cloud-ops-agent.service + state: "{{ 'started' if enable_ops_agent else 'stopped' }}" + enabled: "{{ enable_ops_agent }}" + - name: Disable NVIDIA DCGM by default (enable during boot on GPU nodes) + ansible.builtin.service: + name: nvidia-dcgm.service + state: stopped + enabled: false + - type: shell + destination: install_mdadm.sh + content: | + #!/bin/bash + # this script ensures that the mdadm package is already present when + # compute nodes boot and use MDADM to RAID local SSD disks + apt-get update + apt-get install mdadm --no-install-recommends --assume-yes + - type: shell + destination: remove_snap_gcloud.sh + content: | + #!/bin/bash + # THIS RUNNER MUST BE THE LAST RUNNER BECAUSE IT WILL BREAK GSUTIL IN + # PARENT SCRIPT OF STARTUP-SCRIPT MODULE + set -e -o pipefail + # Remove original DLVM gcloud, lxds install due to conflict with snapd and NFS + snap remove google-cloud-cli lxd + # Install key and google-cloud-cli from apt repo + GCLOUD_APT_SOURCE="/etc/apt/sources.list.d/google-cloud-sdk.list" + if [ ! -f "${GCLOUD_APT_SOURCE}" ]; then + # indentation matters in EOT below; do not blindly edit! 
+ cat < "${GCLOUD_APT_SOURCE}" + deb [signed-by=/usr/share/keyrings/cloud.google.asc] https://packages.cloud.google.com/apt cloud-sdk main + EOT + fi + curl -o /usr/share/keyrings/cloud.google.asc https://packages.cloud.google.com/apt/doc/apt-key.gpg + apt-get update + apt-get install --assume-yes google-cloud-cli + # Clean up the bash executable hash for subsequent steps using gsutil + hash -r + +- group: slurm-build + modules: + - id: slurm-image + source: modules/packer/custom-image + kind: packer + use: + - image_build_script + - sysnet + settings: + # building this image does not require a GPU-enabled VM but must *not* be + # run on a N-series VM otherwise, the "open" drivers will not install + machine_type: c2d-standard-32 + source_image_project_id: [$(vars.source_image_project_id)] + source_image: $(vars.source_image) + image_family: $(vars.final_image_family) diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml new file mode 100644 index 0000000000..364b66f10f --- /dev/null +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml @@ -0,0 +1,257 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +blueprint_name: slurm-a3-cluster + +terraform_backend_defaults: + type: gcs + configuration: + bucket: customer-tf-state-bucket # modify to be a bucket owned and writable by customer + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: slurm-a3-cluster + region: customer-region + zone: customer-zone + server_ip_homefs: 0.0.0.0 ## MUST set to IP address of Filestore instance from base deployment! + remote_mount_homefs: /nfsshare + local_mount_homefs: /home + zones: [] + disk_size_gb: 200 + instance_image: + family: slurm-dlvm + project: $(vars.project_id) + instance_image_custom: true + slurm_cluster_name: slurm0 + enable_cleanup_compute: true + a3_partition_name: a3 + a3_static_cluster_size: 32 + # a3_reservation_name must be specified; if Google staff have provided you + # with a reservation name, use it. Otherwise supply user-created reservation. + a3_reservation_name: a3-reservation-0 + # a3_maintenance_interval should be empty string by default; if Google staff + # have created a reservation, they will also provide a3_maintenance_interval + a3_maintenance_interval: "" + # network parameters must match base blueprint deployment_name! 
+ # these values are accurate if deployment_name was not modified from example + network_name_system: slurm-a3-base-sysnet + subnetwork_name_system: slurm-a3-base-sysnet-subnet + # enable Google Cloud Ops Agent logging and monitoring + enable_ops_agent: true + # enable the NVIDIA DCGM daemon and integration into Cloud Ops Agent + enable_nvidia_dcgm: true + +deployment_groups: +- group: cluster + modules: + - id: sysnet + source: modules/network/pre-existing-vpc + settings: + network_name: $(vars.network_name_system) + subnetwork_name: $(vars.subnetwork_name_system) + + - id: gpunets + source: modules/network/multivpc + settings: + global_ip_address_range: 10.0.0.0/9 + network_name_prefix: $(vars.deployment_name)-gpunet + network_count: 4 + subnetwork_cidr_suffix: 20 + + - id: homefs + source: modules/file-system/pre-existing-network-storage + settings: + server_ip: $(vars.server_ip_homefs) + remote_mount: $(vars.remote_mount_homefs) + local_mount: $(vars.local_mount_homefs) + + - id: compute_sa + source: community/modules/project/service-account + settings: + name: compute + project_roles: + - logging.logWriter + - monitoring.metricWriter + - storage.objectAdmin + + - id: debug_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: + - sysnet + - compute_sa + settings: + node_count_static: 0 + node_count_dynamic_max: 4 + machine_type: n2-standard-2 + enable_placement: false + + - id: debug_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: + - debug_nodeset + settings: + partition_name: debug + exclusive: false + + - id: a3_startup + source: modules/scripts/startup-script + settings: + # When shutting down a VM with local SSD disks, we strongly recommend the + # automatic migration of data following these instructions: + # https://cloud.google.com/compute/docs/disks/local-ssd#stop_instance + # Failure to do will result in VMs that lose data and do not automatically + # mount local SSD filesystems + local_ssd_filesystem: + mountpoint: /mnt/localssd + permissions: 1777 + runners: + - type: ansible-local + destination: enable_nvidia_dcgm.yml + content: | + --- + - name: Enable NVIDIA DCGM on GPU nodes + hosts: all + become: true + vars: + enable_ops_agent: $(vars.enable_ops_agent) + enable_nvidia_dcgm: $(vars.enable_nvidia_dcgm) + tasks: + - name: Update Ops Agent configuration + ansible.builtin.blockinfile: + path: /etc/google-cloud-ops-agent/config.yaml + insertafter: EOF + block: | + metrics: + receivers: + dcgm: + type: dcgm + service: + pipelines: + dcgm: + receivers: + - dcgm + notify: + - Restart Google Cloud Ops Agent + handlers: + - name: Restart Google Cloud Ops Agent + ansible.builtin.service: + name: google-cloud-ops-agent.service + state: "{{ 'restarted' if enable_ops_agent else 'stopped' }}" + enabled: "{{ enable_ops_agent }}" + post_tasks: + - name: Enable Google Cloud Ops Agent + ansible.builtin.service: + name: google-cloud-ops-agent.service + state: "{{ 'started' if enable_ops_agent else 'stopped' }}" + enabled: "{{ enable_ops_agent }}" + - name: Enable NVIDIA DCGM + ansible.builtin.service: + name: nvidia-dcgm.service + state: "{{ 'started' if enable_nvidia_dcgm else 'stopped' }}" + enabled: "{{ enable_nvidia_dcgm }}" + + - id: a3_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: + - sysnet + - gpunets + - compute_sa + - a3_startup + settings: + reservation_name: $(vars.a3_reservation_name) + maintenance_interval: $(vars.a3_maintenance_interval) + enable_placement: false + 
node_count_static: $(vars.a3_static_cluster_size) + node_count_dynamic_max: 0 + disk_type: pd-ssd + machine_type: a3-highgpu-8g + enable_public_ips: false + enable_smt: true + node_conf: + CoresPerSocket: 52 + ThreadsPerCore: 2 + on_host_maintenance: TERMINATE + bandwidth_tier: gvnic_enabled + + - id: a3_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: + - a3_nodeset + settings: + partition_name: $(vars.a3_partition_name) + exclusive: false + is_default: true + partition_conf: + OverSubscribe: EXCLUSIVE + + - id: controller_startup + source: modules/scripts/startup-script + settings: + runners: + - type: shell + destination: stage_scripts.sh + content: | + #!/bin/bash + curl -s --create-dirs -o /opt/apps/adm/slurm/scripts/receive-data-path-manager \ + https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/v5/tools/prologs-epilogs/receive-data-path-manager + chmod 0755 /opt/apps/adm/slurm/scripts/receive-data-path-manager + mkdir -p /opt/apps/adm/slurm/partition-$(vars.a3_partition_name)-prolog_slurmd.d + mkdir -p /opt/apps/adm/slurm/partition-$(vars.a3_partition_name)-epilog_slurmd.d + ln -s /opt/apps/adm/slurm/scripts/receive-data-path-manager /opt/apps/adm/slurm/partition-$(vars.a3_partition_name)-prolog_slurmd.d/start-rxdm.prolog_slurmd + ln -s /opt/apps/adm/slurm/scripts/receive-data-path-manager /opt/apps/adm/slurm/partition-$(vars.a3_partition_name)-epilog_slurmd.d/stop-rxdm.epilog_slurmd + - type: shell + destination: reset_enroot.sh + content: | + #!/bin/bash + # reset enroot to defaults of files under /home and running under /run + # allows basic enroot testing on login/controller nodes (reduced I/O) + rm -f /etc/enroot/enroot.conf + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [sysnet] + settings: + name_prefix: login + disk_type: pd-balanced + machine_type: c2-standard-4 + enable_login_public_ips: true + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + use: + - sysnet + - a3_partition + - debug_partition + - slurm_login + - homefs + settings: + cloud_parameters: + resume_rate: 0 + resume_timeout: 900 + suspend_rate: 0 + suspend_timeout: 600 + no_comma_params: false + tree_width: $(vars.a3_static_cluster_size) + machine_type: c2-standard-8 + disk_type: pd-balanced + slurm_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/long-prolog-slurm.conf.tpl + enable_controller_public_ips: true + enable_external_prolog_epilog: true + controller_startup_script: $(controller_startup.startup_script) + login_startup_script: | + #!/bin/bash + # reset enroot to defaults of files under /home and running under /run + # allows basic enroot testing on login node (reduced I/O) + rm -f /etc/enroot/enroot.conf diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-cluster.yml b/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-cluster.yml index 66415efa9c..ccf24d32f6 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-cluster.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-cluster.yml @@ -41,6 +41,4 @@ cli_deployment_vars: server_ip_homefs: "{{ nfs_ip }}" remote_mount_homefs: "{{ remote_mount_homefs }}" slurm_cluster_name: "{{ slurm_cluster_name }}" - disable_login_public_ips: "false" - disable_controller_public_ips: "false" a3_static_cluster_size: 2 From a227b4e26de3cb1ade49d7fe493b8905ad5b36aa Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 2 Aug 2024 21:47:14 +0000 
Subject: [PATCH 095/118] SlurmGCP. Fix multiple bugs around nodeset zones --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 1 + .../schedmd-slurm-gcp-v6-nodeset/main.tf | 24 +++++++++++++++++-- .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 7 ------ .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/scripts/resume.py | 6 ++--- .../variables.tf | 7 +++--- .../tests/hpc-enterprise-slurm.yml | 2 +- 7 files changed, 31 insertions(+), 18 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 874551c5ea..53f2e653d9 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -150,6 +150,7 @@ No modules. | [google_compute_default_service_account.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | | [google_compute_image.slurm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | | [google_compute_reservation.reservation](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_reservation) | data source | +| [google_compute_zones.available](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_zones) | data source | ## Inputs diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 1553866557..0cc41a8ad5 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -93,8 +93,8 @@ locals { instance_properties = var.instance_properties zone_target_shape = var.zone_target_shape - zone_policy_allow = toset(concat([var.zone], tolist(var.zones))) - zone_policy_deny = toset([]) + zone_policy_allow = local.zones + zone_policy_deny = local.zones_deny startup_script = local.ghpc_startup_script network_storage = var.network_storage @@ -105,6 +105,26 @@ data "google_compute_default_service_account" "default" { project = var.project_id } +locals { + zones = setunion(var.zones, [var.zone]) + zones_deny = setsubtract(data.google_compute_zones.available.names, local.zones) +} + +data "google_compute_zones" "available" { + project = var.project_id + region = var.region + + lifecycle { + postcondition { + condition = length(setsubtract(local.zones, self.names)) == 0 + error_message = <<-EOD + Invalid zones=${jsonencode(setsubtract(local.zones, self.names))} + Available zones=${jsonencode(self.names)} + EOD + } + } +} + locals { res_name_split = split("/", var.reservation_name) reservation = var.reservation_name == "" ? null : ( diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index b4c0749759..5112881884 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -361,13 +361,6 @@ variable "zones" { EOD type = set(string) default = [] - - validation { - condition = alltrue([ - for x in var.zones : length(regexall("^[a-z]+-[a-z]+[0-9]-[a-z]$", x)) > 0 - ]) - error_message = "A value in var.zones is not a valid zone (example: us-central1-f)." 
- } } variable "zone_target_shape" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index d258413fdd..62cd667789 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -272,7 +272,7 @@ limitations under the License. | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties = optional(any, null)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
zone_policy_allow = optional(set(string), [])
zone_policy_deny = optional(set(string), [])
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties = optional(any, null)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 0f3f97c0cf..abb7c5135f 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -166,7 +166,7 @@ def create_instances_request(nodes, partition_name, placement_group, job_id=None # key is instance name, value overwrites properties body.perInstanceProperties = {k: per_instance_properties(k) for k in nodes} - zones = { + body.locationPolicy.locations = { **{ f"zones/{zone}": {"preference": "ALLOW"} for zone in nodeset.zone_policy_allow or [] @@ -176,9 +176,7 @@ def create_instances_request(nodes, partition_name, placement_group, job_id=None for zone in nodeset.zone_policy_deny or [] }, } - body.locationPolicy.targetShape = cfg.zone_target_shape or "ANY_SINGLE_ZONE" - if zones: - body.locationPolicy.locations = zones + body.locationPolicy.targetShape = nodeset.zone_target_shape if lkp.cfg.enable_slurm_gcp_plugins: slurm_gcp_plugins.pre_instance_bulk_insert( diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 90377d78cb..44ed33f994 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -181,6 +181,7 @@ variable "login_nodes" { ############ variable "nodeset" { description = "Define nodesets, as a list." + # TODO: remove optional & defaults from fields, since they SHOULD be properly set by nodeset module and not here. type = list(object({ node_count_static = optional(number, 0) node_count_dynamic_max = optional(number, 1) @@ -274,9 +275,9 @@ variable "nodeset" { filename = string content = string })), []) - zone_target_shape = optional(string, "ANY_SINGLE_ZONE") - zone_policy_allow = optional(set(string), []) - zone_policy_deny = optional(set(string), []) + zone_target_shape = string + zone_policy_allow = set(string) + zone_policy_deny = set(string) })) default = [] } diff --git a/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml b/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml index fd76035c16..8a1b677fee 100644 --- a/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml @@ -23,7 +23,7 @@ cli_deployment_vars: network_name: "{{ network }}" region: europe-west1 zone: "{{ zone }}" - zones: "[europe-west1-b,europe-west1-c,europe-west1-d]" + gpu_zones: "[europe-west1-b,europe-west1-c,europe-west1-d]" workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/hpc-enterprise-slurm.yaml" network: "{{ test_name }}-net" From 7a01016a1fe886accbadd3af41bc6f3bfa7949c2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 10:03:09 +0000 Subject: [PATCH 096/118] Bump google.golang.org/api from 0.189.0 to 0.190.0 Bumps [google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.189.0 to 0.190.0. 
- [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.189.0...v0.190.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 17 ++++++++--------- go.sum | 32 ++++++++++++++++---------------- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/go.mod b/go.mod index 4688e10928..fc6d32cb19 100644 --- a/go.mod +++ b/go.mod @@ -15,7 +15,7 @@ require ( github.com/spf13/cobra v1.8.1 github.com/zclconf/go-cty v1.15.0 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa - google.golang.org/genproto v0.0.0-20240722135656-d784300faade // indirect + google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) @@ -27,11 +27,11 @@ require ( github.com/hashicorp/terraform-exec v0.21.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940 - google.golang.org/api v0.189.0 + google.golang.org/api v0.190.0 ) require ( - cloud.google.com/go/auth v0.7.2 // indirect + cloud.google.com/go/auth v0.7.3 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.3 // indirect dario.cat/mergo v1.0.0 // indirect github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect @@ -40,7 +40,7 @@ require ( github.com/felixge/httpsnoop v1.0.4 // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect - github.com/googleapis/gax-go/v2 v2.12.5 // indirect + github.com/googleapis/gax-go/v2 v2.13.0 // indirect github.com/hashicorp/terraform-json v0.22.1 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect @@ -54,14 +54,14 @@ require ( golang.org/x/sync v0.7.0 // indirect golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20240701130421-f6361c86f094 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240725223205-93522f1f2a9f // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf // indirect ) require ( cloud.google.com/go v0.115.0 // indirect cloud.google.com/go/compute/metadata v0.5.0 // indirect - cloud.google.com/go/iam v1.1.10 // indirect + cloud.google.com/go/iam v1.1.12 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect github.com/ProtonMail/go-crypto v1.1.0-alpha.2 // indirect github.com/agext/levenshtein v1.2.3 @@ -71,8 +71,7 @@ require ( github.com/emirpasic/gods v1.18.1 // indirect github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect - github.com/golang/protobuf v1.5.4 // indirect - github.com/google/s2a-go v0.1.7 // indirect + github.com/google/s2a-go v0.1.8 // indirect github.com/google/uuid v1.6.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect github.com/hashicorp/go-cleanhttp v0.5.2 // indirect diff --git a/go.sum b/go.sum index 71a7f3f5dd..c3387b4886 100644 --- a/go.sum +++ b/go.sum @@ -46,8 +46,8 @@ cloud.google.com/go/asset v1.8.0/go.mod 
h1:mUNGKhiqIdbr8X7KNayoYvyc4HbbFO9URsjby cloud.google.com/go/assuredworkloads v1.5.0/go.mod h1:n8HOZ6pff6re5KYfBXcFvSViQjDwxFkAkmUFffJRbbY= cloud.google.com/go/assuredworkloads v1.6.0/go.mod h1:yo2YOk37Yc89Rsd5QMVECvjaMKymF9OP+QXWlKXUkXw= cloud.google.com/go/assuredworkloads v1.7.0/go.mod h1:z/736/oNmtGAyU47reJgGN+KVoYoxeLBoj4XkKYscNI= -cloud.google.com/go/auth v0.7.2 h1:uiha352VrCDMXg+yoBtaD0tUF4Kv9vrtrWPYXwutnDE= -cloud.google.com/go/auth v0.7.2/go.mod h1:VEc4p5NNxycWQTMQEDQF0bd6aTMb6VgYDXEwiJJQAbs= +cloud.google.com/go/auth v0.7.3 h1:98Vr+5jMaCZ5NZk6e/uBgf60phTk/XN84r8QEWB9yjY= +cloud.google.com/go/auth v0.7.3/go.mod h1:HJtWUx1P5eqjy/f6Iq5KeytNpbAcGolPhOgyop2LlzA= cloud.google.com/go/auth/oauth2adapt v0.2.3 h1:MlxF+Pd3OmSudg/b1yZ5lJwoXCEaeedAguodky1PcKI= cloud.google.com/go/auth/oauth2adapt v0.2.3/go.mod h1:tMQXOfZzFuNuUxOypHlQEXgdfX5cuhwU+ffUuXRJE8I= cloud.google.com/go/automl v1.5.0/go.mod h1:34EjfoFGMZ5sgJ9EoLsRtdPSNZLcfflJR39VbVNS2M0= @@ -111,8 +111,8 @@ cloud.google.com/go/gkehub v0.10.0/go.mod h1:UIPwxI0DsrpsVoWpLB0stwKCP+WFVG9+y97 cloud.google.com/go/grafeas v0.2.0/go.mod h1:KhxgtF2hb0P191HlY5besjYm6MqTSTj3LSI+M+ByZHc= cloud.google.com/go/iam v0.3.0/go.mod h1:XzJPvDayI+9zsASAFO68Hk07u3z+f+JrT2xXNdp4bnY= cloud.google.com/go/iam v0.5.0/go.mod h1:wPU9Vt0P4UmCux7mqtRu6jcpPAb74cP1fh50J3QpkUc= -cloud.google.com/go/iam v1.1.10 h1:ZSAr64oEhQSClwBL670MsJAW5/RLiC6kfw3Bqmd5ZDI= -cloud.google.com/go/iam v1.1.10/go.mod h1:iEgMq62sg8zx446GCaijmA2Miwg5o3UbO+nI47WHJps= +cloud.google.com/go/iam v1.1.12 h1:JixGLimRrNGcxvJEQ8+clfLxPlbeZA6MuRJ+qJNQ5Xw= +cloud.google.com/go/iam v1.1.12/go.mod h1:9LDX8J7dN5YRyzVHxwQzrQs9opFFqn0Mxs9nAeB+Hhg= cloud.google.com/go/language v1.4.0/go.mod h1:F9dRpNFQmJbkaop6g0JhSBXCNlO90e1KWx5iDdxbWic= cloud.google.com/go/language v1.6.0/go.mod h1:6dJ8t3B+lUYfStgls25GusK04NLh3eDLQnWM3mdEbhI= cloud.google.com/go/lifesciences v0.5.0/go.mod h1:3oIKy8ycWGPUyZDR/8RNnTOYevhaMLqh5vLUXs9zvT8= @@ -353,8 +353,8 @@ github.com/google/pprof v0.0.0-20210601050228-01bbb1931b22/go.mod h1:kpwsk12EmLe github.com/google/pprof v0.0.0-20210609004039-a478d1d731e9/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= -github.com/google/s2a-go v0.1.7 h1:60BLSyTrOV4/haCDW4zb1guZItoSq8foHCXrAnjBo/o= -github.com/google/s2a-go v0.1.7/go.mod h1:50CgR4k1jNlWBu4UfS4AcfhVe1r6pdZPygJ3R8F0Qdw= +github.com/google/s2a-go v0.1.8 h1:zZDs9gcbt9ZPLV0ndSyQk6Kacx2g/X+SKYovpnz3SMM= +github.com/google/s2a-go v0.1.8/go.mod h1:6iNWHTpQ+nfNRN5E00MSdfDwVesa8hhS32PhPO8deJA= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= @@ -373,8 +373,8 @@ github.com/googleapis/gax-go/v2 v2.3.0/go.mod h1:b8LNqSzNabLiUpXKkY7HAR5jr6bIT99 github.com/googleapis/gax-go/v2 v2.4.0/go.mod h1:XOTVJ59hdnfJLIP/dh8n5CGryZR2LxK9wbMD5+iXC6c= github.com/googleapis/gax-go/v2 v2.5.1/go.mod h1:h6B0KMMFNtI2ddbGJn3T3ZbwkeT6yqEF02fYlzkUCyo= github.com/googleapis/gax-go/v2 v2.6.0/go.mod h1:1mjbznJAPHFpesgE5ucqfYEscaz5kMdcIDwU/6+DDoY= -github.com/googleapis/gax-go/v2 v2.12.5 h1:8gw9KZK8TiVKB6q3zHY3SBzLnrGp6HQjyfYBYGmXdxA= -github.com/googleapis/gax-go/v2 v2.12.5/go.mod h1:BUDKcWo+RaKq5SC9vVYL0wLADa3VcfswbOMMRmB9H3E= +github.com/googleapis/gax-go/v2 v2.13.0 
h1:yitjD5f7jQHhyDsnhKEBU52NdvvdSeGzlAnDPT0hH1s= +github.com/googleapis/gax-go/v2 v2.13.0/go.mod h1:Z/fvTZXF8/uw7Xu5GuslPw+bplx6SS338j1Is2S+B7A= github.com/googleapis/go-type-adapters v1.0.0/go.mod h1:zHW75FOG2aur7gAO2B+MLby+cLsWGBF62rFAi7WjWO4= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= @@ -868,8 +868,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.189.0 h1:equMo30LypAkdkLMBqfeIqtyAnlyig1JSZArl4XPwdI= -google.golang.org/api v0.189.0/go.mod h1:FLWGJKb0hb+pU2j+rJqwbnsF+ym+fQs73rbJ+KAUgy8= +google.golang.org/api v0.190.0 h1:ASM+IhLY1zljNdLu19W1jTmU6A+gMk6M46Wlur61s+Q= +google.golang.org/api v0.190.0/go.mod h1:QIr6I9iedBLnfqoD6L6Vze1UvS5Hzj5r2aUBOaZnLHo= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -978,12 +978,12 @@ google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqw google.golang.org/genproto v0.0.0-20221014173430-6e2ab493f96b/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221014213838-99cd37c6964a/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71/go.mod h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s= -google.golang.org/genproto v0.0.0-20240722135656-d784300faade h1:lKFsS7wpngDgSCeFn7MoLy+wBDQZ1UQIJD4UNM1Qvkg= -google.golang.org/genproto v0.0.0-20240722135656-d784300faade/go.mod h1:FfBgJBJg9GcpPvKIuHSZ/aE1g2ecGL74upMzGZjiGEY= -google.golang.org/genproto/googleapis/api v0.0.0-20240701130421-f6361c86f094 h1:0+ozOGcrp+Y8Aq8TLNN2Aliibms5LEzsq99ZZmAGYm0= -google.golang.org/genproto/googleapis/api v0.0.0-20240701130421-f6361c86f094/go.mod h1:fJ/e3If/Q67Mj99hin0hMhiNyCRmt6BQ2aWIJshUSJw= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade h1:oCRSWfwGXQsqlVdErcyTt4A93Y8fo0/9D4b1gnI++qo= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= +google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf h1:OqdXDEakZCVtDiZTjcxfwbHPCT11ycCEsTKesBVKvyY= +google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf/go.mod h1:mCr1K1c8kX+1iSBREvU3Juo11CB+QOEWxbRS01wWl5M= +google.golang.org/genproto/googleapis/api v0.0.0-20240725223205-93522f1f2a9f h1:b1Ln/PG8orm0SsBbHZWke8dDp2lrCD4jSmfglFpTZbk= +google.golang.org/genproto/googleapis/api v0.0.0-20240725223205-93522f1f2a9f/go.mod h1:AHT0dDg3SoMOgZGnZk29b5xTbPHMoEC8qthmBLJCpys= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf h1:liao9UHurZLtiEwBgT9LMOnKYsHze6eA6w1KQCMVN2Q= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod 
h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= From 11e8b7da11ba046f6f62f9d9897a8f6d03558b3e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 10:03:16 +0000 Subject: [PATCH 097/118] Bump golang.org/x/sys from 0.22.0 to 0.23.0 Bumps [golang.org/x/sys](https://github.com/golang/sys) from 0.22.0 to 0.23.0. - [Commits](https://github.com/golang/sys/compare/v0.22.0...v0.23.0) --- updated-dependencies: - dependency-name: golang.org/x/sys dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 4688e10928..92d60404da 100644 --- a/go.mod +++ b/go.mod @@ -98,7 +98,7 @@ require ( golang.org/x/crypto v0.25.0 // indirect golang.org/x/net v0.27.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect - golang.org/x/sys v0.22.0 + golang.org/x/sys v0.23.0 golang.org/x/text v0.16.0 // indirect google.golang.org/grpc v1.64.1 // indirect google.golang.org/protobuf v1.34.2 // indirect diff --git a/go.sum b/go.sum index 71a7f3f5dd..5e555540b4 100644 --- a/go.sum +++ b/go.sum @@ -732,8 +732,8 @@ golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= -golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM= +golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= From b7b45b3ac349f5e319ca453e097eba45810bc92a Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Wed, 31 Jul 2024 19:01:40 +0000 Subject: [PATCH 098/118] support for min_cpu_platform usage --- modules/compute/vm-instance/README.md | 1 + modules/compute/vm-instance/main.tf | 7 ++++--- modules/compute/vm-instance/variables.tf | 6 ++++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 592a463d76..b1b5b0b3c2 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -221,6 +221,7 @@ limitations under the License. | [local\_ssd\_interface](#input\_local\_ssd\_interface) | Interface to be used with local SSDs. Can be either 'NVME' or 'SCSI'. No effect unless `local_ssd_count` is also set. | `string` | `"NVME"` | no | | [machine\_type](#input\_machine\_type) | Machine type to use for the instance creation | `string` | `"c2-standard-60"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | +| [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | | [name\_prefix](#input\_name\_prefix) | An optional name for all VM and disk resources.
If not supplied, `deployment_name` will be used.
When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set,
then resources are named by "<`deployment_name`>-<`name_prefix`>-<#>". | `string` | `null` | no | | [network\_interfaces](#input\_network\_interfaces) | A list of network interfaces. The options match that of the terraform
network\_interface block of google\_compute\_instance. For descriptions of the
subfields or more information, see the documentation:
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance#nested_network_interface

**\_NOTE:\_** If `network_interfaces` are set, `network_self_link` and
`subnetwork_self_link` will be ignored, even if they are provided through
the `use` field. `bandwidth_tier` and `disable_public_ips` also do not apply
to network interfaces defined in this variable.

Subfields:
network (string, required if subnetwork is not supplied)
subnetwork (string, required if network is not supplied)
subnetwork\_project (string, optional)
network\_ip (string, optional)
nic\_type (string, optional, choose from ["GVNIC", "VIRTIO\_NET"])
stack\_type (string, optional, choose from ["IPV4\_ONLY", "IPV4\_IPV6"])
queue\_count (number, optional)
access\_config (object, optional)
ipv6\_access\_config (object, optional)
alias\_ip\_range (list(object), optional) |
list(object({
network = string,
subnetwork = string,
subnetwork_project = string,
network_ip = string,
nic_type = string,
stack_type = string,
queue_count = number,
access_config = list(object({
nat_ip = string,
public_ptr_domain_name = string,
network_tier = string
})),
ipv6_access_config = list(object({
public_ptr_domain_name = string,
network_tier = string
})),
alias_ip_range = list(object({
ip_cidr_range = string,
subnetwork_range_name = string
}))
}))
| `[]` | no | | [network\_self\_link](#input\_network\_self\_link) | The self link of the network to attach the VM. Can use "default" for the default network. | `string` | `null` | no | diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index e58372b7de..14e26b0ba2 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -175,9 +175,10 @@ resource "google_compute_instance" "compute_vm" { depends_on = [var.network_self_link, var.network_storage] - name = "${local.resource_prefix}-${count.index}" - machine_type = var.machine_type - zone = var.zone + name = "${local.resource_prefix}-${count.index}" + min_cpu_platform = var.min_cpu_platform + machine_type = var.machine_type + zone = var.zone resource_policies = google_compute_resource_policy.placement_policy[*].self_link diff --git a/modules/compute/vm-instance/variables.tf b/modules/compute/vm-instance/variables.tf index 0a6047bc35..75a8fc5fa5 100644 --- a/modules/compute/vm-instance/variables.tf +++ b/modules/compute/vm-instance/variables.tf @@ -333,6 +333,12 @@ variable "spot" { default = false } +variable "min_cpu_platform" { + description = "The name of the minimum CPU platform that you want the instance to use." + type = string + default = null +} + variable "tags" { description = "Network tags, provided as a list" type = list(string) From 048686bac011e98f2eb1a80c8059adfaa6128292 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 5 Aug 2024 22:38:35 +0000 Subject: [PATCH 099/118] Add doc about DWS Flex Start --- docs/slurm-dws-flex.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 docs/slurm-dws-flex.md diff --git a/docs/slurm-dws-flex.md b/docs/slurm-dws-flex.md new file mode 100644 index 0000000000..889f20266c --- /dev/null +++ b/docs/slurm-dws-flex.md @@ -0,0 +1,28 @@ +# Obtaining SlurmGCP nodes with DWS Flex + +[Dynamic Workload Scheduler](https://cloud.google.com/blog/products/compute/introducing-dynamic-workload-scheduler) Flex Start mode is designed for fine-tuning models, experimentation, shorter training jobs, distillation, offline inference, and batch jobs. + +With Dynamic Workload Scheduler in Flex Start mode, you submit a GPU capacity request for your AI/ML jobs by indicating how many you need, a duration, and your preferred region. It supports capacity requests for up to seven days, with no minimum duration requirement. You can request capacity for as little as a few minutes or hours; typically, the scheduler can fulfill shorter requests more quickly than longer ones. + +In order to make use of DWS Flex Start mode with SlurmGCP, you must specify a proper set of `instance_properties` in the `schedmd-slurm-gcp-v6-nodeset` module. See the example below: + +```yaml + - id: flex_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + instance_properties: + reservationAffinity: + consumeReservationType: NO_RESERVATION + scheduling: + maxRunDuration: { seconds: $(2 * 60 * 60) } # 2 hours + onHostMaintenance: TERMINATE + instanceTerminationAction: DELETE + # the rest of the settings, e.g. node_count_static, machine_type, additional_disks, etc. +``` + +**All** fields in `instance_properties` should match provided values, except for `maxRunDuration`, which should be set to the desired duration in seconds (up to 604800 = 7 days). + +> [!WARNING] +> The use of the `instance_properties` setting directly overrides bulkInsert API parameters. 
While the documented sample +> was tested at the time of publication, it is not regression tested and may cease to work based on changes in the buildInsert API. From ebe154a5b24af612c49cc12a25aa1c1e8752610a Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 5 Aug 2024 19:08:25 -0700 Subject: [PATCH 100/118] Fix typo slurm-dws-flex.md --- docs/slurm-dws-flex.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/slurm-dws-flex.md b/docs/slurm-dws-flex.md index 889f20266c..1ed7ba4128 100644 --- a/docs/slurm-dws-flex.md +++ b/docs/slurm-dws-flex.md @@ -25,4 +25,4 @@ In order to make use of DWS Flex Start mode with SlurmGCP, you must specify a pr > [!WARNING] > The use of the `instance_properties` setting directly overrides bulkInsert API parameters. While the documented sample -> was tested at the time of publication, it is not regression tested and may cease to work based on changes in the buildInsert API. +> was tested at the time of publication, it is not regression tested and may cease to work based on changes in the bulkInsert API. From 6663fab677eda91dc7bed2a15715eaea9aa18760 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Sat, 27 Jul 2024 08:31:27 +0000 Subject: [PATCH 101/118] SlurmGCP. Rework logging --- .../slurm_files/scripts/get_tpu_vmcount.py | 4 +- .../modules/slurm_files/scripts/resume.py | 19 ++---- .../modules/slurm_files/scripts/setup.py | 26 +++----- .../scripts/setup_network_storage.py | 9 ++- .../modules/slurm_files/scripts/slurmsync.py | 17 ++--- .../modules/slurm_files/scripts/suspend.py | 15 +---- .../modules/slurm_files/scripts/util.py | 62 ++++++++----------- 7 files changed, 52 insertions(+), 100 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py index 08de35fd8a..0e6a5074ca 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py @@ -32,9 +32,7 @@ def get_vmcount_of_tpu_part(part): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter - ) + parser = argparse.ArgumentParser() parser.add_argument( "--partitions", "-p", diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index abb7c5135f..6289268cdd 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -21,7 +21,6 @@ import json import logging import os -import sys import yaml from itertools import chain from pathlib import Path @@ -46,11 +45,7 @@ import slurm_gcp_plugins - -filename = Path(__file__).name -LOGFILE = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log") - -log = logging.getLogger(filename) +log = logging.getLogger() global_resume_data = None @@ -603,7 +598,7 @@ def get_resume_file_data(): return None resume_file = Path(SLURM_RESUME_FILE) resume_json = resume_file.read_text() - if args.loglevel == logging.DEBUG: + if log.isEnabledFor(logging.DEBUG): (dirs.scripts / 
"resume_data.json").write_text(resume_json) return NSDict(json.loads(resume_json)) @@ -637,15 +632,9 @@ def main(nodelist): ) if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter - ) + parser = argparse.ArgumentParser() parser.add_argument("nodelist", help="list of nodes to resume") - - args = util.add_log_args_and_parse(parser) - util.chown_slurm(LOGFILE, mode=0o600) - util.config_root_logger(filename, level=args.loglevel, logfile=LOGFILE) - sys.excepthook = util.handle_exception + args = util.init_log_and_parse(parser) global_resume_data = get_resume_file_data() main(args.nodelist) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index f3b9b871cb..bee74a9cdf 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -20,7 +20,6 @@ import os import shutil import subprocess -import sys import stat import time from pathlib import Path @@ -43,10 +42,8 @@ setup_nfs_exports, ) -SETUP_SCRIPT = Path(__file__) -filename = SETUP_SCRIPT.name -LOGFILE = ((cfg.slurm_log_dir if cfg else ".") / SETUP_SCRIPT).with_suffix(".log") -log = logging.getLogger(filename) + +log = logging.getLogger() MOTD_HEADER = """ @@ -120,7 +117,7 @@ def end_motd(broadcast=True): def failed_motd(): """modify motd to signal that setup is failed""" - wall_msg = f"*** Slurm setup failed! Please view log: {LOGFILE} ***" + wall_msg = f"*** Slurm setup failed! Please view log: {util.get_log_path()} ***" motd_msg = MOTD_HEADER + wall_msg + "\n\n" Path("/etc/motd").write_text(motd_msg) util.run(f"wall -n '{wall_msg}'", timeout=30) @@ -332,7 +329,7 @@ def setup_controller(): if cfg.controller_secondary_disk: setup_secondary_disks() - setup_network_storage(log) + setup_network_storage() run_custom_scripts() @@ -397,7 +394,7 @@ def setup_login(): update_system_config("slurmd", sysconf) install_custom_scripts() - setup_network_storage(log) + setup_network_storage() setup_sudoers() run("systemctl restart munge") run("systemctl enable slurmd", timeout=30) @@ -439,7 +436,7 @@ def setup_compute(): install_custom_scripts() setup_nss_slurm() - setup_network_storage(log) + setup_network_storage() has_gpu = run("lspci | grep --ignore-case 'NVIDIA' | wc -l", shell=True).returncode if has_gpu: @@ -477,16 +474,9 @@ def main(): if __name__ == "__main__": - util.chown_slurm(LOGFILE, mode=0o600) - - parser = argparse.ArgumentParser( - description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter - ) + parser = argparse.ArgumentParser() parser.add_argument("--slurmd-feature", dest="slurmd_feature", help="Unused, to be removed.") - _ = util.add_log_args_and_parse(parser) - - util.config_root_logger(filename, logfile=LOGFILE) - sys.excepthook = util.handle_exception + _ = util.init_log_and_parse(parser) lkp = util.Lookup(cfg) # noqa F811 diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py index a99203e022..65e5301481 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py @@ -19,6 +19,7 @@ import sys import stat import time +import logging import shutil from pathlib import Path @@ -30,6 +31,8 @@ from more_executors import Executors, ExceptionRetryPolicy +log = logging.getLogger() + def mounts_by_local(mounts): """convert list of mounts to dict of mounts, local_mount as key""" return {str(Path(m.local_mount).resolve()): m for m in mounts} @@ -91,7 +94,7 @@ def internal_mount(mount): return separate(internal_mount, mounts) -def setup_network_storage(log): +def setup_network_storage(): """prepare network fs mounts and add them to fstab""" log.info("Set up network storage") # filter mounts into two dicts, cluster-internal and external mounts @@ -154,7 +157,7 @@ def setup_network_storage(log): f.write("\n") mount_fstab(mounts_by_local(mounts), log) - munge_mount_handler(log) + munge_mount_handler() def mount_fstab(mounts, log): @@ -189,7 +192,7 @@ def mount_path(path): raise e -def munge_mount_handler(log): +def munge_mount_handler(): if not cfg.munge_mount: log.error("Missing munge_mount in cfg") elif lkp.is_controller: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 9d67314c68..e648d6b80c 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -45,15 +45,12 @@ TPU, chunked, ) -from util import lkp, cfg, CONFIG_FILE +from util import lkp, CONFIG_FILE from suspend import delete_instances from resume import start_tpu import conf -filename = Path(__file__).name -LOGFILE = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log") - -log = logging.getLogger(filename) +log = logging.getLogger() TOT_REQ_CNT = 1000 @@ -503,14 +500,8 @@ def main(): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter - ) - - args = util.add_log_args_and_parse(parser) - util.chown_slurm(LOGFILE, mode=0o600) - util.config_root_logger(filename, level=args.loglevel, logfile=LOGFILE) - sys.excepthook = util.handle_exception + parser = argparse.ArgumentParser() + _ = util.init_log_and_parse(parser) pid_file = (Path("/tmp") / Path(__file__).name).with_suffix(".pid") with pid_file.open("w") as fp: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py index 84cb2274d8..9848e5a995 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py @@ -18,8 +18,6 @@ from typing import List import argparse import logging -import sys -from pathlib import Path import util from util import ( @@ -31,13 +29,11 @@ separate, execute_with_futures, ) -from util import lkp, cfg, TPU +from util import lkp, TPU import slurm_gcp_plugins -filename = Path(__file__).name -LOGFILE = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log") -log = logging.getLogger(filename) +log = logging.getLogger() TOT_REQ_CNT = 1000 @@ -151,11 +147,6 @@ def 
main(nodelist): description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter ) parser.add_argument("nodelist", help="list of nodes to suspend") - - args = util.add_log_args_and_parse(parser) - util.chown_slurm(LOGFILE, mode=0o600) - util.config_root_logger(filename, level=args.loglevel, logfile=LOGFILE) - log = logging.getLogger(Path(__file__).name) - sys.excepthook = util.handle_exception + args = util.init_log_and_parse(parser) main(args.nodelist) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 626c052223..37ad4b1a7a 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -193,8 +193,7 @@ def create_client_options(api: Optional[ApiEndpoint] = None) -> ClientOptions: log.debug(f"Using ClientOptions = {co} for API: {api}") return co -logging.basicConfig(level=logging.INFO, stream=sys.stdout) -log = logging.getLogger(__name__) +log = logging.getLogger() def access_secret_version(project_id, secret_id, version_id="latest"): @@ -485,12 +484,18 @@ def save_config(cfg, path): def owned_file_handler(filename): """create file handler""" - if filename is None: - return None chown_slurm(filename) return logging.handlers.WatchedFileHandler(filename, delay=True) -def add_log_args_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: +def get_log_path() -> Path: + """ + Returns path to log file for the current script. + e.g. resume.py -> /var/log/slurm/resume.log + """ + log_dir = Path(cfg.slurm_log_dir or ".") + return (log_dir / Path(sys.argv[0]).name).with_suffix(".log") + +def init_log_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: parser.add_argument( "--debug", "-d", @@ -508,27 +513,22 @@ def add_log_args_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespac ) args = parser.parse_args() + loglevel = args.loglevel if cfg.enable_debug_logging: - args.loglevel = logging.DEBUG + loglevel = logging.DEBUG if args.trace_api: cfg.extra_logging_flags["trace_api"] = True - return args - - -def config_root_logger(caller_logger, level="DEBUG", stdout=True, logfile=None): - """configure the root logger, disabling all existing loggers""" - handlers = list(compress(("stdout_handler", "file_handler"), (stdout, logfile))) - - config = { + # Configure root logger + logging.config.dictConfig({ "version": 1, "disable_existing_loggers": True, "formatters": { "standard": { - "fmt": "%(levelname)s: %(message)s", + "format": "%(levelname)s: %(message)s", }, "stamp": { - "fmt": "%(asctime)s %(levelname)s: %(message)s", + "format": "%(asctime)s %(levelname)s: %(message)s", }, }, "handlers": { @@ -542,32 +542,23 @@ def config_root_logger(caller_logger, level="DEBUG", stdout=True, logfile=None): "()": owned_file_handler, "level": logging.DEBUG, "formatter": "stamp", - "filename": logfile, + "filename": get_log_path(), }, }, "root": { - "handlers": handlers, - "level": level, + "handlers": ["stdout_handler", "file_handler"], + "level": loglevel, }, - } - if not logfile: - del config["handlers"]["file_handler"] - logging.config.dictConfig(config) - loggers = ( - __name__, - "resume", - "suspend", - "slurmsync", - "setup", - caller_logger, - ) - for logger in map(logging.getLogger, loggers): - logger.disabled = False + }) + + sys.excepthook = 
_handle_exception + + return args def log_api_request(request): """log.trace info about a compute API request""" - if not cfg.extra_logging_flags.get("trace_api", False): + if not cfg.extra_logging_flags.get("trace_api"): return # output the whole request object as pretty yaml @@ -580,9 +571,8 @@ def log_api_request(request): log.debug(f"{inspect.stack()[1].function}:\n{pretty_req}") -def handle_exception(exc_type, exc_value, exc_trace): +def _handle_exception(exc_type, exc_value, exc_trace): """log exceptions other than KeyboardInterrupt""" - # TODO does this work? if not issubclass(exc_type, KeyboardInterrupt): log.exception("Fatal exception", exc_info=(exc_type, exc_value, exc_trace)) sys.__excepthook__(exc_type, exc_value, exc_trace) From 631044e8f777f9a5221846f134ce75c5a713b211 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 6 Aug 2024 13:25:21 -0500 Subject: [PATCH 102/118] Add test for legacy v5 a3-highgpu-8g blueprint --- tools/cloud-build/provision/README.md | 2 + tools/cloud-build/provision/legacy-tests.tf | 53 +++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 tools/cloud-build/provision/legacy-tests.tf diff --git a/tools/cloud-build/provision/README.md b/tools/cloud-build/provision/README.md index 946d954dd7..1a279c774d 100644 --- a/tools/cloud-build/provision/README.md +++ b/tools/cloud-build/provision/README.md @@ -38,6 +38,7 @@ When prompted for project, use integration test project. | [daily\_project\_cleanup\_filestore\_schedule](#module\_daily\_project\_cleanup\_filestore\_schedule) | ./trigger-schedule | n/a | | [daily\_project\_cleanup\_slurm\_schedule](#module\_daily\_project\_cleanup\_slurm\_schedule) | ./trigger-schedule | n/a | | [daily\_test\_schedule](#module\_daily\_test\_schedule) | ./trigger-schedule | n/a | +| [legacy\_test\_schedule](#module\_legacy\_test\_schedule) | ./trigger-schedule | n/a | | [weekly\_build\_dependency\_check\_schedule](#module\_weekly\_build\_dependency\_check\_schedule) | ./trigger-schedule | n/a | ## Resources @@ -48,6 +49,7 @@ When prompted for project, use integration test project. 
| [google_cloudbuild_trigger.daily_project_cleanup_slurm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.daily_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.image_build_test_runner](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | +| [google_cloudbuild_trigger.legacy_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.pr_go_build_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.pr_ofe_test](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | | [google_cloudbuild_trigger.pr_ofe_venv](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloudbuild_trigger) | resource | diff --git a/tools/cloud-build/provision/legacy-tests.tf b/tools/cloud-build/provision/legacy-tests.tf new file mode 100644 index 0000000000..53bc295969 --- /dev/null +++ b/tools/cloud-build/provision/legacy-tests.tf @@ -0,0 +1,53 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + legacy_tests = [ + ["ml-a3-highgpu-slurm", "refs/tags/v1.37.1"], + ] +} + +resource "google_cloudbuild_trigger" "legacy_test" { + count = length(local.legacy_tests) + name = "LEGACY-test-${local.legacy_tests[count.index][0]}" + description = "Runs the '${local.legacy_tests[count.index][0]}' integration test against last supported release" + tags = [local.notify_chat_tag] + + git_file_source { + path = "tools/cloud-build/daily-tests/builds/${local.legacy_tests[count.index][0]}.yaml" + revision = local.legacy_tests[count.index][1] + uri = var.repo_uri + repo_type = "GITHUB" + } + + source_to_build { + uri = var.repo_uri + ref = local.legacy_tests[count.index][1] + repo_type = "GITHUB" + } + # Following fields will be auto-set by CloudBuild after creation + # Specify it explicitly to reduce discreppancy. 
+ ignored_files = [] + included_files = [] + substitutions = {} +} + +# TODO: build solution for scheduling tests in sequence when we have +# more than 1 test +module "legacy_test_schedule" { + source = "./trigger-schedule" + count = length(google_cloudbuild_trigger.legacy_test) + trigger = google_cloudbuild_trigger.legacy_test[count.index] + schedule = "30 5 * * MON-FRI" +} From 2d6f7ab851eb3d2889d4e2a11953b888c82ce989 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 6 Aug 2024 19:05:16 +0000 Subject: [PATCH 103/118] Update SlurmGCP 6.6.0 -> 6.6.1 --- .../schedmd-slurm-gcp-v6-nodeset-dynamic/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 12 ++++++------ .../schedmd-slurm-gcp-v6-controller/controller.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/partition.tf | 4 ++-- .../a3-megagpu-8g/slurm-a3mega-image.yaml | 2 +- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index d5eb02d407..96ec1ec149 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,7 +74,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.0 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.1 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index 90e42683a4..9dc55b9d9e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -54,7 +54,7 @@ data "google_compute_default_service_account" "default" { module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 62cd667789..8347c2f78b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -196,13 +196,13 @@ limitations under the License. 
|------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.0 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.0 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.1 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.1 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.0 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.0 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.0 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.6.0 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.1 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.1 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.1 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.6.1 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 2262acf718..31e186b6d4 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -36,7 +36,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.1" project_id = var.project_id region = var.region @@ -92,7 +92,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.1" access_config = 
var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 103c224046..e693dc22f8 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.1" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -57,7 +57,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.1" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 23a9f414ee..d2d85db1fd 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local template module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.1" for_each = local.nodeset_map project_id = var.project_id @@ -87,7 +87,7 @@ locals { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.6.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.6.1" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index e9572dc52c..70e760958d 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -108,7 +108,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.6.0 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.6.1 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml From c9cc9dc424a0427f4b94660c5b1810093a55f336 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Wed, 17 Jul 2024 16:50:39 -0700 Subject: [PATCH 104/118] Add `disable_automatic_updates` flag --- .../compute/htcondor-execute-point/README.md | 3 +- .../htcondor-execute-point/compute_image.tf | 30 +++++++++++++++++++ .../compute/htcondor-execute-point/main.tf | 27 +++++++---------- .../htcondor-execute-point/variables.tf | 10 +++++++ 
.../schedmd-slurm-gcp-v5-node-group/README.md | 1 + .../schedmd-slurm-gcp-v5-node-group/main.tf | 9 +++++- .../source_image_logic.tf | 6 ++++ .../variables.tf | 10 +++++++ .../README.md | 1 + .../main.tf | 9 +++++- .../source_image_logic.tf | 6 ++++ .../variables.tf | 11 +++++++ .../schedmd-slurm-gcp-v6-nodeset/README.md | 1 + .../schedmd-slurm-gcp-v6-nodeset/main.tf | 9 +++++- .../source_image_logic.tf | 6 ++++ .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 10 +++++++ .../scheduler/htcondor-access-point/README.md | 1 + .../scheduler/htcondor-access-point/main.tf | 16 ++++++++-- .../htcondor-access-point/variables.tf | 10 +++++++ .../htcondor-central-manager/README.md | 1 + .../htcondor-central-manager/main.tf | 16 ++++++++-- .../htcondor-central-manager/variables.tf | 10 +++++++ .../schedmd-slurm-gcp-v5-controller/README.md | 1 + .../schedmd-slurm-gcp-v5-controller/main.tf | 8 ++++- .../source_image_logic.tf | 6 ++++ .../variables.tf | 10 +++++++ .../schedmd-slurm-gcp-v5-login/README.md | 1 + .../schedmd-slurm-gcp-v5-login/main.tf | 11 +++++-- .../source_image_logic.tf | 6 ++++ .../schedmd-slurm-gcp-v5-login/variables.tf | 11 +++++++ .../schedmd-slurm-gcp-v6-controller/README.md | 1 + .../controller.tf | 10 ++++++- .../source_image_logic.tf | 6 ++++ .../variables_controller_instance.tf | 9 ++++++ .../schedmd-slurm-gcp-v6-login/README.md | 1 + .../schedmd-slurm-gcp-v6-login/main.tf | 9 +++++- .../source_image_logic.tf | 6 ++++ .../schedmd-slurm-gcp-v6-login/variables.tf | 10 +++++++ modules/compute/vm-instance/README.md | 3 +- modules/compute/vm-instance/compute_image.tf | 30 +++++++++++++++++++ modules/compute/vm-instance/main.tf | 16 +++++----- modules/compute/vm-instance/variables.tf | 18 ++++++++--- .../scheduler/batch-job-template/README.md | 4 +++ .../batch-job-template/compute_image.tf | 30 +++++++++++++++++++ modules/scheduler/batch-job-template/main.tf | 16 +++++++--- .../scheduler/batch-job-template/variables.tf | 10 +++++++ .../scheduler/batch-job-template/versions.tf | 4 +++ .../daily-tests/blueprints/e2e.yaml | 1 + tools/duplicate-diff.py | 5 ++++ 49 files changed, 401 insertions(+), 45 deletions(-) create mode 100644 community/modules/compute/htcondor-execute-point/compute_image.tf create mode 100644 modules/compute/vm-instance/compute_image.tf create mode 100644 modules/scheduler/batch-job-template/compute_image.tf diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 4f47ccf39c..78cdcc5ab4 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -219,7 +219,7 @@ limitations under the License. | Name | Type | |------|------| | [google_storage_bucket_object.execute_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_compute_image.htcondor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | +| [google_compute_image.compute_image](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | | [google_compute_zones.available](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_zones) | data source | ## Inputs @@ -228,6 +228,7 @@ limitations under the License. 
|------|-------------|------|---------|:--------:| | [central\_manager\_ips](#input\_central\_manager\_ips) | List of IP addresses of HTCondor Central Managers | `list(string)` | n/a | yes | | [deployment\_name](#input\_deployment\_name) | Cluster Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | +| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `100` | no | | [disk\_type](#input\_disk\_type) | Disk type for template | `string` | `"pd-balanced"` | no | | [distribution\_policy\_target\_shape](#input\_distribution\_policy\_target\_shape) | Target shape across zones for instance group managing execute points | `string` | `"ANY"` | no | diff --git a/community/modules/compute/htcondor-execute-point/compute_image.tf b/community/modules/compute/htcondor-execute-point/compute_image.tf new file mode 100644 index 0000000000..012117e65e --- /dev/null +++ b/community/modules/compute/htcondor-execute-point/compute_image.tf @@ -0,0 +1,30 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +data "google_compute_image" "compute_image" { + family = try(var.instance_image.family, null) + name = try(var.instance_image.name, null) + project = try(var.instance_image.project, null) + + lifecycle { + postcondition { + # Condition needs to check the suffix of the license, as prefix contains an API version which can change. + # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates + condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + } + } +} diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 697e446e35..175e283597 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -31,12 +31,20 @@ locals { windows_startup_ps1 = join("\n\n", flatten([var.windows_startup_ps1, local.execute_config_windows_startup_ps1])) - is_windows_image = anytrue([for l in data.google_compute_image.htcondor.licenses : length(regexall("windows-cloud", l)) > 0]) + is_windows_image = anytrue([for l in data.google_compute_image.compute_image.licenses : length(regexall("windows-cloud", l)) > 0]) windows_startup_metadata = local.is_windows_image && local.windows_startup_ps1 != "" ? { windows-startup-script-ps1 = local.windows_startup_ps1 } : {} - metadata = merge(local.windows_startup_metadata, local.network_storage_metadata, local.enable_oslogin, var.metadata) + disable_automatic_updates_metadata = var.disable_automatic_updates ? 
{ google_disable_automatic_updates = "TRUE" } : {} + + metadata = merge( + local.windows_startup_metadata, + local.network_storage_metadata, + local.enable_oslogin, + local.disable_automatic_updates_metadata, + var.metadata + ) autoscaler_runner = { "type" = "ansible-local" @@ -100,19 +108,6 @@ locals { name_prefix = "${var.deployment_name}-${var.name_prefix}-ep" } -data "google_compute_image" "htcondor" { - family = try(var.instance_image.family, null) - name = try(var.instance_image.name, null) - project = var.instance_image.project - - lifecycle { - postcondition { - condition = self.disk_size_gb <= var.disk_size_gb - error_message = "var.disk_size_gb must be set to at least the size of the image (${self.disk_size_gb})" - } - } -} - data "google_compute_zones" "available" { project = var.project_id region = var.region @@ -156,7 +151,7 @@ module "execute_point_instance_template" { preemptible = var.spot startup_script = local.is_windows_image ? null : module.startup_script.startup_script metadata = local.metadata - source_image = data.google_compute_image.htcondor.self_link + source_image = data.google_compute_image.compute_image.self_link # secure boot enable_shielded_vm = var.enable_shielded_vm diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf index dc974fc643..4124f99f93 100644 --- a/community/modules/compute/htcondor-execute-point/variables.tf +++ b/community/modules/compute/htcondor-execute-point/variables.tf @@ -99,6 +99,16 @@ variable "instance_image" { } } +variable "disable_automatic_updates" { + description = <<-EOT + If true, disables automatic updates on the created instances. This feature is only available on + supported images (or images derived from them). For more details, see + https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates + EOT + type = bool + default = false +} + variable "execute_point_service_account_email" { description = "Service account for HTCondor execute point (e-mail format)" type = string diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md index e71f638778..207b21c110 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md @@ -124,6 +124,7 @@ No modules. | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
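Note on the metadata wiring: every module touched by this patch feeds the new flag into instance metadata with the same pattern, in which the module-computed map is merged ahead of `var.metadata`, so an explicitly user-supplied `google_disable_automatic_updates` key still wins. A minimal, standalone sketch of that pattern follows (variable names mirror the patch; the output block is illustrative only and not part of any module):

# Sketch of the metadata merge used throughout this patch; needs nothing beyond core Terraform.
variable "disable_automatic_updates" {
  type    = bool
  default = false
}

variable "metadata" {
  type    = map(string)
  default = {}
}

locals {
  # The key is only present when the feature is requested; "TRUE" is the literal value the images expect.
  disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {}

  # var.metadata is merged last, so a user-provided value for the same key overrides the module default.
  metadata = merge(local.disable_automatic_updates_metadata, var.metadata)
}

output "effective_metadata" {
  value = local.metadata
}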
| `[]` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | +| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | If set to false. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf index 825f3c0a4a..f7b1f1be0d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf @@ -20,6 +20,13 @@ locals { } locals { + disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + + metadata = merge( + local.disable_automatic_updates_metadata, + var.metadata + ) + enable_public_ip_access_config = var.disable_public_ips ? [] : [{ nat_ip = null, network_tier = null }] access_config = length(var.access_config) == 0 ? local.enable_public_ip_access_config : var.access_config @@ -59,7 +66,7 @@ locals { labels = local.labels machine_type = var.machine_type maintenance_interval = var.maintenance_interval - metadata = var.metadata + metadata = local.metadata min_cpu_platform = var.min_cpu_platform on_host_maintenance = var.on_host_maintenance preemptible = var.preemptible diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf index a92bd5fc8d..86a975e9ee 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf @@ -68,5 +68,11 @@ data "google_compute_image" "slurm" { condition = var.disk_size_gb >= self.disk_size_gb error_message = "'disk_size_gb: ${var.disk_size_gb}' is smaller than the image size (${self.disk_size_gb}GB), please increase the blueprint disk size" } + postcondition { + # Condition needs to check the suffix of the license, as prefix contains an API version which can change. + # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates + condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. 
More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + } } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf index ca45a0333c..b686a69c59 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf @@ -429,3 +429,13 @@ variable "disable_public_ips" { type = bool default = true } + +variable "disable_automatic_updates" { + description = <<-EOT + If true, disables automatic updates on the created instances. This feature is only available on + supported images (or images derived from them). For more details, see + https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates + EOT + type = bool + default = false +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index d5eb02d407..b6a09e818d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -92,6 +92,7 @@ modules. For support with the underlying modules, see the instructions in the | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
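Note on the image check: the postcondition added to each `source_image_logic.tf` and `compute_image.tf` compares only the suffix of the license URL because the prefix embeds an API version (for example `compute/v1`) that can change. A small standalone sketch of the same check, using the example license value quoted in the patch comments (illustrative values; in the modules the list comes from the `licenses` attribute of the `google_compute_image` data source):

# Sketch of the license suffix check; values are examples, not live data.
locals {
  required_license_suffix = "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates"

  example_licenses = [
    "https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates",
  ]

  # endswith() ignores the changeable API-version prefix; anytrue() accepts the image if any license matches.
  image_supports_disable_updates = anytrue([
    for license in local.example_licenses : endswith(license, local.required_license_suffix)
  ])
}

Because the guard in the modules is written as `!var.disable_automatic_updates || anytrue(...)`, images without this license remain usable whenever the variable is left at its default of `false`.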
| `[]` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | +| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index 90e42683a4..d72cf34306 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -20,7 +20,14 @@ locals { locals { nodeset_name = substr(replace(var.name, "/[^a-z0-9]/", ""), 0, 14) feature = coalesce(var.feature, local.nodeset_name) - metadata = merge(var.metadata, { slurmd_feature = local.feature }) + + disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + + metadata = merge( + local.disable_automatic_updates_metadata, + { slurmd_feature = local.feature }, + var.metadata + ) nodeset = { nodeset_name = local.nodeset_name diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf index 6198e7539d..d040de355c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf @@ -68,5 +68,11 @@ data "google_compute_image" "slurm" { condition = var.disk_size_gb >= self.disk_size_gb error_message = "'disk_size_gb: ${var.disk_size_gb}' is smaller than the image size (${self.disk_size_gb}GB), please increase the blueprint disk size" } + postcondition { + # Condition needs to check the suffix of the license, as prefix contains an API version which can change. + # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates + condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + } } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf index 504fd8bad5..7c569315b6 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf @@ -99,6 +99,17 @@ variable "instance_image_custom" { default = false } + +variable "disable_automatic_updates" { + description = <<-EOT + If true, disables automatic updates on the created instances. This feature is only available on + supported images (or images derived from them). 
For more details, see + https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates + EOT + type = bool + default = false +} + variable "tags" { type = list(string) description = "Network tag list." diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 53f2e653d9..4d818278e1 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -161,6 +161,7 @@ No modules. | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
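Note on manual opt-in: since the feature ultimately reduces to a single metadata key on a supported image, a similar effect can be had through the generic `metadata` input of a module that has not yet gained the dedicated variable. A hedged sketch, assuming the chosen image actually carries the required license:

# Illustrative only: passing the key directly instead of setting disable_automatic_updates.
variable "metadata" {
  type = map(string)
  default = {
    google_disable_automatic_updates = "TRUE"
  }
}

The tradeoff is that the license postcondition introduced by this patch is bypassed, so on an unsupported image the key would simply be ignored rather than caught at plan time.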
| `[]` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | +| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | DEPRECATED: Use `enable_public_ips` instead. | `bool` | `null` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 0cc41a8ad5..5843890e24 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -18,6 +18,13 @@ locals { } locals { + disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + + metadata = merge( + local.disable_automatic_updates_metadata, + var.metadata + ) + name = substr(replace(var.name, "/[^a-z0-9]/", ""), 0, 14) additional_disks = [ @@ -71,7 +78,7 @@ locals { labels = local.labels machine_type = var.machine_type - metadata = var.metadata + metadata = local.metadata min_cpu_platform = var.min_cpu_platform on_host_maintenance = var.on_host_maintenance diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf index 6198e7539d..d040de355c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf @@ -68,5 +68,11 @@ data "google_compute_image" "slurm" { condition = var.disk_size_gb >= self.disk_size_gb error_message = "'disk_size_gb: ${var.disk_size_gb}' is smaller than the image size (${self.disk_size_gb}GB), please increase the blueprint disk size" } + postcondition { + # Condition needs to check the suffix of the license, as prefix contains an API version which can change. + # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates + condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + } } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 5112881884..cfd2b2efd6 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -111,6 +111,16 @@ variable "instance_image_custom" { default = false } +variable "disable_automatic_updates" { + description = <<-EOT + If true, disables automatic updates on the created instances. This feature is only available on + supported images (or images derived from them). 
For more details, see + https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates + EOT + type = bool + default = false +} + variable "tags" { type = list(string) description = "Network tag list." diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index ced39f5a83..7639d0bd85 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -147,6 +147,7 @@ limitations under the License. | [central\_manager\_ips](#input\_central\_manager\_ips) | List of IP addresses of HTCondor Central Managers | `list(string)` | n/a | yes | | [default\_mig\_id](#input\_default\_mig\_id) | Default MIG ID for HTCondor jobs; if unset, jobs must specify MIG id | `string` | `""` | no | | [deployment\_name](#input\_deployment\_name) | Cluster Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | +| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `32` | no | | [disk\_type](#input\_disk\_type) | Boot disk size in GB | `string` | `"pd-balanced"` | no | | [distribution\_policy\_target\_shape](#input\_distribution\_policy\_target\_shape) | Target shape acoss zones for instance group managing high availability of access point | `string` | `"ANY_SINGLE_ZONE"` | no | diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 68bf8ace00..9acebbd3b2 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -25,8 +25,14 @@ locals { "DISABLE" = "FALSE" "ENABLE" = "TRUE" } - enable_oslogin_metadata = var.enable_oslogin == "INHERIT" ? {} : { enable-oslogin = lookup(local.oslogin_api_values, var.enable_oslogin, "") } - metadata = merge(local.network_storage_metadata, local.enable_oslogin_metadata, var.metadata) + enable_oslogin_metadata = var.enable_oslogin == "INHERIT" ? {} : { enable-oslogin = lookup(local.oslogin_api_values, var.enable_oslogin, "") } + disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + metadata = merge( + local.network_storage_metadata, + local.enable_oslogin_metadata, + local.disable_automatic_updates_metadata, + var.metadata + ) host_count = 1 name_prefix = "${var.deployment_name}-ap" @@ -114,6 +120,12 @@ data "google_compute_image" "htcondor" { condition = self.disk_size_gb <= var.disk_size_gb error_message = "var.disk_size_gb must be set to at least the size of the image (${self.disk_size_gb})" } + postcondition { + # Condition needs to check the suffix of the license, as prefix contains an API version which can change. + # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates + condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + } } } diff --git a/community/modules/scheduler/htcondor-access-point/variables.tf b/community/modules/scheduler/htcondor-access-point/variables.tf index 0f8ef72d51..88cf3102bb 100644 --- a/community/modules/scheduler/htcondor-access-point/variables.tf +++ b/community/modules/scheduler/htcondor-access-point/variables.tf @@ -164,6 +164,16 @@ variable "instance_image" { } } +variable "disable_automatic_updates" { + description = <<-EOT + If true, disables automatic updates on the created instances. This feature is only available on + supported images (or images derived from them). 
For more details, see + https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates + EOT + type = bool + default = false +} + variable "machine_type" { description = "Machine type to use for HTCondor central managers" type = string diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index 4491b7bb9e..3b09a96876 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -125,6 +125,7 @@ limitations under the License. | [central\_manager\_runner](#input\_central\_manager\_runner) | A list of Toolkit runners for configuring an HTCondor central manager | `list(map(string))` | `[]` | no | | [central\_manager\_service\_account\_email](#input\_central\_manager\_service\_account\_email) | Service account e-mail for central manager (can be supplied by htcondor-setup module) | `string` | n/a | yes | | [deployment\_name](#input\_deployment\_name) | Cluster Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | +| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `20` | no | | [distribution\_policy\_target\_shape](#input\_distribution\_policy\_target\_shape) | Target shape for instance group managing high availability of central manager | `string` | `"ANY_SINGLE_ZONE"` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index 21003d3cae..039dabc606 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -25,8 +25,14 @@ locals { "DISABLE" = "FALSE" "ENABLE" = "TRUE" } - enable_oslogin_metadata = var.enable_oslogin == "INHERIT" ? {} : { enable-oslogin = lookup(local.oslogin_api_values, var.enable_oslogin, "") } - metadata = merge(local.network_storage_metadata, local.enable_oslogin_metadata, var.metadata) + enable_oslogin_metadata = var.enable_oslogin == "INHERIT" ? {} : { enable-oslogin = lookup(local.oslogin_api_values, var.enable_oslogin, "") } + disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + metadata = merge( + local.network_storage_metadata, + local.enable_oslogin_metadata, + local.disable_automatic_updates_metadata, + var.metadata + ) name_prefix = "${var.deployment_name}-cm" @@ -81,6 +87,12 @@ data "google_compute_image" "htcondor" { condition = self.disk_size_gb <= var.disk_size_gb error_message = "var.disk_size_gb must be set to at least the size of the image (${self.disk_size_gb})" } + postcondition { + # Condition needs to check the suffix of the license, as prefix contains an API version which can change. + # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates + condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + } } } diff --git a/community/modules/scheduler/htcondor-central-manager/variables.tf b/community/modules/scheduler/htcondor-central-manager/variables.tf index 8166acef61..6836e435d6 100644 --- a/community/modules/scheduler/htcondor-central-manager/variables.tf +++ b/community/modules/scheduler/htcondor-central-manager/variables.tf @@ -132,6 +132,16 @@ variable "instance_image" { } } +variable "disable_automatic_updates" { + description = <<-EOT + If true, disables automatic updates on the created instances. This feature is only available on + supported images (or images derived from them). 
For more details, see + https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates + EOT + type = bool + default = false +} + variable "machine_type" { description = "Machine type to use for HTCondor central managers" type = string diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 7a481df2b3..9ede17a9fd 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -240,6 +240,7 @@ limitations under the License. | [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `""` | no | | [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to the controller\_startup\_script. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | +| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | If set to false. The controller will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | | [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
- /usr/local/etc/slurm
- /etc/munge
- /home
- /apps
Warning: If these are disabled, the slurm etc and munge dirs must be added
manually, or some other mechanism must be used to synchronize the slurm conf
files and the munge key across the cluster. | `bool` | `false` | no | | [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `true` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index bce1a83cf2..5c1a578d0d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -20,6 +20,12 @@ locals { } locals { + disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + + metadata = merge( + local.disable_automatic_updates_metadata, + var.metadata + ) ghpc_startup_script_controller = [{ filename = "ghpc_startup.sh" @@ -109,7 +115,7 @@ module "slurm_controller_template" { gpu = one(local.guest_accelerator) labels = local.labels machine_type = var.machine_type - metadata = var.metadata + metadata = local.metadata min_cpu_platform = var.min_cpu_platform on_host_maintenance = var.on_host_maintenance preemptible = var.preemptible diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf index a92bd5fc8d..86a975e9ee 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf @@ -68,5 +68,11 @@ data "google_compute_image" "slurm" { condition = var.disk_size_gb >= self.disk_size_gb error_message = "'disk_size_gb: ${var.disk_size_gb}' is smaller than the image size (${self.disk_size_gb}GB), please increase the blueprint disk size" } + postcondition { + # Condition needs to check the suffix of the license, as prefix contains an API version which can change. + # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates + condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + } } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index 7dd719db17..9a68d14900 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -587,6 +587,16 @@ variable "instance_image_custom" { default = false } +variable "disable_automatic_updates" { + description = <<-EOT + If true, disables automatic updates on the created instances. This feature is only available on + supported images (or images derived from them). 
For more details, see + https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates + EOT + type = bool + default = false +} + # tflint-ignore: terraform_unused_declarations variable "source_image_project" { type = string diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 070f9a457d..46c6e16905 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -101,6 +101,7 @@ limitations under the License. | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [controller\_instance\_id](#input\_controller\_instance\_id) | The server-assigned unique identifier of the controller instance. This value
must be supplied as an output of the controller module, typically via `use`. | `string` | n/a | yes | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | +| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disable\_login\_public\_ips](#input\_disable\_login\_public\_ips) | If set to false. The login will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | | [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `true` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf index c925a1c229..e9e78494bb 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf @@ -20,6 +20,13 @@ locals { } locals { + disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + + metadata = merge( + local.disable_automatic_updates_metadata, + var.metadata + ) + ghpc_startup_script = [{ filename = "ghpc_startup.sh" content = var.startup_script @@ -66,7 +73,7 @@ module "slurm_login_template" { gpu = one(local.guest_accelerator) labels = local.labels machine_type = var.machine_type - metadata = var.metadata + metadata = local.metadata min_cpu_platform = var.min_cpu_platform on_host_maintenance = var.on_host_maintenance preemptible = var.preemptible @@ -102,7 +109,7 @@ module "slurm_login_instance" { subnetwork = var.subnetwork_self_link zone = var.zone login_startup_scripts = local.ghpc_startup_script - metadata = var.metadata + metadata = local.metadata slurm_depends_on = var.controller_instance_id == null ? [] : [var.controller_instance_id] enable_reconfigure = var.enable_reconfigure pubsub_topic = var.pubsub_topic diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf index a92bd5fc8d..86a975e9ee 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf @@ -68,5 +68,11 @@ data "google_compute_image" "slurm" { condition = var.disk_size_gb >= self.disk_size_gb error_message = "'disk_size_gb: ${var.disk_size_gb}' is smaller than the image size (${self.disk_size_gb}GB), please increase the blueprint disk size" } + postcondition { + # Condition needs to check the suffix of the license, as prefix contains an API version which can change. + # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates + condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. 
More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + } } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf index 54749e9e8e..dec973e086 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf @@ -326,6 +326,17 @@ variable "instance_image_custom" { default = false } + +variable "disable_automatic_updates" { + description = <<-EOT + If true, disables automatic updates on the created instances. This feature is only available on + supported images (or images derived from them). For more details, see + https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates + EOT + type = bool + default = false +} + # tflint-ignore: terraform_unused_declarations variable "source_image_project" { type = string diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 62cd667789..f39edb80eb 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -236,6 +236,7 @@ limitations under the License. | [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [create\_bucket](#input\_create\_bucket) | Create GCS bucket instead of using an existing one. | `bool` | `true` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | +| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | DEPRECATED: Use `enable_controller_public_ips` instead. | `bool` | `null` | no | | [disable\_default\_mounts](#input\_disable\_default\_mounts) | DEPRECATED: Use `enable_default_mounts` instead. | `bool` | `null` | no | | [disable\_smt](#input\_disable\_smt) | DEPRECATED: Use `enable_smt` instead. | `bool` | `null` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 2262acf718..53a321e3da 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -32,6 +32,14 @@ locals { email = local.service_account_email scopes = var.service_account_scopes } + + disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + + metadata = merge( + local.disable_automatic_updates_metadata, + var.metadata, + local.universe_domain + ) } # INSTANCE TEMPLATE @@ -63,7 +71,7 @@ module "slurm_controller_template" { gpu = one(local.guest_accelerator) machine_type = var.machine_type - metadata = merge(var.metadata, local.universe_domain) + metadata = local.metadata min_cpu_platform = var.min_cpu_platform # network_ip = TODO: add support for network_ip diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf index 6198e7539d..d040de355c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf @@ -68,5 +68,11 @@ data "google_compute_image" "slurm" { condition = var.disk_size_gb >= self.disk_size_gb error_message = "'disk_size_gb: ${var.disk_size_gb}' is smaller than the image size (${self.disk_size_gb}GB), please increase the blueprint disk size" } + postcondition { + # Condition needs to check the suffix of the license, as prefix contains an API version which can change. + # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates + condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + } } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf index d40cceec2a..b65be063f3 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf @@ -298,6 +298,15 @@ variable "instance_image_custom" { default = false } +variable "disable_automatic_updates" { + description = <<-EOT + If true, disables automatic updates on the created instances. 
This feature is only available on + supported images (or images derived from them). For more details, see + https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates + EOT + type = bool + default = false +} variable "tags" { type = list(string) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index d9fe020b65..3f90314718 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -88,6 +88,7 @@ No modules. | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
access_config = optional(list(object({
nat_ip = string
network_tier = string
})), [])
alias_ip_range = optional(list(object({
ip_cidr_range = string
subnetwork_range_name = string
})), [])
ipv6_access_config = optional(list(object({
network_tier = string
})), [])
network = optional(string)
network_ip = optional(string, "")
nic_type = optional(string)
queue_count = optional(number)
stack_type = optional(string)
subnetwork = optional(string)
subnetwork_project = optional(string)
}))
| `[]` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | +| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disable\_login\_public\_ips](#input\_disable\_login\_public\_ips) | DEPRECATED: Use `enable_login_public_ips` instead. | `bool` | `null` | no | | [disable\_smt](#input\_disable\_smt) | DEPRECATED: Use `enable_smt` instead. | `bool` | `null` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf index ab3e45fe9c..f6b835e4a2 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf @@ -18,6 +18,13 @@ locals { } locals { + disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + + metadata = merge( + local.disable_automatic_updates_metadata, + var.metadata + ) + additional_disks = [ for ad in var.additional_disks : { disk_name = ad.disk_name @@ -68,7 +75,7 @@ locals { gpu = one(local.guest_accelerator) labels = local.labels machine_type = var.machine_type - metadata = var.metadata + metadata = local.metadata min_cpu_platform = var.min_cpu_platform num_instances = var.num_instances on_host_maintenance = var.on_host_maintenance diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf index 6198e7539d..d040de355c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf @@ -68,5 +68,11 @@ data "google_compute_image" "slurm" { condition = var.disk_size_gb >= self.disk_size_gb error_message = "'disk_size_gb: ${var.disk_size_gb}' is smaller than the image size (${self.disk_size_gb}GB), please increase the blueprint disk size" } + postcondition { + # Condition needs to check the suffix of the license, as prefix contains an API version which can change. + # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates + condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + } } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf index 880f56f440..dad97bcd8d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf @@ -356,6 +356,16 @@ variable "instance_image_custom" { default = false } +variable "disable_automatic_updates" { + description = <<-EOT + If true, disables automatic updates on the created instances. This feature is only available on + supported images (or images derived from them). 
For more details, see + https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates + EOT + type = bool + default = false +} + variable "tags" { type = list(string) description = "Network tag list." diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index b1b5b0b3c2..1bb9b86cf0 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -209,6 +209,7 @@ limitations under the License. | [automatic\_restart](#input\_automatic\_restart) | Specifies if the instance should be restarted if it was terminated by Compute Engine (not a user). | `bool` | `null` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Tier 1 bandwidth increases the maximum egress bandwidth for VMs.
Using the `tier_1_enabled` setting will enable both gVNIC and TIER\_1 higher bandwidth networking.
Using the `gvnic_enabled` setting will only enable gVNIC and will not enable TIER\_1.
Note that TIER\_1 only works with specific machine families & shapes and must be using an image that supports gVNIC. See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"not_enabled"` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment, will optionally be used name resources according to `name_prefix` | `string` | n/a | yes | +| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | If set to true, instances will not have public IPs | `bool` | `false` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for instances. | `number` | `200` | no | | [disk\_type](#input\_disk\_type) | Disk type for instances. | `string` | `"pd-standard"` | no | @@ -222,7 +223,7 @@ limitations under the License. | [machine\_type](#input\_machine\_type) | Machine type to use for the instance creation | `string` | `"c2-standard-60"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | -| [name\_prefix](#input\_name\_prefix) | An optional name for all VM and disk resources.
If not supplied, `deployment_name` will be used.
When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set,
then resources are named by "<`deployment_name`>-<`name_prefix`>-<#>". | `string` | `null` | no | +| [name\_prefix](#input\_name\_prefix) | An optional name for all VM and disk resources.
If not supplied, `deployment_name` will be used.
When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set,
then resources are named by "<`deployment_name`>-<`name_prefix`>-<#>". | `string` | `null` | no | | [network\_interfaces](#input\_network\_interfaces) | A list of network interfaces. The options match that of the terraform
network\_interface block of google\_compute\_instance. For descriptions of the
subfields or more information see the documentation:
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance#nested_network_interface

**\_NOTE:\_** If `network_interfaces` are set, `network_self_link` and
`subnetwork_self_link` will be ignored, even if they are provided through
the `use` field. `bandwidth_tier` and `disable_public_ips` also do not apply
to network interfaces defined in this variable.

Subfields:
network (string, required if subnetwork is not supplied)
subnetwork (string, required if network is not supplied)
subnetwork\_project (string, optional)
network\_ip (string, optional)
nic\_type (string, optional, choose from ["GVNIC", "VIRTIO\_NET"])
stack\_type (string, optional, choose from ["IPV4\_ONLY", "IPV4\_IPV6"])
queue\_count (number, optional)
access\_config (object, optional)
ipv6\_access\_config (object, optional)
alias\_ip\_range (list(object), optional) |
list(object({
network = string,
subnetwork = string,
subnetwork_project = string,
network_ip = string,
nic_type = string,
stack_type = string,
queue_count = number,
access_config = list(object({
nat_ip = string,
public_ptr_domain_name = string,
network_tier = string
})),
ipv6_access_config = list(object({
public_ptr_domain_name = string,
network_tier = string
})),
alias_ip_range = list(object({
ip_cidr_range = string,
subnetwork_range_name = string
}))
}))
| `[]` | no | | [network\_self\_link](#input\_network\_self\_link) | The self link of the network to attach the VM. Can use "default" for the default network. | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | diff --git a/modules/compute/vm-instance/compute_image.tf b/modules/compute/vm-instance/compute_image.tf new file mode 100644 index 0000000000..012117e65e --- /dev/null +++ b/modules/compute/vm-instance/compute_image.tf @@ -0,0 +1,30 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +data "google_compute_image" "compute_image" { + family = try(var.instance_image.family, null) + name = try(var.instance_image.name, null) + project = try(var.instance_image.project, null) + + lifecycle { + postcondition { + # Condition needs to check the suffix of the license, as prefix contains an API version which can change. + # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates + condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + } + } +} diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index 14e26b0ba2..e942ec8fea 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -64,6 +64,8 @@ locals { } enable_oslogin = var.enable_oslogin == "INHERIT" ? {} : { enable-oslogin = lookup(local.oslogin_api_values, var.enable_oslogin, "") } + disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + # Network Interfaces # Support for `use` input and base network parameters like `network_self_link` and `subnetwork_self_link` empty_access_config = { @@ -92,12 +94,6 @@ locals { ] } -data "google_compute_image" "compute_image" { - family = try(var.instance_image.family, null) - name = try(var.instance_image.name, null) - project = try(var.instance_image.project, null) -} - resource "null_resource" "image" { triggers = { name = try(var.instance_image.name, null), @@ -258,7 +254,13 @@ resource "google_compute_instance" "compute_vm" { } } - metadata = merge(local.network_storage, local.startup_script, local.enable_oslogin, var.metadata) + metadata = merge( + local.network_storage, + local.startup_script, + local.enable_oslogin, + local.disable_automatic_updates_metadata, + var.metadata + ) lifecycle { ignore_changes = [ diff --git a/modules/compute/vm-instance/variables.tf b/modules/compute/vm-instance/variables.tf index 75a8fc5fa5..dc22681a22 100644 --- a/modules/compute/vm-instance/variables.tf +++ b/modules/compute/vm-instance/variables.tf @@ -76,9 +76,9 @@ variable "local_ssd_interface" { variable "name_prefix" { description = <<-EOT - An optional name for all VM and disk resources. - If not supplied, `deployment_name` will be used. 
- When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set, + An optional name for all VM and disk resources. + If not supplied, `deployment_name` will be used. + When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set, then resources are named by "<`deployment_name`>-<`name_prefix`>-<#>". EOT type = string @@ -310,7 +310,7 @@ variable "placement_policy" { validation { condition = var.placement_policy == null ? true : try(keys(var.placement_policy), null) != null error_message = <<-EOT - The var.placement_policy should be either unset/null or be a map/object with + The var.placement_policy should be either unset/null or be a map/object with fields: vm_count (number), availability_domain_count (number), collocation (string), max_distance (number). EOT } @@ -398,3 +398,13 @@ variable "allocate_ip" { }) default = null } + +variable "disable_automatic_updates" { + description = <<-EOT + If true, disables automatic updates on the created instances. This feature is only available on + supported images (or images derived from them). For more details, see + https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates + EOT + type = bool + default = false +} diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index 3ba68b8bf7..d99e158329 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -122,6 +122,7 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 0.13.0 | +| [google](#requirement\_google) | >= 4.0 | | [local](#requirement\_local) | >= 2.0.0 | | [null](#requirement\_null) | ~> 3.0 | | [random](#requirement\_random) | >= 3.0 | @@ -130,6 +131,7 @@ limitations under the License. | Name | Version | |------|---------| +| [google](#provider\_google) | >= 4.0 | | [local](#provider\_local) | >= 2.0.0 | | [null](#provider\_null) | ~> 3.0 | | [random](#provider\_random) | >= 3.0 | @@ -149,12 +151,14 @@ limitations under the License. | [local_file.submit_script](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | | [null_resource.submit_job](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [random_id.submit_job_suffix](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/id) | resource | +| [google_compute_image.compute_image](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [deployment\_name](#input\_deployment\_name) | Name of the deployment, used for the job\_id | `string` | n/a | yes | +| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true, instances will have public IPs | `bool` | `true` | no | | [gcloud\_version](#input\_gcloud\_version) | The version of the gcloud cli being used. Used for output instructions. Valid inputs are `"alpha"`, `"beta"` and "" (empty string for default version) | `string` | `""` | no | | [image](#input\_image) | DEPRECATED: Google Cloud Batch compute node image. Ignored if `instance_template` is provided. | `any` | `null` | no | diff --git a/modules/scheduler/batch-job-template/compute_image.tf b/modules/scheduler/batch-job-template/compute_image.tf new file mode 100644 index 0000000000..012117e65e --- /dev/null +++ b/modules/scheduler/batch-job-template/compute_image.tf @@ -0,0 +1,30 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +data "google_compute_image" "compute_image" { + family = try(var.instance_image.family, null) + name = try(var.instance_image.name, null) + project = try(var.instance_image.project, null) + + lifecycle { + postcondition { + # Condition needs to check the suffix of the license, as prefix contains an API version which can change. + # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates + condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + } + } +} diff --git a/modules/scheduler/batch-job-template/main.tf b/modules/scheduler/batch-job-template/main.tf index c4bf6fc02f..d7a06cb4fe 100644 --- a/modules/scheduler/batch-job-template/main.tf +++ b/modules/scheduler/batch-job-template/main.tf @@ -78,6 +78,14 @@ locals { on_host_maintenance_default = local.gpu_attached ? "TERMINATE" : "MIGRATE" on_host_maintenance = coalesce(var.on_host_maintenance, local.on_host_maintenance_default) + + network_storage_metadata = var.network_storage != null ? ({ network_storage = jsonencode(var.network_storage) }) : {} + disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + + metadata = merge( + local.network_storage_metadata, + local.disable_automatic_updates_metadata + ) } module "instance_template" { @@ -94,10 +102,10 @@ module "instance_template" { machine_type = var.machine_type startup_script = local.startup_from_network_storage - metadata = var.network_storage != null ? 
({ network_storage = jsonencode(var.network_storage) }) : {} - source_image_family = try(var.instance_image.family, "") - source_image = try(var.instance_image.name, "") - source_image_project = var.instance_image.project + metadata = local.metadata + source_image_family = data.google_compute_image.compute_image.family + source_image = data.google_compute_image.compute_image.name + source_image_project = data.google_compute_image.compute_image.project on_host_maintenance = local.on_host_maintenance } diff --git a/modules/scheduler/batch-job-template/variables.tf b/modules/scheduler/batch-job-template/variables.tf index 3f348c4509..11043e9d0c 100644 --- a/modules/scheduler/batch-job-template/variables.tf +++ b/modules/scheduler/batch-job-template/variables.tf @@ -227,3 +227,13 @@ variable "submit" { type = bool default = false } + +variable "disable_automatic_updates" { + description = <<-EOT + If true, disables automatic updates on the created instances. This feature is only available on + supported images (or images derived from them). For more details, see + https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates + EOT + type = bool + default = false +} diff --git a/modules/scheduler/batch-job-template/versions.tf b/modules/scheduler/batch-job-template/versions.tf index ecc6075056..f244d1b4e2 100644 --- a/modules/scheduler/batch-job-template/versions.tf +++ b/modules/scheduler/batch-job-template/versions.tf @@ -28,6 +28,10 @@ terraform { source = "hashicorp/random" version = ">= 3.0" } + google = { + source = "hashicorp/google" + version = ">= 4.0" + } } required_version = ">= 0.13.0" } diff --git a/tools/cloud-build/daily-tests/blueprints/e2e.yaml b/tools/cloud-build/daily-tests/blueprints/e2e.yaml index 6d7a1c788e..6a2b38807a 100644 --- a/tools/cloud-build/daily-tests/blueprints/e2e.yaml +++ b/tools/cloud-build/daily-tests/blueprints/e2e.yaml @@ -34,3 +34,4 @@ deployment_groups: use: [network] settings: machine_type: n2-standard-2 + disable_automatic_updates: true diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py index 567cdda571..64d1154f5e 100644 --- a/tools/duplicate-diff.py +++ b/tools/duplicate-diff.py @@ -86,6 +86,11 @@ "modules/file-system/parallelstore/scripts/mount-daos.sh", "modules/file-system/pre-existing-network-storage/scripts/mount-daos.sh", ], + [ + "modules/compute/vm-instance/compute_image.tf" + "modules/scheduler/batch-job-template/compute_image.tf" + "community/modules/compute/htcondor-execute-point/compute_image.tf" + ], ] for group in duplicates: From 430b33e3f20c9fcd1e16e68788511a2902f8f3d7 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 6 Aug 2024 12:05:58 -0700 Subject: [PATCH 105/118] Change var from `disable_automatic_updates` to `allow_automatic_updates` --- .../modules/compute/htcondor-execute-point/README.md | 2 +- .../compute/htcondor-execute-point/compute_image.tf | 4 ++-- .../modules/compute/htcondor-execute-point/main.tf | 2 +- .../compute/htcondor-execute-point/variables.tf | 9 +++++---- .../compute/schedmd-slurm-gcp-v5-node-group/README.md | 2 +- .../compute/schedmd-slurm-gcp-v5-node-group/main.tf | 2 +- .../source_image_logic.tf | 4 ++-- .../schedmd-slurm-gcp-v5-node-group/variables.tf | 9 +++++---- .../schedmd-slurm-gcp-v6-nodeset-dynamic/README.md | 6 +++--- .../schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf | 2 +- .../source_image_logic.tf | 4 ++-- .../schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf | 9 +++++---- .../compute/schedmd-slurm-gcp-v6-nodeset/README.md | 4 ++-- 
.../compute/schedmd-slurm-gcp-v6-nodeset/main.tf | 2 +- .../source_image_logic.tf | 4 ++-- .../compute/schedmd-slurm-gcp-v6-nodeset/variables.tf | 11 ++++++----- .../modules/scheduler/htcondor-access-point/README.md | 2 +- .../modules/scheduler/htcondor-access-point/main.tf | 6 +++--- .../scheduler/htcondor-access-point/variables.tf | 9 +++++---- .../scheduler/htcondor-central-manager/README.md | 2 +- .../scheduler/htcondor-central-manager/main.tf | 6 +++--- .../scheduler/htcondor-central-manager/variables.tf | 9 +++++---- .../schedmd-slurm-gcp-v5-controller/README.md | 2 +- .../scheduler/schedmd-slurm-gcp-v5-controller/main.tf | 2 +- .../source_image_logic.tf | 4 ++-- .../schedmd-slurm-gcp-v5-controller/variables.tf | 9 +++++---- .../scheduler/schedmd-slurm-gcp-v5-login/README.md | 2 +- .../scheduler/schedmd-slurm-gcp-v5-login/main.tf | 2 +- .../schedmd-slurm-gcp-v5-login/source_image_logic.tf | 4 ++-- .../scheduler/schedmd-slurm-gcp-v5-login/variables.tf | 9 +++++---- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../schedmd-slurm-gcp-v6-controller/controller.tf | 2 +- .../source_image_logic.tf | 4 ++-- .../variables_controller_instance.tf | 9 +++++---- .../scheduler/schedmd-slurm-gcp-v6-login/README.md | 2 +- .../scheduler/schedmd-slurm-gcp-v6-login/main.tf | 2 +- .../schedmd-slurm-gcp-v6-login/source_image_logic.tf | 4 ++-- .../scheduler/schedmd-slurm-gcp-v6-login/variables.tf | 9 +++++---- modules/compute/vm-instance/README.md | 2 +- modules/compute/vm-instance/compute_image.tf | 4 ++-- modules/compute/vm-instance/main.tf | 2 +- modules/compute/vm-instance/variables.tf | 9 +++++---- modules/scheduler/batch-job-template/README.md | 4 ++-- modules/scheduler/batch-job-template/compute_image.tf | 4 ++-- modules/scheduler/batch-job-template/main.tf | 2 +- modules/scheduler/batch-job-template/variables.tf | 6 +++--- modules/scheduler/batch-job-template/versions.tf | 2 +- tools/cloud-build/daily-tests/blueprints/e2e.yaml | 2 +- 48 files changed, 113 insertions(+), 102 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 78cdcc5ab4..36dd0a9ffd 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -226,9 +226,9 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [central\_manager\_ips](#input\_central\_manager\_ips) | List of IP addresses of HTCondor Central Managers | `list(string)` | n/a | yes | | [deployment\_name](#input\_deployment\_name) | Cluster Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | -| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `100` | no | | [disk\_type](#input\_disk\_type) | Disk type for template | `string` | `"pd-balanced"` | no | | [distribution\_policy\_target\_shape](#input\_distribution\_policy\_target\_shape) | Target shape across zones for instance group managing execute points | `string` | `"ANY"` | no | diff --git a/community/modules/compute/htcondor-execute-point/compute_image.tf b/community/modules/compute/htcondor-execute-point/compute_image.tf index 012117e65e..7a7fe02307 100644 --- a/community/modules/compute/htcondor-execute-point/compute_image.tf +++ b/community/modules/compute/htcondor-execute-point/compute_image.tf @@ -23,8 +23,8 @@ data "google_compute_image" "compute_image" { postcondition { # Condition needs to check the suffix of the license, as prefix contains an API version which can change. # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "Disabling automatic updates is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" } } } diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 175e283597..17f5419fe4 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -36,7 +36,7 @@ locals { windows-startup-script-ps1 = local.windows_startup_ps1 } : {} - disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( local.windows_startup_metadata, diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf index 4124f99f93..aab8a54c2d 100644 --- a/community/modules/compute/htcondor-execute-point/variables.tf +++ b/community/modules/compute/htcondor-execute-point/variables.tf @@ -99,14 +99,15 @@ variable "instance_image" { } } -variable "disable_automatic_updates" { +variable "allow_automatic_updates" { description = <<-EOT - If true, disables automatic updates on the created instances. This feature is only available on - supported images (or images derived from them). For more details, see + If false, disables automatic system package updates on the created instances. This feature is + only available on supported images (or images derived from them). 
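To illustrate why the postcondition above matches on the license suffix rather than the full URL, here is a small self-contained sketch (assumes Terraform >= 1.3 for `endswith`; the example license values are illustrative):

locals {
  # The API version segment ("v1" vs "beta") can differ between responses; the suffix does not.
  example_licenses = [
    "https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates",
    "https://www.googleapis.com/compute/beta/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates",
  ]

  supports_disable_auto_updates = anytrue([
    for license in local.example_licenses :
    endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")
  ])
}

output "supports_disable_auto_updates" {
  value = local.supports_disable_auto_updates # true for both API versions above
}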
For more details, see https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates EOT type = bool - default = false + default = true + nullable = false } variable "execute_point_service_account_email" { diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md index 207b21c110..9c73c92674 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md @@ -122,9 +122,9 @@ No modules. | [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the node group instances can be accessed via the internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | | [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | -| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | If set to false. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf index f7b1f1be0d..ae8b93e4d3 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf @@ -20,7 +20,7 @@ locals { } locals { - disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( local.disable_automatic_updates_metadata, diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf index 86a975e9ee..1df327a60b 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf @@ -71,8 +71,8 @@ data "google_compute_image" "slurm" { postcondition { # Condition needs to check the suffix of the license, as prefix contains an API version which can change. # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "Disabling automatic updates is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" } } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf index b686a69c59..86b9f8d021 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf @@ -430,12 +430,13 @@ variable "disable_public_ips" { default = true } -variable "disable_automatic_updates" { +variable "allow_automatic_updates" { description = <<-EOT - If true, disables automatic updates on the created instances. This feature is only available on - supported images (or images derived from them). For more details, see + If false, disables automatic system package updates on the created instances. 
This feature is + only available on supported images (or images derived from them). For more details, see https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates EOT type = bool - default = false + default = true + nullable = false } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index b6a09e818d..c00a6cb690 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -21,10 +21,10 @@ The following code snippet creates an instance template to be used by MIG. settings: partition_name: mp is_default: true - + - id: controller source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller - use: [network, dynamic_partition] + use: [network, dynamic_partition] - id: mig source: community/modules/compute/mig @@ -90,9 +90,9 @@ modules. For support with the underlying modules, see the instructions in the | [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | | [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. (do not use "disk\_type: local-ssd"; known issue being addressed) |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | -| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index d72cf34306..c3aea8c416 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -21,7 +21,7 @@ locals { nodeset_name = substr(replace(var.name, "/[^a-z0-9]/", ""), 0, 14) feature = coalesce(var.feature, local.nodeset_name) - disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( local.disable_automatic_updates_metadata, diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf index d040de355c..35125a3221 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf @@ -71,8 +71,8 @@ data "google_compute_image" "slurm" { postcondition { # Condition needs to check the suffix of the license, as prefix contains an API version which can change. # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "Disabling automatic updates is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" } } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf index 7c569315b6..3c8b0743dd 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf @@ -100,14 +100,15 @@ variable "instance_image_custom" { } -variable "disable_automatic_updates" { +variable "allow_automatic_updates" { description = <<-EOT - If true, disables automatic updates on the created instances. This feature is only available on - supported images (or images derived from them). 
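A short sketch of what the added `nullable = false` changes (assuming Terraform >= 1.1 semantics for non-nullable variables; names are illustrative):

variable "allow_automatic_updates" {
  type     = bool
  default  = true
  nullable = false
}

# With nullable = false, a caller that passes null receives the default instead:
#   allow_automatic_updates = null  ->  var.allow_automatic_updates == true
# Without it, null would flow through to expressions that expect a bool.
output "allow_automatic_updates" {
  value = var.allow_automatic_updates
}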
For more details, see + If false, disables automatic system package updates on the created instances. This feature is + only available on supported images (or images derived from them). For more details, see https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates EOT type = bool - default = false + default = true + nullable = false } variable "tags" { diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 4d818278e1..cea9e8e862 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -159,9 +159,9 @@ No modules. | [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | | [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. (do not use "disk\_type: local-ssd"; known issue being addressed) |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | -| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | DEPRECATED: Use `enable_public_ips` instead. | `bool` | `null` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | @@ -177,7 +177,7 @@ No modules. | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | -| [instance\_properties](#input\_instance\_properties) | Override the instance properties. Used to test features not supported by Slurm GCP,
recommended for advanced usage only.
See https://cloud.google.com/compute/docs/reference/rest/v1/regionInstances/bulkInsert
If any sub-field (e.g. scheduling) is set, it will override the values computed by
SlurmGCP and ignoring values of provided vars. | `any` | `null` | no | +| [instance\_properties](#input\_instance\_properties) | Override the instance properties. Used to test features not supported by Slurm GCP,
recommended for advanced usage only.
See https://cloud.google.com/compute/docs/reference/rest/v1/regionInstances/bulkInsert
If any sub-field (e.g. scheduling) is set, it will override the values computed by
SlurmGCP and ignoring values of provided vars. | `any` | `null` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for compute nodes. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 5843890e24..099920990a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -18,7 +18,7 @@ locals { } locals { - disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( local.disable_automatic_updates_metadata, diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf index d040de355c..35125a3221 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf @@ -71,8 +71,8 @@ data "google_compute_image" "slurm" { postcondition { # Condition needs to check the suffix of the license, as prefix contains an API version which can change. # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "Disabling automatic updates is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" } } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index cfd2b2efd6..62cddfeb48 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -111,14 +111,15 @@ variable "instance_image_custom" { default = false } -variable "disable_automatic_updates" { +variable "allow_automatic_updates" { description = <<-EOT - If true, disables automatic updates on the created instances. This feature is only available on - supported images (or images derived from them). For more details, see + If false, disables automatic system package updates on the created instances. This feature is + only available on supported images (or images derived from them). 
For more details, see https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates EOT type = bool - default = false + default = true + nullable = false } variable "tags" { @@ -490,7 +491,7 @@ variable "instance_properties" { Override the instance properties. Used to test features not supported by Slurm GCP, recommended for advanced usage only. See https://cloud.google.com/compute/docs/reference/rest/v1/regionInstances/bulkInsert - If any sub-field (e.g. scheduling) is set, it will override the values computed by + If any sub-field (e.g. scheduling) is set, it will override the values computed by SlurmGCP and ignoring values of provided vars. EOD type = any diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index 7639d0bd85..0f68d42964 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -143,11 +143,11 @@ limitations under the License. |------|-------------|------|---------|:--------:| | [access\_point\_runner](#input\_access\_point\_runner) | A list of Toolkit runners for configuring an HTCondor access point | `list(map(string))` | `[]` | no | | [access\_point\_service\_account\_email](#input\_access\_point\_service\_account\_email) | Service account for access point (e-mail format) | `string` | n/a | yes | +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [autoscaler\_runner](#input\_autoscaler\_runner) | A list of Toolkit runners for configuring autoscaling daemons | `list(map(string))` | `[]` | no | | [central\_manager\_ips](#input\_central\_manager\_ips) | List of IP addresses of HTCondor Central Managers | `list(string)` | n/a | yes | | [default\_mig\_id](#input\_default\_mig\_id) | Default MIG ID for HTCondor jobs; if unset, jobs must specify MIG id | `string` | `""` | no | | [deployment\_name](#input\_deployment\_name) | Cluster Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | -| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `32` | no | | [disk\_type](#input\_disk\_type) | Boot disk size in GB | `string` | `"pd-balanced"` | no | | [distribution\_policy\_target\_shape](#input\_distribution\_policy\_target\_shape) | Target shape acoss zones for instance group managing high availability of access point | `string` | `"ANY_SINGLE_ZONE"` | no | diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 9acebbd3b2..0c76da3706 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -26,7 +26,7 @@ locals { "ENABLE" = "TRUE" } enable_oslogin_metadata = var.enable_oslogin == "INHERIT" ? {} : { enable-oslogin = lookup(local.oslogin_api_values, var.enable_oslogin, "") } - disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( local.network_storage_metadata, local.enable_oslogin_metadata, @@ -123,8 +123,8 @@ data "google_compute_image" "htcondor" { postcondition { # Condition needs to check the suffix of the license, as prefix contains an API version which can change. # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "Disabling automatic updates is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" } } } diff --git a/community/modules/scheduler/htcondor-access-point/variables.tf b/community/modules/scheduler/htcondor-access-point/variables.tf index 88cf3102bb..f54a88ac2e 100644 --- a/community/modules/scheduler/htcondor-access-point/variables.tf +++ b/community/modules/scheduler/htcondor-access-point/variables.tf @@ -164,14 +164,15 @@ variable "instance_image" { } } -variable "disable_automatic_updates" { +variable "allow_automatic_updates" { description = <<-EOT - If true, disables automatic updates on the created instances. This feature is only available on - supported images (or images derived from them). For more details, see + If false, disables automatic system package updates on the created instances. This feature is + only available on supported images (or images derived from them). 
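For context on the merge above, a minimal sketch of how the access point composes its instance metadata from several optional sources (all values are illustrative; later arguments to `merge()` win on key conflicts):

locals {
  network_storage_metadata           = {}                                 # populated only when storage is attached
  enable_oslogin_metadata            = { enable-oslogin = "TRUE" }
  disable_automatic_updates_metadata = { google_disable_automatic_updates = "TRUE" }
  user_metadata                      = { owner = "hpc-team" }             # stands in for var.metadata

  metadata = merge(
    local.network_storage_metadata,
    local.enable_oslogin_metadata,
    local.disable_automatic_updates_metadata,
    local.user_metadata # user-supplied keys override module defaults
  )
}

output "metadata" {
  value = local.metadata
}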
For more details, see https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates EOT type = bool - default = false + default = true + nullable = false } variable "machine_type" { diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index 3b09a96876..eed11b3d99 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -122,10 +122,10 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [central\_manager\_runner](#input\_central\_manager\_runner) | A list of Toolkit runners for configuring an HTCondor central manager | `list(map(string))` | `[]` | no | | [central\_manager\_service\_account\_email](#input\_central\_manager\_service\_account\_email) | Service account e-mail for central manager (can be supplied by htcondor-setup module) | `string` | n/a | yes | | [deployment\_name](#input\_deployment\_name) | Cluster Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | -| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `20` | no | | [distribution\_policy\_target\_shape](#input\_distribution\_policy\_target\_shape) | Target shape for instance group managing high availability of central manager | `string` | `"ANY_SINGLE_ZONE"` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index 039dabc606..52bd0072c7 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -26,7 +26,7 @@ locals { "ENABLE" = "TRUE" } enable_oslogin_metadata = var.enable_oslogin == "INHERIT" ? {} : { enable-oslogin = lookup(local.oslogin_api_values, var.enable_oslogin, "") } - disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( local.network_storage_metadata, local.enable_oslogin_metadata, @@ -90,8 +90,8 @@ data "google_compute_image" "htcondor" { postcondition { # Condition needs to check the suffix of the license, as prefix contains an API version which can change. # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "Disabling automatic updates is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" } } } diff --git a/community/modules/scheduler/htcondor-central-manager/variables.tf b/community/modules/scheduler/htcondor-central-manager/variables.tf index 6836e435d6..7f85861c3f 100644 --- a/community/modules/scheduler/htcondor-central-manager/variables.tf +++ b/community/modules/scheduler/htcondor-central-manager/variables.tf @@ -132,14 +132,15 @@ variable "instance_image" { } } -variable "disable_automatic_updates" { +variable "allow_automatic_updates" { description = <<-EOT - If true, disables automatic updates on the created instances. This feature is only available on - supported images (or images derived from them). For more details, see + If false, disables automatic system package updates on the created instances. This feature is + only available on supported images (or images derived from them). 
For more details, see https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates EOT type = bool - default = false + default = true + nullable = false } variable "machine_type" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 9ede17a9fd..50f719575a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -231,6 +231,7 @@ limitations under the License. |------|-------------|------|---------|:--------:| | [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | | [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
list(object({
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | | [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. |
object({
no_comma_params = bool
resume_rate = number
resume_timeout = number
suspend_rate = number
suspend_timeout = number
})
|
{
"no_comma_params": false,
"resume_rate": 0,
"resume_timeout": 300,
"suspend_rate": 0,
"suspend_timeout": 300
}
| no | @@ -240,7 +241,6 @@ limitations under the License. | [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `""` | no | | [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to the controller\_startup\_script. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | -| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | If set to false. The controller will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | | [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
- /usr/local/etc/slurm
- /etc/munge
- /home
- /apps
Warning: If these are disabled, the slurm etc and munge dirs must be added
manually, or some other mechanism must be used to synchronize the slurm conf
files and the munge key across the cluster. | `bool` | `false` | no | | [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `true` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index 5c1a578d0d..92d7a9d840 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -20,7 +20,7 @@ locals { } locals { - disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( local.disable_automatic_updates_metadata, diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf index 86a975e9ee..1df327a60b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf @@ -71,8 +71,8 @@ data "google_compute_image" "slurm" { postcondition { # Condition needs to check the suffix of the license, as prefix contains an API version which can change. # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "Disabling automatic updates is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" } } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index 9a68d14900..e921ba3dc6 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -587,14 +587,15 @@ variable "instance_image_custom" { default = false } -variable "disable_automatic_updates" { +variable "allow_automatic_updates" { description = <<-EOT - If true, disables automatic updates on the created instances. This feature is only available on - supported images (or images derived from them). For more details, see + If false, disables automatic system package updates on the created instances. This feature is + only available on supported images (or images derived from them). 
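For deployments migrating from the old input, a purely illustrative sketch of the relationship between the two flags; the metadata emitted to the instance is identical in both cases:

variable "allow_automatic_updates" {
  type     = bool
  default  = true
  nullable = false
}

locals {
  # Value the removed `disable_automatic_updates` input would have carried:
  disable_automatic_updates = !var.allow_automatic_updates

  # Present only when updates are not allowed, exactly as before the rename.
  disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" }
}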
For more details, see https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates EOT type = bool - default = false + default = true + nullable = false } # tflint-ignore: terraform_unused_declarations diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 46c6e16905..269d729a6f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -98,10 +98,10 @@ limitations under the License. |------|-------------|------|---------|:--------:| | [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | | [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
list(object({
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [controller\_instance\_id](#input\_controller\_instance\_id) | The server-assigned unique identifier of the controller instance. This value
must be supplied as an output of the controller module, typically via `use`. | `string` | n/a | yes | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | -| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disable\_login\_public\_ips](#input\_disable\_login\_public\_ips) | If set to false. The login will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | | [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `true` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf index e9e78494bb..af9254ae74 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf @@ -20,7 +20,7 @@ locals { } locals { - disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( local.disable_automatic_updates_metadata, diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf index 86a975e9ee..1df327a60b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf @@ -71,8 +71,8 @@ data "google_compute_image" "slurm" { postcondition { # Condition needs to check the suffix of the license, as prefix contains an API version which can change. # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "Disabling automatic updates is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" } } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf index dec973e086..a86bab126f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf @@ -327,14 +327,15 @@ variable "instance_image_custom" { } -variable "disable_automatic_updates" { +variable "allow_automatic_updates" { description = <<-EOT - If true, disables automatic updates on the created instances. This feature is only available on - supported images (or images derived from them). For more details, see + If false, disables automatic system package updates on the created instances. 
This feature is + only available on supported images (or images derived from them). For more details, see https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates EOT type = bool - default = false + default = true + nullable = false } # tflint-ignore: terraform_unused_declarations diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index f39edb80eb..1633d9079f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -223,6 +223,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
list(object({
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. If not specified, then one will be chosen based on slurm\_cluster\_name. | `string` | `null` | no | | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket.
Ignored when 'create\_bucket' is true. | `string` | `null` | no | @@ -236,7 +237,6 @@ limitations under the License. | [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [create\_bucket](#input\_create\_bucket) | Create GCS bucket instead of using an existing one. | `bool` | `true` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | -| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | DEPRECATED: Use `enable_controller_public_ips` instead. | `bool` | `null` | no | | [disable\_default\_mounts](#input\_disable\_default\_mounts) | DEPRECATED: Use `enable_default_mounts` instead. | `bool` | `null` | no | | [disable\_smt](#input\_disable\_smt) | DEPRECATED: Use `enable_smt` instead. | `bool` | `null` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 53a321e3da..4352e67ab7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -33,7 +33,7 @@ locals { scopes = var.service_account_scopes } - disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( local.disable_automatic_updates_metadata, diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf index d040de355c..35125a3221 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf @@ -71,8 +71,8 @@ data "google_compute_image" "slurm" { postcondition { # Condition needs to check the suffix of the license, as prefix contains an API version which can change. # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "Disabling automatic updates is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" } } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf index b65be063f3..69eea81844 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf @@ -298,14 +298,15 @@ variable "instance_image_custom" { default = false } -variable "disable_automatic_updates" { +variable "allow_automatic_updates" { description = <<-EOT - If true, disables automatic updates on the created instances. This feature is only available on - supported images (or images derived from them). 
For more details, see + If false, disables automatic system package updates on the created instances. This feature is + only available on supported images (or images derived from them). For more details, see https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates EOT type = bool - default = false + default = true + nullable = false } variable "tags" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index 3f90314718..d8078eb573 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -86,9 +86,9 @@ No modules. |------|-------------|------|---------|:--------:| | [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
list(object({
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
access_config = optional(list(object({
nat_ip = string
network_tier = string
})), [])
alias_ip_range = optional(list(object({
ip_cidr_range = string
subnetwork_range_name = string
})), [])
ipv6_access_config = optional(list(object({
network_tier = string
})), [])
network = optional(string)
network_ip = optional(string, "")
nic_type = optional(string)
queue_count = optional(number)
stack_type = optional(string)
subnetwork = optional(string)
subnetwork_project = optional(string)
}))
| `[]` | no | +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | -| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disable\_login\_public\_ips](#input\_disable\_login\_public\_ips) | DEPRECATED: Use `enable_login_public_ips` instead. | `bool` | `null` | no | | [disable\_smt](#input\_disable\_smt) | DEPRECATED: Use `enable_smt` instead. | `bool` | `null` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf index f6b835e4a2..035f1dc426 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf @@ -18,7 +18,7 @@ locals { } locals { - disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( local.disable_automatic_updates_metadata, diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf index d040de355c..35125a3221 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf @@ -71,8 +71,8 @@ data "google_compute_image" "slurm" { postcondition { # Condition needs to check the suffix of the license, as prefix contains an API version which can change. # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "Disabling automatic updates is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" } } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf index dad97bcd8d..f7d4cacd85 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf @@ -356,14 +356,15 @@ variable "instance_image_custom" { default = false } -variable "disable_automatic_updates" { +variable "allow_automatic_updates" { description = <<-EOT - If true, disables automatic updates on the created instances. This feature is only available on - supported images (or images derived from them). For more details, see + If false, disables automatic system package updates on the created instances. This feature is + only available on supported images (or images derived from them). 
For more details, see https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates EOT type = bool - default = false + default = true + nullable = false } variable "tags" { diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 1bb9b86cf0..fb3fde84e6 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -205,11 +205,11 @@ limitations under the License. |------|-------------|------|---------|:--------:| | [add\_deployment\_name\_before\_prefix](#input\_add\_deployment\_name\_before\_prefix) | If true, the names of VMs and disks will always be prefixed with `deployment_name` to enable uniqueness across deployments.
See `name_prefix` for further details on resource naming behavior. | `bool` | `false` | no | | [allocate\_ip](#input\_allocate\_ip) | If not null, allocate IPs with the given configuration. See details at
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_address |
object({
address_type = optional(string, "INTERNAL")
purpose = optional(string),
network_tier = optional(string),
ip_version = optional(string, "IPV4"),
})
| `null` | no | +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [auto\_delete\_boot\_disk](#input\_auto\_delete\_boot\_disk) | Controls if boot disk should be auto-deleted when instance is deleted. | `bool` | `true` | no | | [automatic\_restart](#input\_automatic\_restart) | Specifies if the instance should be restarted if it was terminated by Compute Engine (not a user). | `bool` | `null` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Tier 1 bandwidth increases the maximum egress bandwidth for VMs.
Using the `tier_1_enabled` setting will enable both gVNIC and TIER\_1 higher bandwidth networking.
Using the `gvnic_enabled` setting will only enable gVNIC and will not enable TIER\_1.
Note that TIER\_1 only works with specific machine families & shapes and must be using an image that supports gVNIC. See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"not_enabled"` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment, will optionally be used name resources according to `name_prefix` | `string` | n/a | yes | -| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | If set to true, instances will not have public IPs | `bool` | `false` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for instances. | `number` | `200` | no | | [disk\_type](#input\_disk\_type) | Disk type for instances. | `string` | `"pd-standard"` | no | diff --git a/modules/compute/vm-instance/compute_image.tf b/modules/compute/vm-instance/compute_image.tf index 012117e65e..7a7fe02307 100644 --- a/modules/compute/vm-instance/compute_image.tf +++ b/modules/compute/vm-instance/compute_image.tf @@ -23,8 +23,8 @@ data "google_compute_image" "compute_image" { postcondition { # Condition needs to check the suffix of the license, as prefix contains an API version which can change. # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "Disabling automatic updates is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" } } } diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index e942ec8fea..683fa77682 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -64,7 +64,7 @@ locals { } enable_oslogin = var.enable_oslogin == "INHERIT" ? {} : { enable-oslogin = lookup(local.oslogin_api_values, var.enable_oslogin, "") } - disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } # Network Interfaces # Support for `use` input and base network parameters like `network_self_link` and `subnetwork_self_link` diff --git a/modules/compute/vm-instance/variables.tf b/modules/compute/vm-instance/variables.tf index dc22681a22..f675325187 100644 --- a/modules/compute/vm-instance/variables.tf +++ b/modules/compute/vm-instance/variables.tf @@ -399,12 +399,13 @@ variable "allocate_ip" { default = null } -variable "disable_automatic_updates" { +variable "allow_automatic_updates" { description = <<-EOT - If true, disables automatic updates on the created instances. This feature is only available on - supported images (or images derived from them). For more details, see + If false, disables automatic system package updates on the created instances. This feature is + only available on supported images (or images derived from them). 
For more details, see https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates EOT type = bool - default = false + default = true + nullable = false } diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index d99e158329..345aaf638e 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -121,7 +121,7 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 0.13.0 | +| [terraform](#requirement\_terraform) | >= 1.1 | | [google](#requirement\_google) | >= 4.0 | | [local](#requirement\_local) | >= 2.0.0 | | [null](#requirement\_null) | ~> 3.0 | @@ -157,8 +157,8 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment, used for the job\_id | `string` | n/a | yes | -| [disable\_automatic\_updates](#input\_disable\_automatic\_updates) | If true, disables automatic updates on the created instances. This feature is only available on
supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true, instances will have public IPs | `bool` | `true` | no | | [gcloud\_version](#input\_gcloud\_version) | The version of the gcloud cli being used. Used for output instructions. Valid inputs are `"alpha"`, `"beta"` and "" (empty string for default version) | `string` | `""` | no | | [image](#input\_image) | DEPRECATED: Google Cloud Batch compute node image. Ignored if `instance_template` is provided. | `any` | `null` | no | diff --git a/modules/scheduler/batch-job-template/compute_image.tf b/modules/scheduler/batch-job-template/compute_image.tf index 012117e65e..7a7fe02307 100644 --- a/modules/scheduler/batch-job-template/compute_image.tf +++ b/modules/scheduler/batch-job-template/compute_image.tf @@ -23,8 +23,8 @@ data "google_compute_image" "compute_image" { postcondition { # Condition needs to check the suffix of the license, as prefix contains an API version which can change. # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = !var.disable_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "The 'disable_automatic_updates' feature is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" + condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) + error_message = "Disabling automatic updates is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" } } } diff --git a/modules/scheduler/batch-job-template/main.tf b/modules/scheduler/batch-job-template/main.tf index d7a06cb4fe..bb378ea7b1 100644 --- a/modules/scheduler/batch-job-template/main.tf +++ b/modules/scheduler/batch-job-template/main.tf @@ -80,7 +80,7 @@ locals { on_host_maintenance = coalesce(var.on_host_maintenance, local.on_host_maintenance_default) network_storage_metadata = var.network_storage != null ? ({ network_storage = jsonencode(var.network_storage) }) : {} - disable_automatic_updates_metadata = var.disable_automatic_updates ? { google_disable_automatic_updates = "TRUE" } : {} + disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } metadata = merge( local.network_storage_metadata, diff --git a/modules/scheduler/batch-job-template/variables.tf b/modules/scheduler/batch-job-template/variables.tf index 11043e9d0c..da214e29ea 100644 --- a/modules/scheduler/batch-job-template/variables.tf +++ b/modules/scheduler/batch-job-template/variables.tf @@ -228,10 +228,10 @@ variable "submit" { default = false } -variable "disable_automatic_updates" { +variable "allow_automatic_updates" { description = <<-EOT - If true, disables automatic updates on the created instances. This feature is only available on - supported images (or images derived from them). For more details, see + If false, disables automatic system package updates on the created instances. 
This feature is + only available on supported images (or images derived from them). For more details, see https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates EOT type = bool diff --git a/modules/scheduler/batch-job-template/versions.tf b/modules/scheduler/batch-job-template/versions.tf index f244d1b4e2..a1161e1354 100644 --- a/modules/scheduler/batch-job-template/versions.tf +++ b/modules/scheduler/batch-job-template/versions.tf @@ -33,5 +33,5 @@ terraform { version = ">= 4.0" } } - required_version = ">= 0.13.0" + required_version = ">= 1.1" } diff --git a/tools/cloud-build/daily-tests/blueprints/e2e.yaml b/tools/cloud-build/daily-tests/blueprints/e2e.yaml index 6a2b38807a..cafb3a194e 100644 --- a/tools/cloud-build/daily-tests/blueprints/e2e.yaml +++ b/tools/cloud-build/daily-tests/blueprints/e2e.yaml @@ -34,4 +34,4 @@ deployment_groups: use: [network] settings: machine_type: n2-standard-2 - disable_automatic_updates: true + allow_automatic_updates: false From bb9f206554b5a962e1764cf0a1761dba0b97c5a6 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Mon, 5 Aug 2024 09:10:40 +0000 Subject: [PATCH 106/118] Update local ssd examples to use local ssd startup solution --- .../hpc-slurm-local-ssd-v5-legacy.yaml | 43 ++++++------------ community/examples/hpc-slurm-local-ssd.yaml | 44 ++++++------------- .../daily-tests/builds/slurm-gcp-v6-ssd.yaml | 1 + 3 files changed, 29 insertions(+), 59 deletions(-) diff --git a/community/examples/hpc-slurm-local-ssd-v5-legacy.yaml b/community/examples/hpc-slurm-local-ssd-v5-legacy.yaml index c8b18d1f8f..a10116544d 100644 --- a/community/examples/hpc-slurm-local-ssd-v5-legacy.yaml +++ b/community/examples/hpc-slurm-local-ssd-v5-legacy.yaml @@ -37,6 +37,19 @@ deployment_groups: settings: local_mount: /home + - id: startup + source: modules/scripts/startup-script + settings: + # When shutting down a VM with local SSD disks, we strongly recommend the + # automatic migration of data following these instructions: + # https://cloud.google.com/compute/docs/disks/local-ssd#stop_instance + # Failure to do will result in VMs that lose data and do not automatically + # mount local SSD filesystems + local_ssd_filesystem: + fs_type: ext4 + mountpoint: /mnt/localssd + permissions: "0755" # must quote numeric filesystem permissions! 
+ - id: compute_node_group source: community/modules/compute/schedmd-slurm-gcp-v5-node-group settings: @@ -66,39 +79,11 @@ deployment_groups: - network1 - homefs - compute_node_group + - startup settings: is_default: true partition_name: ssdcomp region: us-central1 - startup_script: | - #!/bin/bash - set -e -o pipefail - - # this script assumes it is running on a RedHat-derivative OS - yum install -y mdadm - - RAID_DEVICE=/dev/md0 - DST_MNT=/mnt/localssd - DISK_LABEL=LOCALSSD - OPTIONS=discard,defaults - - # if mount is successful, do nothing - if mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS"; then - exit 0 - fi - - # Create new RAID, format ext4 and mount - # TODO: handle case of zero or 1 local SSD disk - # TODO: handle case when /dev/md0 exists but was not mountable - DEVICES=`nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | paste -sd ' '` - NB_DEVICES=`nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | wc -l` - mdadm --create "$RAID_DEVICE" --level=0 --raid-devices=$NB_DEVICES $DEVICES - mkfs.ext4 -F "$RAID_DEVICE" - tune2fs "$RAID_DEVICE" -r 131072 - e2label "$RAID_DEVICE" "$DISK_LABEL" - mkdir -p "$DST_MNT" - mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS" - chmod 1777 "$DST_MNT" - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller diff --git a/community/examples/hpc-slurm-local-ssd.yaml b/community/examples/hpc-slurm-local-ssd.yaml index 6540d0e8cc..f2e1b7e8e4 100644 --- a/community/examples/hpc-slurm-local-ssd.yaml +++ b/community/examples/hpc-slurm-local-ssd.yaml @@ -37,9 +37,22 @@ deployment_groups: settings: local_mount: /home + - id: startup + source: modules/scripts/startup-script + settings: + # When shutting down a VM with local SSD disks, we strongly recommend the + # automatic migration of data following these instructions: + # https://cloud.google.com/compute/docs/disks/local-ssd#stop_instance + # Failure to do will result in VMs that lose data and do not automatically + # mount local SSD filesystems + local_ssd_filesystem: + fs_type: ext4 + mountpoint: /mnt/localssd + permissions: "0755" # must quote numeric filesystem permissions! 
+ - id: nodeset source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset - use: [network] + use: [network, startup] settings: additional_disks: - device_name: test-disk-1 @@ -60,35 +73,6 @@ deployment_groups: machine_type: c2-standard-4 node_count_dynamic_max: 5 node_count_static: 0 - startup_script: | - #!/bin/bash - set -e -o pipefail - - # this script assumes it is running on a RedHat-derivative OS - yum install -y mdadm - - RAID_DEVICE=/dev/md0 - DST_MNT=/mnt/localssd - DISK_LABEL=LOCALSSD - OPTIONS=discard,defaults - - # if mount is successful, do nothing - if mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS"; then - exit 0 - fi - - # Create new RAID, format ext4 and mount - # TODO: handle case of zero or 1 local SSD disk - # TODO: handle case when /dev/md0 exists but was not mountable - DEVICES=`nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | paste -sd ' '` - NB_DEVICES=`nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | wc -l` - mdadm --create "$RAID_DEVICE" --level=0 --raid-devices=$NB_DEVICES $DEVICES - mkfs.ext4 -F "$RAID_DEVICE" - tune2fs "$RAID_DEVICE" -r 131072 - e2label "$RAID_DEVICE" "$DISK_LABEL" - mkdir -p "$DST_MNT" - mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS" - chmod 1777 "$DST_MNT" - id: partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ssd.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ssd.yaml index 63d46afc19..1f55daa395 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ssd.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ssd.yaml @@ -19,6 +19,7 @@ tags: - m.schedmd-slurm-gcp-v6-login - m.schedmd-slurm-gcp-v6-nodeset - m.schedmd-slurm-gcp-v6-partition +- m.startup-script - m.vpc - slurm6 From cb8c7fb8d2ba49d78b1ba0ae4a802fb0827e985e Mon Sep 17 00:00:00 2001 From: Alyssa Date: Mon, 5 Aug 2024 18:17:07 +0000 Subject: [PATCH 107/118] Update a3-megagpu-8 example to use local SSD solution --- .../a3-megagpu-8g/slurm-a3mega-cluster.yaml | 8 +++ .../a3-megagpu-8g/slurm-a3mega-image.yaml | 72 ------------------- 2 files changed, 8 insertions(+), 72 deletions(-) diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml index 5ec78fc43e..5dae64cb67 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml @@ -80,6 +80,14 @@ deployment_groups: - id: a3mega_startup source: modules/scripts/startup-script settings: + # When shutting down a VM with local SSD disks, we strongly recommend the + # automatic migration of data following these instructions: + # https://cloud.google.com/compute/docs/disks/local-ssd#stop_instance + # Failure to do will result in VMs that lose data and do not automatically + # mount local SSD filesystems + local_ssd_filesystem: + mountpoint: /mnt/localssd + permissions: "0755" # must quote numeric filesystem permissions! 
runners: - type: shell destination: setup_aperture.sh diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index e9572dc52c..e9f4a6bd89 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -228,78 +228,6 @@ deployment_groups: ansible.builtin.apt: name: dmabuf-import-helper state: present - - type: ansible-local - destination: mount-local-ssd-service.yml - content: | - --- - - name: Enable mount-local-ssd.service - hosts: all - become: true - tasks: - - name: Install mdadm - ansible.builtin.package: - name: mdadm - state: present - - name: Install local SSD formatting script - ansible.builtin.copy: - dest: /usr/local/ghpc/mount_localssd.sh - owner: root - group: root - mode: 0o755 - content: | - #!/bin/bash - set -e -o pipefail - - RAID_DEVICE=/dev/md0 - DST_MNT=/mnt/localssd - DISK_LABEL=LOCALSSD - OPTIONS=discard,defaults - - # if mount is successful, do nothing - if mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS"; then - exit 0 - fi - - # Create new RAID, format ext4 and mount - # TODO: handle case of zero or 1 local SSD disk - # TODO: handle case when /dev/md0 exists but was not mountable for - # some reason - DEVICES=`nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | paste -sd ' '` - NB_DEVICES=`nvme list | grep nvme_ | grep -v nvme_card-pd | awk '{print $1}' | wc -l` - mdadm --create "$RAID_DEVICE" --level=0 --raid-devices=$NB_DEVICES $DEVICES - mkfs.ext4 -F "$RAID_DEVICE" - tune2fs "$RAID_DEVICE" -r 131072 - e2label "$RAID_DEVICE" "$DISK_LABEL" - mkdir -p "$DST_MNT" - mount --source LABEL="$DISK_LABEL" --target="$DST_MNT" -o "$OPTIONS" - chmod 1777 "$DST_MNT" - - name: Configure mount-local-ssd.service - ansible.builtin.copy: - dest: /etc/systemd/system/mount-local-ssd.service - owner: root - group: root - mode: 0o644 - content: | - [Unit] - Description=Assemble local SSDs as software RAID; then format and mount - - [Service] - ExecCondition=bash -c '/usr/bin/curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/machine-type | grep -q "/a3-megagpu-8g$"' - ExecStart=/bin/bash /usr/local/ghpc/mount_localssd.sh - - [Install] - WantedBy=local-fs.target - notify: Reload SystemD - handlers: - - name: Reload SystemD - ansible.builtin.systemd: - daemon_reload: true - post_tasks: - - name: Start Local SSD service - ansible.builtin.service: - name: mount-local-ssd.service - state: started - enabled: true - type: ansible-local destination: timesyncd.yml content: | From 088598d26d3652fff5253d190dd21369acc6225f Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 7 Aug 2024 20:11:24 +0000 Subject: [PATCH 108/118] DWS. Add form to sing up for allowlist --- docs/slurm-dws-flex.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/slurm-dws-flex.md b/docs/slurm-dws-flex.md index 1ed7ba4128..ffea0bec16 100644 --- a/docs/slurm-dws-flex.md +++ b/docs/slurm-dws-flex.md @@ -4,6 +4,10 @@ With Dynamic Workload Scheduler in Flex Start mode, you submit a GPU capacity request for your AI/ML jobs by indicating how many you need, a duration, and your preferred region. It supports capacity requests for up to seven days, with no minimum duration requirement. You can request capacity for as little as a few minutes or hours; typically, the scheduler can fulfill shorter requests more quickly than longer ones. 
+> [!IMPORTANT] +> The project needs to be allowlisted for private preview access. +> Fill out the [form](https://docs.google.com/forms/d/1etaaXMW9jJUTTxfUC7TIIMttLWT5H-3Q8_3-sG6vwKk/edit). + In order to make use of DWS Flex Start mode with SlurmGCP, you must specify a proper set of `instance_properties` in the `schedmd-slurm-gcp-v6-nodeset` module. See the example below: ```yaml From 37a987443864f409859458b405936da6f633ed50 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 8 Aug 2024 00:42:42 +0000 Subject: [PATCH 109/118] Remove `docs/slurm-dws-flex.md` --- docs/slurm-dws-flex.md | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 docs/slurm-dws-flex.md diff --git a/docs/slurm-dws-flex.md b/docs/slurm-dws-flex.md deleted file mode 100644 index ffea0bec16..0000000000 --- a/docs/slurm-dws-flex.md +++ /dev/null @@ -1,32 +0,0 @@ -# Obtaining SlurmGCP nodes with DWS Flex - -[Dynamic Workload Scheduler](https://cloud.google.com/blog/products/compute/introducing-dynamic-workload-scheduler) Flex Start mode is designed for fine-tuning models, experimentation, shorter training jobs, distillation, offline inference, and batch jobs. - -With Dynamic Workload Scheduler in Flex Start mode, you submit a GPU capacity request for your AI/ML jobs by indicating how many you need, a duration, and your preferred region. It supports capacity requests for up to seven days, with no minimum duration requirement. You can request capacity for as little as a few minutes or hours; typically, the scheduler can fulfill shorter requests more quickly than longer ones. - -> [!IMPORTANT] -> The project needs to be allowlisted for private preview access. -> Fill out the [form](https://docs.google.com/forms/d/1etaaXMW9jJUTTxfUC7TIIMttLWT5H-3Q8_3-sG6vwKk/edit). - -In order to make use of DWS Flex Start mode with SlurmGCP, you must specify a proper set of `instance_properties` in the `schedmd-slurm-gcp-v6-nodeset` module. See the example below: - -```yaml - - id: flex_nodeset - source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset - use: [network] - settings: - instance_properties: - reservationAffinity: - consumeReservationType: NO_RESERVATION - scheduling: - maxRunDuration: { seconds: $(2 * 60 * 60) } # 2 hours - onHostMaintenance: TERMINATE - instanceTerminationAction: DELETE - # the rest of the settings, e.g. node_count_static, machine_type, additional_disks, etc. -``` - -**All** fields in `instance_properties` should match provided values, except for `maxRunDuration`, which should be set to the desired duration in seconds (up to 604800 = 7 days). - -> [!WARNING] -> The use of the `instance_properties` setting directly overrides bulkInsert API parameters. While the documented sample -> was tested at the time of publication, it is not regression tested and may cease to work based on changes in the bulkInsert API. 
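A minimal sketch of how a blueprint consumes the renamed `allow_automatic_updates` flag that the example updates later in this series apply. The module id, network reference, and machine type below are illustrative placeholders rather than values taken from any patch; the assumed behavior is the one implemented in the module changes above, where the default of `true` leaves automatic updates enabled and `false` attaches the `google_disable_automatic_updates = "TRUE"` metadata, which only takes effect on VM images that support disabling updates.

```yaml
  # Hypothetical blueprint excerpt; the id, use reference, and machine type are placeholders.
  - id: workstation
    source: modules/compute/vm-instance
    use: [network]
    settings:
      machine_type: n2-standard-2
      # Defaults to true. Setting false adds google_disable_automatic_updates = "TRUE"
      # to the instance metadata and requires an image that supports disabling updates.
      allow_automatic_updates: false
```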
From dcafdde36572d825f8e4652d941366b18aef19fc Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 8 Aug 2024 17:53:52 -0700 Subject: [PATCH 110/118] Align variable name around gcluster to avoid mismatch --- .../ansible_playbooks/tasks/gather_startup_script_logs.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/gather_startup_script_logs.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/gather_startup_script_logs.yml index 6b5808ba8b..a499e43cad 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/gather_startup_script_logs.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/gather_startup_script_logs.yml @@ -16,14 +16,14 @@ - name: Assert variables are defined ansible.builtin.assert: that: - - ghpc_stderr is defined + - gcluster_stderr is defined # Searches the ghpc stderr for a command that gathers the serial logs from the # deployed VM, defaults to an empty string if the command is not found - name: Get serial port command failed_when: false ansible.builtin.set_fact: - serial_port_cmd: '{{ ghpc_stderr | regex_findall("please run:\s+(.+?\s+--project\s+\S+)", "\\1") | first | default("") }}' + serial_port_cmd: '{{ gcluster_stderr | regex_findall("please run:\s+(.+?\s+--project\s+\S+)", "\\1") | first | default("") }}' - name: Print serial port command failed_when: false From 0cbd0ce8c03af4c50a10e37dee1c5bf5c8a631c9 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 6 Aug 2024 16:12:30 -0700 Subject: [PATCH 111/118] Add setting `allow_automatic_updates: false` to examples --- community/examples/AMD/hpc-amd-slurm.yaml | 2 ++ community/examples/client-google-cloud-storage.yaml | 1 + community/examples/hpc-build-slurm-image.yaml | 1 + community/examples/hpc-slurm-gromacs.yaml | 1 + community/examples/hpc-slurm-local-ssd.yaml | 1 + community/examples/hpc-slurm-ramble-gromacs.yaml | 1 + community/examples/hpc-slurm-sharedvpc.yaml | 2 ++ community/examples/hpc-slurm6-apptainer.yaml | 1 + community/examples/hpc-slurm6-tpu-maxtext.yaml | 1 + community/examples/htc-htcondor.yaml | 2 ++ community/examples/htc-slurm.yaml | 4 ++++ community/examples/omnia-cluster.yaml | 1 + community/examples/tutorial-fluent.yaml | 1 + community/examples/tutorial-starccm.yaml | 1 + examples/cae/cae-slurm.yaml | 2 ++ examples/hcls-blueprint.yaml | 3 +++ examples/hpc-enterprise-slurm.yaml | 7 +++++++ examples/hpc-slurm-static.yaml | 2 ++ examples/hpc-slurm.yaml | 3 +++ examples/image-builder.yaml | 1 + examples/pfs-lustre.yaml | 1 + examples/ps-slurm.yaml | 1 + examples/serverless-batch-mpi.yaml | 4 +++- examples/serverless-batch.yaml | 3 ++- modules/scheduler/batch-job-template/README.md | 2 +- modules/scheduler/batch-job-template/variables.tf | 3 ++- tools/cloud-build/daily-tests/tests/slurm-v6-centos7.yml | 1 + 27 files changed, 49 insertions(+), 4 deletions(-) diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index 0eb1f71571..5decf96a2d 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -179,6 +179,7 @@ deployment_groups: node_count_dynamic_max: 10 bandwidth_tier: gvnic_enabled enable_placement: false + allow_automatic_updates: false - id: low_cost_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -194,6 +195,7 @@ deployment_groups: node_count_dynamic_max: 50 bandwidth_tier: gvnic_enabled enable_placement: true + allow_automatic_updates: false # Because is_default is set 
to true, jobs will run on this partition unless an # alternative partition is specified using, for example, "srun -p lowcost" diff --git a/community/examples/client-google-cloud-storage.yaml b/community/examples/client-google-cloud-storage.yaml index e23abeed3a..b876b0b42b 100644 --- a/community/examples/client-google-cloud-storage.yaml +++ b/community/examples/client-google-cloud-storage.yaml @@ -57,6 +57,7 @@ deployment_groups: settings: name_prefix: workstation machine_type: e2-standard-2 + allow_automatic_updates: false - id: wait source: community/modules/scripts/wait-for-startup diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml index f721019844..491cfb7c65 100644 --- a/community/examples/hpc-build-slurm-image.yaml +++ b/community/examples/hpc-build-slurm-image.yaml @@ -104,6 +104,7 @@ deployment_groups: settings: machine_type: n2d-standard-2 instance_image: $(vars.built_instance_image) + allow_automatic_updates: false - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/hpc-slurm-gromacs.yaml b/community/examples/hpc-slurm-gromacs.yaml index a6ccf8867c..af6b8864b8 100644 --- a/community/examples/hpc-slurm-gromacs.yaml +++ b/community/examples/hpc-slurm-gromacs.yaml @@ -97,6 +97,7 @@ deployment_groups: settings: node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/hpc-slurm-local-ssd.yaml b/community/examples/hpc-slurm-local-ssd.yaml index f2e1b7e8e4..9921a5c621 100644 --- a/community/examples/hpc-slurm-local-ssd.yaml +++ b/community/examples/hpc-slurm-local-ssd.yaml @@ -73,6 +73,7 @@ deployment_groups: machine_type: c2-standard-4 node_count_dynamic_max: 5 node_count_static: 0 + allow_automatic_updates: false - id: partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/hpc-slurm-ramble-gromacs.yaml b/community/examples/hpc-slurm-ramble-gromacs.yaml index 5396c4aef3..523b543c53 100644 --- a/community/examples/hpc-slurm-ramble-gromacs.yaml +++ b/community/examples/hpc-slurm-ramble-gromacs.yaml @@ -118,6 +118,7 @@ deployment_groups: settings: node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/hpc-slurm-sharedvpc.yaml b/community/examples/hpc-slurm-sharedvpc.yaml index 6f18ad040f..827824e432 100644 --- a/community/examples/hpc-slurm-sharedvpc.yaml +++ b/community/examples/hpc-slurm-sharedvpc.yaml @@ -62,6 +62,7 @@ deployment_groups: node_count_dynamic_max: 4 machine_type: n2-standard-2 enable_placement: false # the default is: true + allow_automatic_updates: false - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -77,6 +78,7 @@ deployment_groups: settings: node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/hpc-slurm6-apptainer.yaml b/community/examples/hpc-slurm6-apptainer.yaml index 09a02fa9d4..6848b1b4f0 100644 --- a/community/examples/hpc-slurm6-apptainer.yaml +++ b/community/examples/hpc-slurm6-apptainer.yaml @@ -78,6 +78,7 @@ deployment_groups: instance_image: $(vars.custom_image) instance_image_custom: 
true bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/hpc-slurm6-tpu-maxtext.yaml b/community/examples/hpc-slurm6-tpu-maxtext.yaml index ab88b6f2de..5e172cd5c2 100644 --- a/community/examples/hpc-slurm6-tpu-maxtext.yaml +++ b/community/examples/hpc-slurm6-tpu-maxtext.yaml @@ -100,6 +100,7 @@ deployment_groups: name: ns2 node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index 08281bdaa3..ce93439b67 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -104,6 +104,7 @@ deployment_groups: name_prefix: grp1 instance_image: $(vars.new_image) min_idle: 2 + allow_automatic_updates: false - id: htcondor_execute_point_spot source: community/modules/compute/htcondor-execute-point @@ -117,6 +118,7 @@ deployment_groups: name_prefix: spot instance_image: $(vars.new_image) spot: true + allow_automatic_updates: false - id: htcondor_access source: community/modules/scheduler/htcondor-access-point diff --git a/community/examples/htc-slurm.yaml b/community/examples/htc-slurm.yaml index fb9be4c147..7165923bbb 100644 --- a/community/examples/htc-slurm.yaml +++ b/community/examples/htc-slurm.yaml @@ -80,6 +80,7 @@ deployment_groups: node_count_dynamic_max: 200 bandwidth_tier: gvnic_enabled enable_placement: false + allow_automatic_updates: false - id: compute_nodeset_c2s30 source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset @@ -89,6 +90,7 @@ deployment_groups: machine_type: c2-standard-30 bandwidth_tier: gvnic_enabled enable_placement: false + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -112,6 +114,7 @@ deployment_groups: node_count_dynamic_max: 10 bandwidth_tier: gvnic_enabled enable_placement: false + allow_automatic_updates: false - id: low_cost_nodeset_n2s4 source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset @@ -122,6 +125,7 @@ deployment_groups: node_count_dynamic_max: 10 bandwidth_tier: gvnic_enabled enable_placement: false + allow_automatic_updates: false - id: low_cost_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/community/examples/omnia-cluster.yaml b/community/examples/omnia-cluster.yaml index a54a7d376e..89ecfcc263 100644 --- a/community/examples/omnia-cluster.yaml +++ b/community/examples/omnia-cluster.yaml @@ -89,6 +89,7 @@ deployment_groups: name_prefix: omnia-compute add_deployment_name_before_prefix: true instance_count: 2 + allow_automatic_updates: false # This module simply makes terraform wait until the startup script is complete - id: wait diff --git a/community/examples/tutorial-fluent.yaml b/community/examples/tutorial-fluent.yaml index 23341903b8..0ff70e009e 100644 --- a/community/examples/tutorial-fluent.yaml +++ b/community/examples/tutorial-fluent.yaml @@ -144,6 +144,7 @@ deployment_groups: vm_count: 4 # Note: should match instance count collocation: "COLLOCATED" availability_domain_count: null + allow_automatic_updates: false - id: login source: modules/compute/vm-instance diff --git a/community/examples/tutorial-starccm.yaml b/community/examples/tutorial-starccm.yaml index db86f35518..91a7af09ad 100644 --- a/community/examples/tutorial-starccm.yaml +++ 
b/community/examples/tutorial-starccm.yaml @@ -70,6 +70,7 @@ deployment_groups: vm_count: null collocation: "COLLOCATED" availability_domain_count: null + allow_automatic_updates: false - source: community/modules/scripts/wait-for-startup kind: terraform diff --git a/examples/cae/cae-slurm.yaml b/examples/cae/cae-slurm.yaml index 920d1967ba..a3e9820ab9 100644 --- a/examples/cae/cae-slurm.yaml +++ b/examples/cae/cae-slurm.yaml @@ -163,6 +163,7 @@ deployment_groups: machine_type: h3-standard-88 disk_type: 'pd-balanced' bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: h3_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -181,6 +182,7 @@ deployment_groups: machine_type: c3-highmem-176 disk_type: 'pd-balanced' bandwidth_tier: tier_1_enabled + allow_automatic_updates: false - id: c3_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/examples/hcls-blueprint.yaml b/examples/hcls-blueprint.yaml index 1c77626f66..ee55925236 100644 --- a/examples/hcls-blueprint.yaml +++ b/examples/hcls-blueprint.yaml @@ -265,6 +265,7 @@ deployment_groups: add_deployment_name_before_prefix: true threads_per_core: 2 machine_type: c2-standard-16 + allow_automatic_updates: false - group: cluster modules: @@ -300,6 +301,7 @@ deployment_groups: name: ns node_count_dynamic_max: 20 machine_type: c2-standard-60 + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -317,6 +319,7 @@ deployment_groups: node_count_dynamic_max: 20 machine_type: g2-standard-4 enable_placement: False + allow_automatic_updates: false - id: gpu_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index 21dc9e15f9..3ef0ba990f 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -108,6 +108,7 @@ deployment_groups: machine_type: n2-standard-2 instance_image: $(vars.slurm_image) enable_placement: false # the default is: true + allow_automatic_updates: false - id: n2_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -130,6 +131,7 @@ deployment_groups: bandwidth_tier: tier_1_enabled disk_type: pd-ssd disk_size_gb: 100 + allow_automatic_updates: false # use `-p c2` to submit jobs to this partition: # ex: `srun -p c2 -N 1 hostname` @@ -151,6 +153,7 @@ deployment_groups: bandwidth_tier: tier_1_enabled disk_type: pd-ssd disk_size_gb: 100 + allow_automatic_updates: false - id: c2d_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -168,6 +171,7 @@ deployment_groups: bandwidth_tier: tier_1_enabled disk_type: pd-ssd disk_size_gb: 100 + allow_automatic_updates: false - id: c3_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -191,6 +195,7 @@ deployment_groups: node_conf: SocketsPerBoard: 2 CoresPerSocket: 24 + allow_automatic_updates: false # use `-p a208` to submit jobs to this partition: # ex: `srun -p a208 --gpus-per-node=8 -N 1 nvidia-smi` @@ -220,6 +225,7 @@ deployment_groups: node_conf: SocketsPerBoard: 2 CoresPerSocket: 24 + allow_automatic_updates: false # use `-p a216` to submit jobs to this partition: # ex: `srun -p a216 --gpus-per-node=16 -N 1 nvidia-smi` @@ -246,6 +252,7 @@ deployment_groups: # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_disks disk_type: pd-balanced disk_size_gb: 100 + allow_automatic_updates: false # use `-p h3` to submit jobs to this 
partition: # ex: `srun -p h3 -N 1 hostname` diff --git a/examples/hpc-slurm-static.yaml b/examples/hpc-slurm-static.yaml index 41f2aac52c..fff15e07dc 100644 --- a/examples/hpc-slurm-static.yaml +++ b/examples/hpc-slurm-static.yaml @@ -51,6 +51,7 @@ deployment_groups: reservation_name: $(vars.static_reservation_name) machine_type: $(vars.static_reservation_machine_type) instance_image: $(vars.slurm_instance_image) + allow_automatic_updates: false - id: static_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: [static_nodeset] @@ -66,6 +67,7 @@ deployment_groups: machine_type: c2d-standard-112 node_count_dynamic_max: 100 instance_image: $(vars.slurm_instance_image) + allow_automatic_updates: false - id: dynamic_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: [dynamic_nodeset] diff --git a/examples/hpc-slurm.yaml b/examples/hpc-slurm.yaml index c425c041df..0a90bdcc89 100644 --- a/examples/hpc-slurm.yaml +++ b/examples/hpc-slurm.yaml @@ -47,6 +47,7 @@ deployment_groups: node_count_dynamic_max: 4 machine_type: n2-standard-2 enable_placement: false # the default is: true + allow_automatic_updates: false - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -63,6 +64,7 @@ deployment_groups: settings: node_count_dynamic_max: 20 bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition @@ -82,6 +84,7 @@ deployment_groups: # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_disks disk_type: pd-balanced bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: h3_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/examples/image-builder.yaml b/examples/image-builder.yaml index 6bc7f6161d..63f5d89fbd 100644 --- a/examples/image-builder.yaml +++ b/examples/image-builder.yaml @@ -77,6 +77,7 @@ deployment_groups: instance_image: $(vars.custom_image) instance_image_custom: true bandwidth_tier: gvnic_enabled + allow_automatic_updates: false - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/examples/pfs-lustre.yaml b/examples/pfs-lustre.yaml index 6354ead1b8..1da2de65c9 100644 --- a/examples/pfs-lustre.yaml +++ b/examples/pfs-lustre.yaml @@ -46,3 +46,4 @@ deployment_groups: add_deployment_name_before_prefix: true instance_count: 2 machine_type: n2-standard-2 + allow_automatic_updates: false diff --git a/examples/ps-slurm.yaml b/examples/ps-slurm.yaml index 4a28802924..11c492d6e3 100644 --- a/examples/ps-slurm.yaml +++ b/examples/ps-slurm.yaml @@ -48,6 +48,7 @@ deployment_groups: node_count_dynamic_max: 4 machine_type: c2-standard-60 enable_placement: false # the default is: true + allow_automatic_updates: false - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v6-partition diff --git a/examples/serverless-batch-mpi.yaml b/examples/serverless-batch-mpi.yaml index af156a2a83..9dd329d6b8 100644 --- a/examples/serverless-batch-mpi.yaml +++ b/examples/serverless-batch-mpi.yaml @@ -21,7 +21,7 @@ vars: region: us-central1 zone: us-central1-c instance_image: - family: hpc-centos-7 + family: hpc-rocky-linux-8 project: cloud-hpc-image-public deployment_groups: @@ -147,6 +147,7 @@ deployment_groups: name_prefix: spack-builder add_deployment_name_before_prefix: true machine_type: c2-standard-16 + allow_automatic_updates: false ### Batch Modules ### - id: batch-job @@ -158,6 +159,7 @@ 
deployment_groups: machine_type: c2-standard-60 task_count: 2 mpi_mode: true + allow_automatic_updates: false - id: batch-login source: modules/scheduler/batch-login-node diff --git a/examples/serverless-batch.yaml b/examples/serverless-batch.yaml index 6931077248..538e7d9671 100644 --- a/examples/serverless-batch.yaml +++ b/examples/serverless-batch.yaml @@ -47,8 +47,9 @@ deployment_groups: task_count: 8 task_count_per_node: 4 instance_image: - family: batch-centos-7-official + family: batch-hpc-rocky-linux-8-official project: batch-custom-image + allow_automatic_updates: false - id: batch-login source: modules/scheduler/batch-login-node diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index 345aaf638e..8cf6e6d276 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -157,7 +157,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `false` | no | +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment, used for the job\_id | `string` | n/a | yes | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true, instances will have public IPs | `bool` | `true` | no | | [gcloud\_version](#input\_gcloud\_version) | The version of the gcloud cli being used. Used for output instructions. Valid inputs are `"alpha"`, `"beta"` and "" (empty string for default version) | `string` | `""` | no | diff --git a/modules/scheduler/batch-job-template/variables.tf b/modules/scheduler/batch-job-template/variables.tf index da214e29ea..bfce75666e 100644 --- a/modules/scheduler/batch-job-template/variables.tf +++ b/modules/scheduler/batch-job-template/variables.tf @@ -235,5 +235,6 @@ variable "allow_automatic_updates" { https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates EOT type = bool - default = false + default = true + nullable = false } diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-centos7.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-centos7.yml index dcc3d41bc8..54043e0beb 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v6-centos7.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v6-centos7.yml @@ -25,6 +25,7 @@ cli_deployment_vars: slurm_image: "{family: slurm-gcp-6-6-hpc-centos-7, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c + allow_automatic_updates: false zone: us-west4-c workspace: /workspace From b0789b842bc4e3a16271f0a32f884b9d04002621 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 8 Aug 2024 18:09:50 -0700 Subject: [PATCH 112/118] Build is taking longer, extend timeout so test will pass while investigating --- tools/cloud-build/daily-tests/builds/batch-mpi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/builds/batch-mpi.yaml b/tools/cloud-build/daily-tests/builds/batch-mpi.yaml index fd29f37721..7595534a18 100644 --- a/tools/cloud-build/daily-tests/builds/batch-mpi.yaml +++ b/tools/cloud-build/daily-tests/builds/batch-mpi.yaml @@ -56,7 +56,7 @@ steps: echo ' source: community/modules/scripts/wait-for-startup' >> $${SG_EXAMPLE} echo ' settings:' >> $${SG_EXAMPLE} echo ' instance_name: $(spack-builder.name[0])' >> $${SG_EXAMPLE} - echo ' timeout: 2400' >> $${SG_EXAMPLE} + echo ' timeout: 10800' >> $${SG_EXAMPLE} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ From 19407839b92bcad03a41d9215fed4163d812f42c Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Mon, 12 Aug 2024 14:15:16 -0700 Subject: [PATCH 113/118] Bump version from v1.37.2 to v1.38.0 --- cmd/root.go | 2 +- community/modules/compute/htcondor-execute-point/versions.tf | 2 +- community/modules/compute/mig/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-node-group/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-partition/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf | 2 +- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-partition/versions.tf | 2 +- .../modules/database/slurm-cloudsql-federation/versions.tf | 4 ++-- .../modules/file-system/cloud-storage-bucket/versions.tf | 2 +- 
community/modules/file-system/nfs-server/versions.tf | 2 +- community/modules/files/fsi-montecarlo-on-batch/versions.tf | 4 ++-- community/modules/network/private-service-access/versions.tf | 4 ++-- community/modules/project/service-enablement/versions.tf | 2 +- community/modules/pubsub/bigquery-sub/versions.tf | 4 ++-- community/modules/pubsub/topic/versions.tf | 2 +- community/modules/scheduler/htcondor-access-point/versions.tf | 2 +- .../modules/scheduler/htcondor-central-manager/versions.tf | 2 +- community/modules/scheduler/htcondor-pool-secrets/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v5-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf | 2 +- community/modules/scripts/wait-for-startup/versions.tf | 2 +- community/modules/scripts/windows-startup-script/versions.tf | 2 +- modules/compute/gke-node-pool/versions.tf | 2 +- modules/compute/vm-instance/versions.tf | 4 ++-- modules/file-system/filestore/versions.tf | 4 ++-- modules/file-system/gke-persistent-volume/versions.tf | 2 +- modules/monitoring/dashboard/versions.tf | 2 +- modules/network/firewall-rules/versions.tf | 2 +- modules/network/pre-existing-subnetwork/versions.tf | 2 +- modules/network/pre-existing-vpc/versions.tf | 2 +- modules/scheduler/batch-login-node/versions.tf | 2 +- modules/scheduler/gke-cluster/versions.tf | 2 +- modules/scheduler/pre-existing-gke-cluster/versions.tf | 2 +- modules/scripts/startup-script/versions.tf | 2 +- 38 files changed, 44 insertions(+), 44 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index 3060bfbb26..106be3bdf8 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -53,7 +53,7 @@ HPC deployments on the Google Cloud Platform.`, logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.37.2", + Version: "v1.38.0", Annotations: annotation, } ) diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index 66bf606a9c..4e4e500f30 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.38.0" } } diff --git a/community/modules/compute/mig/versions.tf b/community/modules/compute/mig/versions.tf index abad3f3258..2690b53cb0 100644 --- a/community/modules/compute/mig/versions.tf +++ b/community/modules/compute/mig/versions.tf @@ -22,6 +22,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:mig/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:mig/v1.38.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index 5e1e5dfb58..e35c55bf3b 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.38.0" } required_version = ">= 1.1" } diff --git 
a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index eaff029d6f..e4947c1420 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.38.0" } required_version = ">= 0.13.0" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf index 715e688b91..cc616cd258 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.38.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index 82577362ea..783ba8e39a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.38.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index 65df8a47a5..38330af5d0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.38.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index f0044bae59..f0ea4295ce 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.38.0" } } diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 647d481d84..37480f7cb9 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.38.0" } 
provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.38.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index f65554b99f..d649bf0ea0 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.38.0" } required_version = ">= 0.14.0" } diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index 884a0392f5..0d08aa7deb 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.38.0" } required_version = ">= 0.14.0" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index e0cfdd2c90..d3e1124ef4 100644 --- a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.38.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.38.0" } } diff --git a/community/modules/network/private-service-access/versions.tf b/community/modules/network/private-service-access/versions.tf index 69a428de51..3569a93f37 100644 --- a/community/modules/network/private-service-access/versions.tf +++ b/community/modules/network/private-service-access/versions.tf @@ -30,11 +30,11 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.38.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.38.0" } required_version = ">= 1.2" diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index f88bf81d4e..c32a7e9ca6 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.38.0" } required_version = ">= 0.14.0" diff --git a/community/modules/pubsub/bigquery-sub/versions.tf b/community/modules/pubsub/bigquery-sub/versions.tf index 2c7af3661c..d59dc83874 100644 --- a/community/modules/pubsub/bigquery-sub/versions.tf +++ 
b/community/modules/pubsub/bigquery-sub/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.38.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.38.0" } required_version = ">= 1.0" } diff --git a/community/modules/pubsub/topic/versions.tf b/community/modules/pubsub/topic/versions.tf index 94ef21e400..e9c1a1d319 100644 --- a/community/modules/pubsub/topic/versions.tf +++ b/community/modules/pubsub/topic/versions.tf @@ -27,6 +27,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:topic/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:topic/v1.38.0" } } diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index 3e772ca894..4473fa1c46 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.38.0" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index f194c92523..60bdd4f8ac 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.38.0" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 7e1c94c47b..254362717d 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.38.0" } required_version = ">= 1.3.0" diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index 3d833fcefb..e3513e58be 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.38.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index 03e05d22ab..3c5bb6bf5d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.38.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf index 23acd9c1d9..7ab1c46f14 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf @@ -30,6 +30,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.38.0" } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf index 6f4be07223..59e73842cf 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.38.0" } } diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index ddc56c2ff4..be6f5e82c9 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.38.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index 20fa9b69fc..e4e02e4151 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.38.0" } required_version = ">= 0.14.0" diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf index 2a670f5fa0..8905a94eea 100644 --- a/modules/compute/gke-node-pool/versions.tf +++ b/modules/compute/gke-node-pool/versions.tf @@ -26,6 +26,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.38.0" } } diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 043771dc38..1b46a4e5e1 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.38.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.37.2" + 
module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.38.0" } required_version = ">= 1.3.0" diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index 2252e019ef..c85733d7e1 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.38.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.38.0" } required_version = ">= 0.14.0" diff --git a/modules/file-system/gke-persistent-volume/versions.tf b/modules/file-system/gke-persistent-volume/versions.tf index 88a415b5a5..adb28ea217 100644 --- a/modules/file-system/gke-persistent-volume/versions.tf +++ b/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.38.0" } } diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index c1bb467e0c..5f97cdab1b 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.38.0" } required_version = ">= 0.14.0" diff --git a/modules/network/firewall-rules/versions.tf b/modules/network/firewall-rules/versions.tf index ac5639e517..485548fdc3 100644 --- a/modules/network/firewall-rules/versions.tf +++ b/modules/network/firewall-rules/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.38.0" } required_version = ">= 1.3" diff --git a/modules/network/pre-existing-subnetwork/versions.tf b/modules/network/pre-existing-subnetwork/versions.tf index d96cb9c70c..d3524d92f0 100644 --- a/modules/network/pre-existing-subnetwork/versions.tf +++ b/modules/network/pre-existing-subnetwork/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.38.0" } required_version = ">= 0.14.0" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index 411db01183..9d9a57638f 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.38.0" } required_version = ">= 0.14.0" diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf index c441a2017d..d7a9b6cb1b 100644 --- a/modules/scheduler/batch-login-node/versions.tf +++ b/modules/scheduler/batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = 
"blueprints/terraform/hpc-toolkit:batch-login-node/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.38.0" } required_version = ">= 0.14.0" diff --git a/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf index 56db91d4db..6b313dc61b 100644 --- a/modules/scheduler/gke-cluster/versions.tf +++ b/modules/scheduler/gke-cluster/versions.tf @@ -30,6 +30,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.38.0" } } diff --git a/modules/scheduler/pre-existing-gke-cluster/versions.tf b/modules/scheduler/pre-existing-gke-cluster/versions.tf index 827bfc63a9..10feac6ae6 100644 --- a/modules/scheduler/pre-existing-gke-cluster/versions.tf +++ b/modules/scheduler/pre-existing-gke-cluster/versions.tf @@ -23,7 +23,7 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.38.0" } required_version = ">= 1.0.0" diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index 23417dc1de..2c31eb6231 100644 --- a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.37.2" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.38.0" } required_version = ">= 1.3" From 2127e3ed936f6622c611664ed8567e754c2a57db Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 8 Aug 2024 19:22:41 +0000 Subject: [PATCH 114/118] Fix bug of suppliying different `instance_properties` --- .../schedmd-slurm-gcp-v6-nodeset/main.tf | 2 +- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/main.tf | 8 ++++-- .../partition.tf | 28 +++++++++---------- .../variables.tf | 14 +++++----- 5 files changed, 29 insertions(+), 25 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 099920990a..491ea64419 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -97,7 +97,7 @@ locals { termination_action = try(var.spot_instance_config.termination_action, null) reservation_name = local.reservation_name maintenance_interval = var.maintenance_interval - instance_properties = var.instance_properties + instance_properties_json = jsonencode(var.instance_properties) zone_target_shape = var.zone_target_shape zone_policy_allow = local.zones diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 96c9f41272..dc16e79894 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -273,7 +273,7 @@ limitations under the License. | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties = optional(any, null)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index 7e7f39fa0f..c25748dc48 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -67,8 +67,12 @@ locals { epilog_scripts = [for k, v in google_storage_bucket_object.epilog_scripts : k] cloud_parameters = var.cloud_parameters - partitions = { for p in var.partitions : p.partition_name => p } - nodeset = { for n in var.nodeset : n.nodeset_name => n } + partitions = { for p in var.partitions : p.partition_name => p } + nodeset = { + for n in var.nodeset : n.nodeset_name => merge(n, { + instance_properties = jsondecode(n.instance_properties_json) + }) + } nodeset_dyn = { for n in var.nodeset_dyn : n.nodeset_name => n } nodeset_tpu = { for n in var.nodeset_tpu[*].nodeset : n.nodeset_name => n } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index d2d85db1fd..d1f783f64a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -68,20 +68,20 @@ module "slurm_nodeset_template" { locals { nodesets = [for name, ns in local.nodeset_map : { - nodeset_name = ns.nodeset_name - node_conf = ns.node_conf - instance_template = module.slurm_nodeset_template[ns.nodeset_name].self_link - node_count_dynamic_max = ns.node_count_dynamic_max - node_count_static = ns.node_count_static - subnetwork = ns.subnetwork_self_link - reservation_name = ns.reservation_name - maintenance_interval = ns.maintenance_interval - instance_properties = ns.instance_properties - enable_placement = ns.enable_placement - network_storage = ns.network_storage - zone_target_shape = ns.zone_target_shape - zone_policy_allow = ns.zone_policy_allow - zone_policy_deny = ns.zone_policy_deny + nodeset_name = ns.nodeset_name + node_conf = ns.node_conf + instance_template = module.slurm_nodeset_template[ns.nodeset_name].self_link + node_count_dynamic_max = ns.node_count_dynamic_max + node_count_static = ns.node_count_static + subnetwork = ns.subnetwork_self_link + reservation_name = ns.reservation_name + maintenance_interval = ns.maintenance_interval + instance_properties_json = ns.instance_properties_json + enable_placement = ns.enable_placement + network_storage = ns.network_storage + zone_target_shape = ns.zone_target_shape + zone_policy_allow = ns.zone_policy_allow + zone_policy_deny = ns.zone_policy_deny }] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 44ed33f994..ecdb4b22c3 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -211,13 +211,13 @@ variable "nodeset" { count = number type = string })) - labels = optional(map(string), {}) - machine_type = optional(string) - maintenance_interval = optional(string) - instance_properties = optional(any, null) - metadata = optional(map(string), {}) - min_cpu_platform = optional(string) 
- network_tier = optional(string, "STANDARD") + labels = optional(map(string), {}) + machine_type = optional(string) + maintenance_interval = optional(string) + instance_properties_json = string + metadata = optional(map(string), {}) + min_cpu_platform = optional(string) + network_tier = optional(string, "STANDARD") network_storage = optional(list(object({ server_ip = string remote_mount = string From 9b72392382308611c7cd99184a0f56af469df41f Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 13 Aug 2024 19:24:01 -0700 Subject: [PATCH 115/118] Merge pull request #2885 from GoogleCloudPlatform/dependabot/pip/community/front-end/ofe/django-4.2.15 Bump django from 4.2.11 to 4.2.15 in /community/front-end/ofe --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index a1db6ad4b0..f2764643c5 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -19,7 +19,7 @@ dill==0.3.6 distlib==0.3.6 # django-revproxy==0.11.0 released but not yet in pypi git+https://github.com/jazzband/django-revproxy.git@d2234005135dc0771b7c4e0bb0465664ccfa5787 -Django==4.2.14 +Django==4.2.15 django-allauth==0.54.0 django-extensions==3.2.3 djangorestframework==3.15.2 From feca2052c8b14517482a3bb949e2cf6e70c6c1d4 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Wed, 14 Aug 2024 09:48:26 -0700 Subject: [PATCH 116/118] Update `grpcio` to 1.56.2 and `requests` to 2.32.0 --- .../modules/slurm_files/scripts/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt index 3783cd6977..2e8611314b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt @@ -10,9 +10,9 @@ google-cloud-storage==2.10.0 google-cloud-tpu==1.10.0 google-resumable-media==2.5.0 googleapis-common-protos==1.59.1 -grpcio==1.56.0 +grpcio==1.56.2 grpcio-status==1.56.0 httplib2==0.22.0 more-executors==2.11.4 pyyaml==6.0 -requests==2.31.0 +requests==2.32.0 From fa6eb37306e6b46c987cd11773aeba52194b13db Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 15 Aug 2024 11:33:13 -0700 Subject: [PATCH 117/118] Roll back changes in go.mod to release v1.37.2 --- go.mod | 33 ++++++++++++++-------------- go.sum | 68 +++++++++++++++++++++++++++++----------------------------- 2 files changed, 51 insertions(+), 50 deletions(-) diff --git a/go.mod b/go.mod index 1c67acf666..7a7435fb98 100644 --- a/go.mod +++ b/go.mod @@ -13,9 +13,9 @@ require ( github.com/pkg/errors v0.9.1 github.com/spf13/afero v1.11.0 github.com/spf13/cobra v1.8.1 - github.com/zclconf/go-cty v1.15.0 + github.com/zclconf/go-cty v1.14.4 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa - google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf // indirect + google.golang.org/genproto v0.0.0-20240617180043-68d350f18fd4 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) @@ -27,20 +27,20 @@ require ( github.com/hashicorp/terraform-exec v0.21.0 github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940 - google.golang.org/api v0.190.0 + 
google.golang.org/api v0.186.0 ) require ( - cloud.google.com/go/auth v0.7.3 // indirect - cloud.google.com/go/auth/oauth2adapt v0.2.3 // indirect + cloud.google.com/go/auth v0.6.0 // indirect + cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect dario.cat/mergo v1.0.0 // indirect github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect github.com/cyphar/filepath-securejoin v0.2.4 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/felixge/httpsnoop v1.0.4 // indirect - github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/logr v1.4.1 // indirect github.com/go-logr/stdr v1.2.2 // indirect - github.com/googleapis/gax-go/v2 v2.13.0 // indirect + github.com/googleapis/gax-go/v2 v2.12.5 // indirect github.com/hashicorp/terraform-json v0.22.1 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect @@ -54,14 +54,14 @@ require ( golang.org/x/sync v0.7.0 // indirect golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20240725223205-93522f1f2a9f // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240610135401-a8a62080eff3 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240617180043-68d350f18fd4 // indirect ) require ( cloud.google.com/go v0.115.0 // indirect - cloud.google.com/go/compute/metadata v0.5.0 // indirect - cloud.google.com/go/iam v1.1.12 // indirect + cloud.google.com/go/compute/metadata v0.3.0 // indirect + cloud.google.com/go/iam v1.1.8 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect github.com/ProtonMail/go-crypto v1.1.0-alpha.2 // indirect github.com/agext/levenshtein v1.2.3 @@ -71,7 +71,8 @@ require ( github.com/emirpasic/gods v1.18.1 // indirect github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect - github.com/google/s2a-go v0.1.8 // indirect + github.com/golang/protobuf v1.5.4 // indirect + github.com/google/s2a-go v0.1.7 // indirect github.com/google/uuid v1.6.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect github.com/hashicorp/go-cleanhttp v0.5.2 // indirect @@ -94,12 +95,12 @@ require ( github.com/ulikunitz/xz v0.5.10 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.25.0 // indirect - golang.org/x/net v0.27.0 // indirect + golang.org/x/crypto v0.24.0 // indirect + golang.org/x/net v0.26.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect - golang.org/x/sys v0.23.0 + golang.org/x/sys v0.21.0 golang.org/x/text v0.16.0 // indirect - google.golang.org/grpc v1.64.1 // indirect + google.golang.org/grpc v1.64.0 // indirect google.golang.org/protobuf v1.34.2 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect ) diff --git a/go.sum b/go.sum index 3c5627ba31..0c9ffbf1dc 100644 --- a/go.sum +++ b/go.sum @@ -46,10 +46,10 @@ cloud.google.com/go/asset v1.8.0/go.mod h1:mUNGKhiqIdbr8X7KNayoYvyc4HbbFO9URsjby cloud.google.com/go/assuredworkloads v1.5.0/go.mod h1:n8HOZ6pff6re5KYfBXcFvSViQjDwxFkAkmUFffJRbbY= cloud.google.com/go/assuredworkloads v1.6.0/go.mod h1:yo2YOk37Yc89Rsd5QMVECvjaMKymF9OP+QXWlKXUkXw= cloud.google.com/go/assuredworkloads v1.7.0/go.mod h1:z/736/oNmtGAyU47reJgGN+KVoYoxeLBoj4XkKYscNI= 
-cloud.google.com/go/auth v0.7.3 h1:98Vr+5jMaCZ5NZk6e/uBgf60phTk/XN84r8QEWB9yjY= -cloud.google.com/go/auth v0.7.3/go.mod h1:HJtWUx1P5eqjy/f6Iq5KeytNpbAcGolPhOgyop2LlzA= -cloud.google.com/go/auth/oauth2adapt v0.2.3 h1:MlxF+Pd3OmSudg/b1yZ5lJwoXCEaeedAguodky1PcKI= -cloud.google.com/go/auth/oauth2adapt v0.2.3/go.mod h1:tMQXOfZzFuNuUxOypHlQEXgdfX5cuhwU+ffUuXRJE8I= +cloud.google.com/go/auth v0.6.0 h1:5x+d6b5zdezZ7gmLWD1m/xNjnaQ2YDhmIz/HH3doy1g= +cloud.google.com/go/auth v0.6.0/go.mod h1:b4acV+jLQDyjwm4OXHYjNvRi4jvGBzHWJRtJcy+2P4g= +cloud.google.com/go/auth/oauth2adapt v0.2.2 h1:+TTV8aXpjeChS9M+aTtN/TjdQnzJvmzKFt//oWu7HX4= +cloud.google.com/go/auth/oauth2adapt v0.2.2/go.mod h1:wcYjgpZI9+Yu7LyYBg4pqSiaRkfEK3GQcpb7C/uyF1Q= cloud.google.com/go/automl v1.5.0/go.mod h1:34EjfoFGMZ5sgJ9EoLsRtdPSNZLcfflJR39VbVNS2M0= cloud.google.com/go/automl v1.6.0/go.mod h1:ugf8a6Fx+zP0D59WLhqgTDsQI9w07o64uf/Is3Nh5p8= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= @@ -72,8 +72,8 @@ cloud.google.com/go/compute v1.6.0/go.mod h1:T29tfhtVbq1wvAPo0E3+7vhgmkOYeXjhFvz cloud.google.com/go/compute v1.6.1/go.mod h1:g85FgpzFvNULZ+S8AYq87axRKuf2Kh7deLqV/jJ3thU= cloud.google.com/go/compute v1.7.0/go.mod h1:435lt8av5oL9P3fv1OEzSbSUe+ybHXGMPQHHZWZxy9U= cloud.google.com/go/compute v1.10.0/go.mod h1:ER5CLbMxl90o2jtNbGSbtfOpQKR0t15FOtRsugnLrlU= -cloud.google.com/go/compute/metadata v0.5.0 h1:Zr0eK8JbFv6+Wi4ilXAR8FJ3wyNdpxHKJNPos6LTZOY= -cloud.google.com/go/compute/metadata v0.5.0/go.mod h1:aHnloV2TPI38yx4s9+wAZhHykWvVCfu7hQbF+9CWoiY= +cloud.google.com/go/compute/metadata v0.3.0 h1:Tz+eQXMEqDIKRsmY3cHTL6FVaynIjX2QxYC4trgAKZc= +cloud.google.com/go/compute/metadata v0.3.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k= cloud.google.com/go/containeranalysis v0.5.1/go.mod h1:1D92jd8gRR/c0fGMlymRgxWD3Qw9C1ff6/T7mLgVL8I= cloud.google.com/go/containeranalysis v0.6.0/go.mod h1:HEJoiEIu+lEXM+k7+qLCci0h33lX3ZqoYFdmPcoO7s4= cloud.google.com/go/datacatalog v1.3.0/go.mod h1:g9svFY6tuR+j+hrTw3J2dNcmI0dzmSiyOzm8kpLq0a0= @@ -111,8 +111,8 @@ cloud.google.com/go/gkehub v0.10.0/go.mod h1:UIPwxI0DsrpsVoWpLB0stwKCP+WFVG9+y97 cloud.google.com/go/grafeas v0.2.0/go.mod h1:KhxgtF2hb0P191HlY5besjYm6MqTSTj3LSI+M+ByZHc= cloud.google.com/go/iam v0.3.0/go.mod h1:XzJPvDayI+9zsASAFO68Hk07u3z+f+JrT2xXNdp4bnY= cloud.google.com/go/iam v0.5.0/go.mod h1:wPU9Vt0P4UmCux7mqtRu6jcpPAb74cP1fh50J3QpkUc= -cloud.google.com/go/iam v1.1.12 h1:JixGLimRrNGcxvJEQ8+clfLxPlbeZA6MuRJ+qJNQ5Xw= -cloud.google.com/go/iam v1.1.12/go.mod h1:9LDX8J7dN5YRyzVHxwQzrQs9opFFqn0Mxs9nAeB+Hhg= +cloud.google.com/go/iam v1.1.8 h1:r7umDwhj+BQyz0ScZMp4QrGXjSTI3ZINnpgU2nlB/K0= +cloud.google.com/go/iam v1.1.8/go.mod h1:GvE6lyMmfxXauzNq8NbgJbeVQNspG+tcdL/W8QO1+zE= cloud.google.com/go/language v1.4.0/go.mod h1:F9dRpNFQmJbkaop6g0JhSBXCNlO90e1KWx5iDdxbWic= cloud.google.com/go/language v1.6.0/go.mod h1:6dJ8t3B+lUYfStgls25GusK04NLh3eDLQnWM3mdEbhI= cloud.google.com/go/lifesciences v0.5.0/go.mod h1:3oIKy8ycWGPUyZDR/8RNnTOYevhaMLqh5vLUXs9zvT8= @@ -271,8 +271,8 @@ github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9 github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr 
v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
+github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ=
+github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
 github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
 github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
 github.com/go-test/deep v1.0.3 h1:ZrJSEWsXzPOxaZnFteGEfooLba+ju3FYIbOrS+rQd68=
@@ -353,8 +353,8 @@ github.com/google/pprof v0.0.0-20210601050228-01bbb1931b22/go.mod h1:kpwsk12EmLe
 github.com/google/pprof v0.0.0-20210609004039-a478d1d731e9/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE=
 github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE=
 github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI=
-github.com/google/s2a-go v0.1.8 h1:zZDs9gcbt9ZPLV0ndSyQk6Kacx2g/X+SKYovpnz3SMM=
-github.com/google/s2a-go v0.1.8/go.mod h1:6iNWHTpQ+nfNRN5E00MSdfDwVesa8hhS32PhPO8deJA=
+github.com/google/s2a-go v0.1.7 h1:60BLSyTrOV4/haCDW4zb1guZItoSq8foHCXrAnjBo/o=
+github.com/google/s2a-go v0.1.7/go.mod h1:50CgR4k1jNlWBu4UfS4AcfhVe1r6pdZPygJ3R8F0Qdw=
 github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
@@ -373,8 +373,8 @@ github.com/googleapis/gax-go/v2 v2.3.0/go.mod h1:b8LNqSzNabLiUpXKkY7HAR5jr6bIT99
 github.com/googleapis/gax-go/v2 v2.4.0/go.mod h1:XOTVJ59hdnfJLIP/dh8n5CGryZR2LxK9wbMD5+iXC6c=
 github.com/googleapis/gax-go/v2 v2.5.1/go.mod h1:h6B0KMMFNtI2ddbGJn3T3ZbwkeT6yqEF02fYlzkUCyo=
 github.com/googleapis/gax-go/v2 v2.6.0/go.mod h1:1mjbznJAPHFpesgE5ucqfYEscaz5kMdcIDwU/6+DDoY=
-github.com/googleapis/gax-go/v2 v2.13.0 h1:yitjD5f7jQHhyDsnhKEBU52NdvvdSeGzlAnDPT0hH1s=
-github.com/googleapis/gax-go/v2 v2.13.0/go.mod h1:Z/fvTZXF8/uw7Xu5GuslPw+bplx6SS338j1Is2S+B7A=
+github.com/googleapis/gax-go/v2 v2.12.5 h1:8gw9KZK8TiVKB6q3zHY3SBzLnrGp6HQjyfYBYGmXdxA=
+github.com/googleapis/gax-go/v2 v2.12.5/go.mod h1:BUDKcWo+RaKq5SC9vVYL0wLADa3VcfswbOMMRmB9H3E=
 github.com/googleapis/go-type-adapters v1.0.0/go.mod h1:zHW75FOG2aur7gAO2B+MLby+cLsWGBF62rFAi7WjWO4=
 github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw=
 github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ=
@@ -496,8 +496,8 @@ github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de
 github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
 github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
-github.com/zclconf/go-cty v1.15.0 h1:tTCRWxsexYUmtt/wVxgDClUe+uQusuI443uL6e+5sXQ=
-github.com/zclconf/go-cty v1.15.0/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE=
+github.com/zclconf/go-cty v1.14.4 h1:uXXczd9QDGsgu0i/QFR/hzI5NYCHLf6NQw/atrbnhq8=
+github.com/zclconf/go-cty v1.14.4/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE=
 github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940 h1:4r45xpDWB6ZMSMNJFMOjqrGHynW3DIBuR2H9j0ug+Mo=
 github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940/go.mod h1:CmBdvvj3nqzfzJ6nTCIwDTPZ56aVGvDrmztiO5g3qrM=
 go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
@@ -529,8 +529,8 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
 golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
-golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30=
-golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M=
+golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI=
+golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
@@ -619,8 +619,8 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug
 golang.org/x/net v0.0.0-20220909164309-bea034e7d591/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
 golang.org/x/net v0.0.0-20221014081412-f15817d10f9b/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
 golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco=
-golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys=
-golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE=
+golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
+golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
 golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
 golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
@@ -732,13 +732,13 @@ golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBc
 golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM=
-golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
+golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
 golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
-golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk=
-golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4=
+golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA=
+golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0=
 golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
@@ -868,8 +868,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ
 google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s=
 google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s=
 google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70=
-google.golang.org/api v0.190.0 h1:ASM+IhLY1zljNdLu19W1jTmU6A+gMk6M46Wlur61s+Q=
-google.golang.org/api v0.190.0/go.mod h1:QIr6I9iedBLnfqoD6L6Vze1UvS5Hzj5r2aUBOaZnLHo=
+google.golang.org/api v0.186.0 h1:n2OPp+PPXX0Axh4GuSsL5QL8xQCTb2oDwyzPnQvqUug=
+google.golang.org/api v0.186.0/go.mod h1:hvRbBmgoje49RV3xqVXrmP6w93n6ehGgIVPYrGtBFFc=
 google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
 google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
 google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
@@ -978,12 +978,12 @@ google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqw
 google.golang.org/genproto v0.0.0-20221014173430-6e2ab493f96b/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM=
 google.golang.org/genproto v0.0.0-20221014213838-99cd37c6964a/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM=
 google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71/go.mod h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s=
-google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf h1:OqdXDEakZCVtDiZTjcxfwbHPCT11ycCEsTKesBVKvyY=
-google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf/go.mod h1:mCr1K1c8kX+1iSBREvU3Juo11CB+QOEWxbRS01wWl5M=
-google.golang.org/genproto/googleapis/api v0.0.0-20240725223205-93522f1f2a9f h1:b1Ln/PG8orm0SsBbHZWke8dDp2lrCD4jSmfglFpTZbk=
-google.golang.org/genproto/googleapis/api v0.0.0-20240725223205-93522f1f2a9f/go.mod h1:AHT0dDg3SoMOgZGnZk29b5xTbPHMoEC8qthmBLJCpys=
-google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf h1:liao9UHurZLtiEwBgT9LMOnKYsHze6eA6w1KQCMVN2Q=
-google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY=
+google.golang.org/genproto v0.0.0-20240617180043-68d350f18fd4 h1:CUiCqkPw1nNrNQzCCG4WA65m0nAmQiwXHpub3dNyruU=
+google.golang.org/genproto v0.0.0-20240617180043-68d350f18fd4/go.mod h1:EvuUDCulqGgV80RvP1BHuom+smhX4qtlhnNatHuroGQ=
+google.golang.org/genproto/googleapis/api v0.0.0-20240610135401-a8a62080eff3 h1:QW9+G6Fir4VcRXVH8x3LilNAb6cxBGLa6+GM4hRwexE=
+google.golang.org/genproto/googleapis/api v0.0.0-20240610135401-a8a62080eff3/go.mod h1:kdrSS/OiLkPrNUpzD4aHgCq2rVuC/YRxok32HXZ4vRE=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20240617180043-68d350f18fd4 h1:Di6ANFilr+S60a4S61ZM00vLdw0IrQOSMS2/6mrnOU0=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20240617180043-68d350f18fd4/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY=
 google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
 google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38=
 google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM=
@@ -1019,8 +1019,8 @@ google.golang.org/grpc v1.48.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACu
 google.golang.org/grpc v1.49.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI=
 google.golang.org/grpc v1.50.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI=
 google.golang.org/grpc v1.50.1/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI=
-google.golang.org/grpc v1.64.1 h1:LKtvyfbX3UGVPFcGqJ9ItpVWW6oN/2XqTxfAnwRRXiA=
-google.golang.org/grpc v1.64.1/go.mod h1:hiQF4LFZelK2WKaP6W0L92zGHtiQdZxk8CrSdvyjeP0=
+google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY=
+google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg=
 google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw=
 google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
 google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=

From da0a13322c1a44c02f41725b2e6557fed82c1c0c Mon Sep 17 00:00:00 2001
From: Tom Downes
Date: Thu, 15 Aug 2024 14:09:41 -0500
Subject: [PATCH 118/118] Update permissions for local SSD mounts to allow
 non-superuser write access

---
 community/examples/hpc-slurm-local-ssd-v5-legacy.yaml         | 2 +-
 community/examples/hpc-slurm-local-ssd.yaml                   | 2 +-
 .../machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml | 2 +-
 .../machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/community/examples/hpc-slurm-local-ssd-v5-legacy.yaml b/community/examples/hpc-slurm-local-ssd-v5-legacy.yaml
index a10116544d..08e39819b7 100644
--- a/community/examples/hpc-slurm-local-ssd-v5-legacy.yaml
+++ b/community/examples/hpc-slurm-local-ssd-v5-legacy.yaml
@@ -48,7 +48,7 @@ deployment_groups:
       local_ssd_filesystem:
         fs_type: ext4
         mountpoint: /mnt/localssd
-        permissions: "0755" # must quote numeric filesystem permissions!
+        permissions: "1777" # must quote numeric filesystem permissions!
 
   - id: compute_node_group
     source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
diff --git a/community/examples/hpc-slurm-local-ssd.yaml b/community/examples/hpc-slurm-local-ssd.yaml
index 9921a5c621..c41c332942 100644
--- a/community/examples/hpc-slurm-local-ssd.yaml
+++ b/community/examples/hpc-slurm-local-ssd.yaml
@@ -48,7 +48,7 @@ deployment_groups:
       local_ssd_filesystem:
         fs_type: ext4
         mountpoint: /mnt/localssd
-        permissions: "0755" # must quote numeric filesystem permissions!
+        permissions: "1777" # must quote numeric filesystem permissions!
 
   - id: nodeset
     source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml
index 364b66f10f..e252773c98 100644
--- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml
+++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml
@@ -115,7 +115,7 @@ deployment_groups:
       # mount local SSD filesystems
       local_ssd_filesystem:
         mountpoint: /mnt/localssd
-        permissions: 1777
+        permissions: "1777" # must quote numeric filesystem permissions!
       runners:
       - type: ansible-local
         destination: enable_nvidia_dcgm.yml
diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml
index 5dae64cb67..5fcd60f353 100644
--- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml
+++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml
@@ -87,7 +87,7 @@ deployment_groups:
       # mount local SSD filesystems
       local_ssd_filesystem:
         mountpoint: /mnt/localssd
-        permissions: "0755" # must quote numeric filesystem permissions!
+        permissions: "1777" # must quote numeric filesystem permissions!
       runners:
       - type: shell
         destination: setup_aperture.sh