Skip to content

Commit

Permalink
Merge pull request GoogleCloudPlatform#1939 from GoogleCloudPlatform/…
Browse files Browse the repository at this point in the history
…release-candidate

Release v1.25.0
  • Loading branch information
mr0re1 authored Nov 7, 2023
2 parents e64f027 + a5ceeae commit 3abddcf
Show file tree
Hide file tree
Showing 181 changed files with 6,226 additions and 908 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

# Built Binary
ghpc
# Expand artifact
expanded.yaml
# macOS Desktop Services Store
.DS_Store
# workspace level vscode settings
Expand Down
18 changes: 15 additions & 3 deletions cmd/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ func printAdvancedInstructionsMessage(deplDir string) {
func expandOrDie(path string) config.DeploymentConfig {
dc, ctx, err := config.NewDeploymentConfig(path)
if err != nil {
log.Fatal(err)
log.Fatal(renderError(err, ctx))
}
// Set properties from CLI
if err := setCLIVariables(&dc.Config, cliVariables); err != nil {
Expand Down Expand Up @@ -193,11 +193,23 @@ func renderError(err error, ctx config.YamlCtx) string {
}

func renderRichError(err error, pos config.Pos, ctx config.YamlCtx) string {
line := pos.Line - 1
if line < 0 {
line = 0
}
if line >= len(ctx.Lines) {
line = len(ctx.Lines) - 1
}

pref := fmt.Sprintf("%d: ", pos.Line)
arrow := strings.Repeat(" ", len(pref)+pos.Column-1) + "^"
arrow := " "
if pos.Column > 0 {
spaces := strings.Repeat(" ", len(pref)+pos.Column-1)
arrow = spaces + "^"
}
return fmt.Sprintf(`Error: %s
%s%s
%s`, err, pref, ctx.Lines[pos.Line-1], arrow)
%s`, err, pref, ctx.Lines[line], arrow)
}

func setCLIVariables(bp *config.Blueprint, s []string) error {
Expand Down
9 changes: 5 additions & 4 deletions cmd/create_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,22 +137,22 @@ func (s *MySuite) TestRenderError(c *C) {
c.Check(got, Equals, "arbuz")
}
{ // has pos, but context doesn't contain it
ctx := config.NewYamlCtx([]byte(``))
ctx, _ := config.NewYamlCtx([]byte(``))
pth := config.Root.Vars.Dot("kale")
err := config.BpError{Path: pth, Err: errors.New("arbuz")}
got := renderError(err, ctx)
c.Check(got, Equals, "arbuz")
}
{ // has pos, has context
ctx := config.NewYamlCtx([]byte(`
ctx, _ := config.NewYamlCtx([]byte(`
vars:
kale: dos`))
pth := config.Root.Vars.Dot("kale")
err := config.BpError{Path: pth, Err: errors.New("arbuz")}
got := renderError(err, ctx)
c.Check(got, Equals, `Error: arbuz
3: kale: dos
^`)
^`)
}
}

Expand All @@ -161,5 +161,6 @@ func (s *MySuite) TestValidateMaybeDie(c *C) {
Validators: []config.Validator{{Validator: "invalid"}},
ValidationLevel: config.ValidationWarning,
}
validateMaybeDie(bp, config.NewYamlCtx([]byte{})) // smoke test
ctx, _ := config.NewYamlCtx([]byte{})
validateMaybeDie(bp, ctx) // smoke test
}
31 changes: 24 additions & 7 deletions cmd/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,11 @@ func runDeployCmd(cmd *cobra.Command, args []string) {
expandedBlueprintFile := filepath.Join(artifactsDir, expandedBlueprintFilename)
dc, _, err := config.NewDeploymentConfig(expandedBlueprintFile)
cobra.CheckErr(err)
cobra.CheckErr(shell.ValidateDeploymentDirectory(dc.Config.DeploymentGroups, deploymentRoot))
groups := dc.Config.DeploymentGroups
cobra.CheckErr(validateRuntimeDependencies(groups))
cobra.CheckErr(shell.ValidateDeploymentDirectory(groups, deploymentRoot))

for _, group := range dc.Config.DeploymentGroups {
for _, group := range groups {
groupDir := filepath.Join(deploymentRoot, string(group.Name))
cobra.CheckErr(shell.ImportInputs(groupDir, artifactsDir, expandedBlueprintFile))

Expand All @@ -102,6 +104,25 @@ func runDeployCmd(cmd *cobra.Command, args []string) {
printAdvancedInstructionsMessage(deploymentRoot)
}

// validateRuntimeDependencies checks, group by group, that the external tool
// required to deploy each group can be configured: Packer for Packer groups and
// Terraform (against the group's directory under deploymentRoot) for Terraform
// groups. It stops at the first failure and returns that error; a group whose
// kind is neither of the two is reported as unsupported.
func validateRuntimeDependencies(groups []config.DeploymentGroup) error {
	for _, g := range groups {
		switch kind := g.Kind(); kind {
		case config.PackerKind:
			if err := shell.ConfigurePacker(); err != nil {
				return err
			}
		case config.TerraformKind:
			// Only the error matters here; the configured Terraform handle is discarded.
			dir := filepath.Join(deploymentRoot, string(g.Name))
			if _, err := shell.ConfigureTerraform(dir); err != nil {
				return err
			}
		default:
			return fmt.Errorf("group %s is an unsupported kind %q", g.Name, kind.String())
		}
	}
	return nil
}

func deployPackerGroup(moduleDir string) error {
if err := shell.ConfigurePacker(); err != nil {
return err
Expand Down Expand Up @@ -133,9 +154,5 @@ func deployTerraformGroup(groupDir string) error {
if err != nil {
return err
}

if err = shell.ExportOutputs(tf, artifactsDir, applyBehavior); err != nil {
return err
}
return nil
return shell.ExportOutputs(tf, artifactsDir, applyBehavior)
}
2 changes: 1 addition & 1 deletion cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ HPC deployments on the Google Cloud Platform.`,
log.Fatalf("cmd.Help function failed: %s", err)
}
},
Version: "v1.24.0",
Version: "v1.25.0",
Annotations: annotation,
}
)
Expand Down
82 changes: 82 additions & 0 deletions community/examples/hpc-slurm6.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

blueprint_name: hpc-slurm6

vars:
project_id: ## Set GCP Project ID Here ##
deployment_name: slurm-gcp-v6
region: us-west4
zone: us-west4-c
instance_image:
family: slurm-gcp-6-1-hpc-rocky-linux-8
project: schedmd-slurm-public

deployment_groups:
- group: primary
modules:
- id: network
source: modules/network/vpc

- id: debug_nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network]
settings:
name: ns1
node_count_dynamic_max: 4
machine_type: n2-standard-2
enable_placement: false # the default is: true

- id: debug_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use:
- debug_nodeset
settings:
partition_name: debug
exclusive: false # allows nodes to stay up after jobs are done
is_default: true

- id: compute_nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network]
settings:
name: ns2
node_count_dynamic_max: 20

- id: compute_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use:
- compute_nodeset
settings:
partition_name: compute

- id: slurm_login
source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
use: [network]
settings:
group_name: login
machine_type: n2-standard-4
disable_login_public_ips: false

- id: slurm_controller
source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
use:
- network
- debug_partition
- compute_partition
- slurm_login
settings:
disable_controller_public_ips: false
2 changes: 1 addition & 1 deletion community/front-end/ofe/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ tomlkit==0.11.8
typing-inspect==0.9.0
typing_extensions==4.6.3
uritemplate==4.1.1
urllib3==1.26.17
urllib3==1.26.18
uvicorn==0.22.0
virtualenv==20.23.1
wrapt==1.15.0
Expand Down
2 changes: 1 addition & 1 deletion community/modules/compute/gke-node-pool/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,6 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.24.0"
module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.25.0"
}
}
2 changes: 1 addition & 1 deletion community/modules/compute/htcondor-execute-point/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ limitations under the License.
|------|--------|---------|
| <a name="module_execute_point_instance_template"></a> [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 8.0 |
| <a name="module_mig"></a> [mig](#module\_mig) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 |
| <a name="module_startup_script"></a> [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.22.1&depth=1 |
| <a name="module_startup_script"></a> [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 |
## Resources
Expand Down
2 changes: 1 addition & 1 deletion community/modules/compute/htcondor-execute-point/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ resource "google_storage_bucket_object" "execute_config" {
}

module "startup_script" {
source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.22.1&depth=1"
source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4"

project_id = var.project_id
region = var.region
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,6 @@ terraform {
}

provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.24.0"
module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.25.0"
}
}
2 changes: 1 addition & 1 deletion community/modules/compute/pbspro-execution/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ No providers.
| Name | Source | Version |
|------|--------|---------|
| <a name="module_execution_startup_script"></a> [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.22.1 |
| <a name="module_execution_startup_script"></a> [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | 336e0a4 |
| <a name="module_pbs_execution"></a> [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.22.1 |
| <a name="module_pbs_install"></a> [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.22.1 |
Expand Down
2 changes: 1 addition & 1 deletion community/modules/compute/pbspro-execution/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ module "pbs_install" {
}

module "execution_startup_script" {
source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.22.1"
source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=336e0a4"

deployment_name = var.deployment_name
project_id = var.project_id
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.24.0"
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.25.0"
}
required_version = ">= 1.1"
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.24.0"
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.25.0"
}
required_version = ">= 0.13.0"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
<!-- BEGINNING OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
## Requirements

| Name | Version |
|------|---------|
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.3 |

## Providers

No providers.

## Modules

No modules.

## Resources

No resources.

## Inputs

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_accelerator_config"></a> [accelerator\_config](#input\_accelerator\_config) | Nodeset accelerator config, see https://cloud.google.com/tpu/docs/supported-tpu-configurations for details. | <pre>object({<br> topology = string<br> version = string<br> })</pre> | <pre>{<br> "topology": "",<br> "version": ""<br>}</pre> | no |
| <a name="input_data_disks"></a> [data\_disks](#input\_data\_disks) | The data disks to include in the TPU node | `list(string)` | `[]` | no |
| <a name="input_disable_public_ips"></a> [disable\_public\_ips](#input\_disable\_public\_ips) | If set to false, each node group VM will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no |
| <a name="input_docker_image"></a> [docker\_image](#input\_docker\_image) | The GCP Container Registry Docker image to use in the TPU VMs; defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-1-tf-<var.tf\_version> | `string` | `null` | no |
| <a name="input_name"></a> [name](#input\_name) | Name of the nodeset tpu. | `string` | `"ghpc"` | no |
| <a name="input_node_count_dynamic_max"></a> [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of dynamic nodes allowed in this partition. | `number` | `1` | no |
| <a name="input_node_count_static"></a> [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no |
| <a name="input_node_type"></a> [node\_type](#input\_node\_type) | Specify a node type on which to base the VM configuration. | `string` | n/a | yes |
| <a name="input_preemptible"></a> [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no |
| <a name="input_preserve_tpu"></a> [preserve\_tpu](#input\_preserve\_tpu) | Specify whether TPU VMs are preserved on suspend: if set to true, the VM is stopped on suspend; if set to false, it is deleted. | `bool` | `true` | no |
| <a name="input_service_account"></a> [service\_account](#input\_service\_account) | Service account to attach to the TPU-vm. If none is given, the default service account and scopes will be used. | <pre>object({<br> email = string<br> scopes = set(string)<br> })</pre> | `null` | no |
| <a name="input_subnetwork_self_link"></a> [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The name of the subnetwork to attach the TPU-vm of this nodeset to. | `string` | `null` | no |
| <a name="input_tf_version"></a> [tf\_version](#input\_tf\_version) | Nodeset Tensorflow version, see https://cloud.google.com/tpu/docs/supported-tpu-configurations#tpu_vm for details. | `string` | `"2.9.1"` | no |
| <a name="input_zone"></a> [zone](#input\_zone) | Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones. | `string` | n/a | yes |

## Outputs

| Name | Description |
|------|-------------|
| <a name="output_nodeset_tpu"></a> [nodeset\_tpu](#output\_nodeset\_tpu) | Details of the nodeset tpu. Typically used as input to `schedmd-slurm-gcp-v6-partition`. |
<!-- END OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
41 changes: 41 additions & 0 deletions community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# locals {
# # This label allows for billing report tracking based on module.
# labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v6-nodeset", ghpc_role = "compute" })
# }

# Gathers this module's input variables into a single `nodeset_tpu` object.
# NOTE(review): presumably surfaced through the module's `nodeset_tpu` output —
# confirm against outputs.tf, which is not visible here.
locals {

nodeset_tpu = {
# Node counts and TPU node type are passed through unchanged; the `name`
# input is stored under the key `nodeset_name`.
node_count_static = var.node_count_static
node_count_dynamic_max = var.node_count_dynamic_max
nodeset_name = var.name
node_type = var.node_type

# TPU configuration inputs, passed through unchanged.
accelerator_config = var.accelerator_config
tf_version = var.tf_version
preemptible = var.preemptible
preserve_tpu = var.preserve_tpu

data_disks = var.data_disks
docker_image = var.docker_image

# The input is phrased as "disable"; the object carries the inverted flag.
enable_public_ip = !var.disable_public_ips
# The `subnetwork_self_link` input is stored under the key `subnetwork`.
subnetwork = var.subnetwork_self_link
service_account = var.service_account
zone = var.zone
}
}
Loading

0 comments on commit 3abddcf

Please sign in to comment.